# Make dataset for second type of model (numerical dynasty)

**Motivation:**  
This script merges **titles for feature version 2** with additional features: `dynasty`, `father_was_vizier`, and `PCA` components.

**Workflow:**
- Load dataframes with:
    - **titles for feature version 2**
    - `dynasty`
    - `father_was_vizier`
    - `PCA`
- Merge them  
- Save the result


In [1]:
import os
import numpy as np
import pandas as pd

import set_path
from supp.support_load import read_csv, read_excel
from supp.support_merge import merge, group_to_list
from supp.support_save import save_df
from supp.support_constants import VIZIER_IDS, PATH_DATA_MERGED

In [2]:
# Load the dataframe with titles (feature version 2) and preview it.
titles_file = 'df_vizier_titles_v2_only_titles'
df_titles_v2 = read_csv(titles_file)
print(df_titles_v2.shape)
df_titles_v2.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_titles_v2_only_titles.csv
(3930, 51)


Unnamed: 0,ID_person,vizier,mniw Nxn / zAw Nxn,imy iz Nxn,Hry-cStA,HAty-a,Xry-Hbt,cm / ctm,iwn knmwt,zA ncwt n Xt.f cmcw,...,xrp iAwt nbwt nTrwt,imy-rA zS(w) a(w) (nw) ncwt,Xry-tp ncwt,mDH zS(w) ncwt,xtm(ty)-bity,imy iz,mdw rxyt,imAxw xr Wcir,mDH ncwt qd(w) m prwy,aD-mr (n) zAb
0,322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,324,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,325,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,326,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Load the dataframe with the `father_was_vizier` feature and preview it.
father_file = 'df_father_was_vizier'
df_father_was_vizier = read_csv(father_file)
print(df_father_was_vizier.shape)
df_father_was_vizier.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_father_was_vizier.csv
(4962, 2)


Unnamed: 0,ID_person,father_was_vizier
0,322,0
1,323,0
2,324,0
3,325,0
4,326,0


In [4]:
# Load the dataframe with the numeric dynasty (`dyn_num`) and preview it.
dynasty_file = 'df_dynasty_numeric'
df_dynasty_numeric = read_csv(dynasty_file)
print(df_dynasty_numeric.shape)
df_dynasty_numeric.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_dynasty_numeric.csv
(4840, 2)


Unnamed: 0,ID_person,dyn_num
0,1,5.5
1,2,5.75
2,4,5.5
3,5,5.5
4,6,5.5


In [5]:
# Load the PCA dataframe, keeping only the merge key and the three components.
pca_columns = ['ID_person', 'PC1', 'PC2', 'PC3']
df_titles_pca = read_csv('df_titles_pca_3D')[pca_columns]
print(df_titles_pca.shape)
df_titles_pca.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_titles_pca_3D.csv
(3930, 4)


Unnamed: 0,ID_person,PC1,PC2,PC3
0,322,-0.775583,-0.26159,-0.035094
1,323,-0.375136,-0.465387,0.958193
2,324,-1.025342,-0.244598,-0.112258
3,325,-0.778289,-0.273947,-0.088437
4,326,-0.778289,-0.273947,-0.088437


### merge

In [6]:
# Left-merge every additional feature table onto the titles table by `ID_person`.
# A left join keeps every row of `df_titles_v2`; persons missing from a feature
# table get NaN in the added columns.
features_to_merge = {
    'df_father_was_vizier': df_father_was_vizier,
    'df_dynasty_numeric': df_dynasty_numeric,
    'df_titles_pca': df_titles_pca,
}

df = df_titles_v2.copy()
print(f'{df.shape} original shape')
for feature_name, feature_df in features_to_merge.items():
    df = df.merge(feature_df, on='ID_person', how='left')
    print(f'{df.shape} shape after merge {feature_name}')
    # A duplicated `ID_person` in a feature table would silently multiply rows,
    # so fail loudly if the row count no longer matches the titles table.
    assert len(df) == len(df_titles_v2), (
        f'merge with {feature_name} changed the number of rows '
        f'({len(df_titles_v2)} -> {len(df)}); check for duplicated ID_person'
    )
df.head()

(3930, 51) original shape
(3930, 52) shape after merge df_father_was_vizier
(3930, 53) shape after merge df_dynasty_numeric
(3930, 56) shape after merge df_titles_pca


Unnamed: 0,ID_person,vizier,mniw Nxn / zAw Nxn,imy iz Nxn,Hry-cStA,HAty-a,Xry-Hbt,cm / ctm,iwn knmwt,zA ncwt n Xt.f cmcw,...,imy iz,mdw rxyt,imAxw xr Wcir,mDH ncwt qd(w) m prwy,aD-mr (n) zAb,father_was_vizier,dyn_num,PC1,PC2,PC3
0,322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.5,-0.775583,-0.26159,-0.035094
1,323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.375136,-0.465387,0.958193
2,324,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-1.025342,-0.244598,-0.112258
3,325,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.778289,-0.273947,-0.088437
4,326,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.778289,-0.273947,-0.088437


### Remove persons with NaN dynasty

In [7]:
# Total number of NaN values in the merged df (per-column counts summed up).
nan_per_column = df.isna().sum()
nan_per_column.sum()

81

In [8]:
# Number of rows whose `dyn_num` is NaN.
dyn_num_missing = df['dyn_num'].isna()
dyn_num_missing.sum()

81

In [9]:
# Do all viziers have a known dynasty?
# Sums the `vizier` column over the rows with missing `dyn_num`.
missing_dynasty = df['dyn_num'].isna()
df.loc[missing_dynasty, 'vizier'].sum()

0

**NOTE:**
- Only 81 persons have an unknown dynasty, and none of them is a vizier.
- They will be removed.

In [10]:
# Keep only persons with a known dynasty; report the shape before and after.
print(f'{df.shape} shape original')
has_dynasty = df['dyn_num'].notna()
df = df[has_dynasty]
print(f'{df.shape} shape after removed missing dynasty')

(3930, 56) shape original
(3849, 56) shape after removed missing dynasty


In [11]:
# Check how many NaN values remain in the result df.
remaining_nan_per_column = df.isna().sum()
remaining_nan_per_column.sum()

0

### save df

In [12]:
# Save the merged dataset, then report its shape and show a preview.
output_name = 'df_vizier_titles_v2_dynasty_numerical'
save_df(df, output_name)

print(f'{df.shape}\tshape of df')
df.head()

Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_titles_v2_dynasty_numerical.csv
(3849, 56)	shape of df


Unnamed: 0,ID_person,vizier,mniw Nxn / zAw Nxn,imy iz Nxn,Hry-cStA,HAty-a,Xry-Hbt,cm / ctm,iwn knmwt,zA ncwt n Xt.f cmcw,...,imy iz,mdw rxyt,imAxw xr Wcir,mDH ncwt qd(w) m prwy,aD-mr (n) zAb,father_was_vizier,dyn_num,PC1,PC2,PC3
0,322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.5,-0.775583,-0.26159,-0.035094
1,323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.375136,-0.465387,0.958193
2,324,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-1.025342,-0.244598,-0.112258
3,325,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.778289,-0.273947,-0.088437
4,326,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.778289,-0.273947,-0.088437
