# Split final dataset into train, validation and test set

**Motivation:** The dataframes with the response and all predictors (feature versions 1 and 2) have already been prepared. This notebook splits them into three distinct dataframes: train, validation and test.

In [1]:
# Location and name of the CSV file in which the per-random_state split statistics are stored.
# With an empty `path`, os.path.join(path, file_name) yields file_name alone, i.e. a path
# relative to the current working directory.
path = r''
file_name = r'analyze_split_statistics_dynasty_numerical.csv'

### Loads

In [2]:
import os
import sys
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import StratifiedShuffleSplit

# local
import set_path
import supp.support_load as sl
from supp.support_save import save_df

In [3]:
# print version of used packages
# pandas is listed as well: it performs the merge and all CSV reading/writing in this notebook
print("scikit-learn:", sklearn.__version__)
print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("python:", sys.version)

scikit-learn: 1.4.2
numpy: 1.26.4
python: 3.11.8 | packaged by Anaconda, Inc. | (main, Feb 26 2024, 21:34:05) [MSC v.1916 64 bit (AMD64)]


In [4]:
# load database
# NOTE(review): dfs, iton and ntoi are not referenced again in the visible part of this
# notebook; only the export date is displayed as the cell output.
dfs, dfs_name, dfs_export_date = sl.load_pickle()
iton, ntoi = sl.get_name_dicts(dfs_name)
dfs_export_date

Pickle database loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\dfs_complete.pickle
Applied preprocessing: merge_on_jones
Applied preprocessing: remove_jones_duplicates


'2024-10-11'

In [5]:
# load feature version 1
# the shape and the first two rows are shown as a quick sanity check
df_v1 = sl.read_csv('df_vizier_titles_v1')
print(df_v1.shape)
df_v1.head(2)

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_titles_v1.csv
(3930, 46)


Unnamed: 0,ID_person,vizier,[imAxw xr] nTr aA,HAty-a,imAxw xr Wcir,imy-rA wabty,iry-pat,r P nb,mniw Nxn / zAw Nxn,imy-rA izwy (n) Xkr(w) ncwt,...,Xry-Hbt Hry-tp,xrp iAwt nbwt nTrwt,zS mDAt-nTr,Hry-tp Nxb,Htc(?) Inpw,imy iz Nxn,Hry-cStA n pr-dwAt,mdw rxyt,zA ncwt,imy-rA 5maw
0,322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# load feature version 2
# the shape and the first two rows are shown as a quick sanity check
df_v2 = sl.read_csv('df_vizier_titles_v2_dynasty_numerical')
print(df_v2.shape)
df_v2.head(2)

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_titles_v2_dynasty_numerical.csv
(3849, 56)


Unnamed: 0,ID_person,vizier,mniw Nxn / zAw Nxn,imy iz Nxn,Hry-cStA,HAty-a,Xry-Hbt,cm / ctm,iwn knmwt,zA ncwt n Xt.f cmcw,...,imy iz,mdw rxyt,imAxw xr Wcir,mDH ncwt qd(w) m prwy,aD-mr (n) zAb,father_was_vizier,dyn_num,PC1,PC2,PC3
0,322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.5,-0.775583,-0.26159,-0.035094
1,323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,5.75,-0.375136,-0.465387,0.958193


In [7]:
# get common ids: persons that occur in both feature versions
person_ids_v1 = df_v1['ID_person'].to_list()
person_ids_v2 = df_v2['ID_person'].to_list()
common_ids = list(set(person_ids_v1) & set(person_ids_v2))
print(f'{len(common_ids)}\tcount of common_ids')

3849	count of common_ids


In [8]:
# get common features
# The lists are built in column order instead of via list(set(...)): the iteration order of a
# set of strings may differ between interpreter sessions (string hash randomization), and
# common_features is used further below as the list of merge keys.
features_v1 = df_v1.columns.to_list()
features_v2 = df_v2.columns.to_list()
features_v2_lookup = set(features_v2)
# dict.fromkeys keeps the first occurrence of every name and preserves the order
common_features = [col for col in dict.fromkeys(features_v1) if col in features_v2_lookup]
all_features = list(dict.fromkeys(features_v1 + features_v2))
print(f'{len(common_features)}\tcount of common_features')
print(f'{len(all_features)}\tcount of all_features')

46	count of common_features
56	count of all_features


In [9]:
# make one common dataframe
# The merge keys are all shared columns, so a person only collapses into a single row when
# every shared value is identical in both versions.
df_all = pd.merge(df_v1, df_v2, on=common_features, how='outer')
print(f'{df_all.shape}\tshape after merge')
df_all = df_all.loc[df_all['ID_person'].isin(common_ids), :]
print(f'{df_all.shape}\tshape after restricting to same persons')
# If v1 and v2 disagreed on any shared column, the outer merge would keep two rows for that
# person and both would pass the filter above; fail loudly instead of splitting duplicated persons.
assert df_all['ID_person'].is_unique, 'ID_person is duplicated after the merge: v1 and v2 disagree on shared columns'
df_all = df_all.set_index('ID_person')
print(f'{df_all.shape}\tshape after set_index to ID_person')
df_all.head(2)

(3930, 56)	shape after merge
(3849, 56)	shape after restricting to same persons
(3849, 55)	shape after set_index to ID_person


Unnamed: 0_level_0,vizier,[imAxw xr] nTr aA,HAty-a,imAxw xr Wcir,imy-rA wabty,iry-pat,r P nb,mniw Nxn / zAw Nxn,imy-rA izwy (n) Xkr(w) ncwt,wr 5 (m) pr 9Hwty,...,aD-mr 8p,wt(y) Inpw,xrp (i)m(yw) nTrw,mDH zS(w) ncwt,mDH ncwt qd(w) m prwy,father_was_vizier,dyn_num,PC1,PC2,PC3
ID_person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
322,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.5,-0.775583,-0.26159,-0.035094
323,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.75,-0.375136,-0.465387,0.958193


#### missings in version 1

In [10]:
# missings in version 1
# persons that occur in version 2 but not in version 1 (set difference of the ID lists)
missing_in_v1 = list(set(person_ids_v2) - set(person_ids_v1))
df_missing_in_v1 = df_v2[df_v2['ID_person'].isin(missing_in_v1)]
print(f'{len(missing_in_v1)}\tcount of missing_in_v1')
print(missing_in_v1)
df_missing_in_v1

0	count of missing_in_v1
[]


Unnamed: 0,ID_person,vizier,mniw Nxn / zAw Nxn,imy iz Nxn,Hry-cStA,HAty-a,Xry-Hbt,cm / ctm,iwn knmwt,zA ncwt n Xt.f cmcw,...,imy iz,mdw rxyt,imAxw xr Wcir,mDH ncwt qd(w) m prwy,aD-mr (n) zAb,father_was_vizier,dyn_num,PC1,PC2,PC3


#### missings in version 2

In [11]:
# missings in version 2
# persons that occur in version 1 but not in version 2; these rows are dropped when
# df_all is restricted to common_ids
missing_in_v2 = list(set(person_ids_v1) - set(person_ids_v2))
df_missing_in_v2 = df_v1[df_v1['ID_person'].isin(missing_in_v2)]
print(f'{len(missing_in_v2)}\tcount of missing_in_v2')
print(missing_in_v2)
df_missing_in_v2

81	count of missing_in_v2
[519, 4619, 4621, 4622, 4623, 4632, 4634, 4635, 4640, 4147, 4162, 4704, 4729, 4730, 4731, 4732, 4223, 4746, 4235, 4748, 4749, 4750, 4751, 4752, 4754, 4763, 696, 4281, 212, 213, 1754, 1755, 1756, 1757, 1759, 3308, 4844, 3310, 3311, 3314, 263, 4365, 3855, 4884, 4885, 4375, 4376, 4377, 4887, 4380, 4382, 4383, 4384, 4389, 4391, 4393, 4394, 4395, 4920, 4924, 4925, 4926, 4927, 3906, 4959, 4468, 3960, 4491, 4492, 4493, 4516, 2991, 4539, 4543, 4544, 4545, 4546, 4547, 4571, 4066, 3048]


Unnamed: 0,ID_person,vizier,[imAxw xr] nTr aA,HAty-a,imAxw xr Wcir,imy-rA wabty,iry-pat,r P nb,mniw Nxn / zAw Nxn,imy-rA izwy (n) Xkr(w) ncwt,...,Xry-Hbt Hry-tp,xrp iAwt nbwt nTrwt,zS mDAt-nTr,Hry-tp Nxb,Htc(?) Inpw,imy iz Nxn,Hry-cStA n pr-dwAt,mdw rxyt,zA ncwt,imy-rA 5maw
212,519,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,696,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1354,1754,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1355,1755,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1356,1756,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,4924,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3900,4925,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3901,4926,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3902,4927,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# overview of omitted persons
df_missing_in_v2_overview = df_missing_in_v2.copy()
df_missing_in_v2_overview = df_missing_in_v2_overview.set_index('ID_person')
# NOTE(review): the row sum runs over every remaining column, including the 'vizier'
# response column, so the count is "titles + vizier flag" rather than titles only.
df_missing_in_v2_overview['total_titles_count'] = df_missing_in_v2_overview.sum(axis=1)
# show the 20 omitted persons with the highest counts
df_missing_in_v2_overview[['total_titles_count']].sort_values('total_titles_count', ascending=False)[:20]

Unnamed: 0_level_0,total_titles_count
ID_person,Unnamed: 1_level_1
4623,6.0
4235,5.0
4920,5.0
4516,4.0
4493,4.0
4632,3.0
3048,3.0
4730,2.0
4640,2.0
4545,2.0


**Conclusion:** The omitted persons have at most 6 titles each, which is a low number.

### define functions

In [13]:
def make_split(data, random_state, temp_size=0.3, test_share=0.5, random_state_temp=42):
    """Split ``data`` into stratified train, validation and test dataframes.

    The split is done in two stages, both stratified on the 'vizier' column:
    first ``temp_size`` of the rows is set aside, then this held-out part is
    divided into validation and test.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the response column 'vizier'.
    random_state : int
        Seed of the first stage (train vs. held-out rows).
    temp_size : float, default 0.3
        Fraction of rows held out from training in the first stage.
    test_share : float, default 0.5
        Fraction of the held-out rows that goes to the test set.
    random_state_temp : int, default 42
        Seed of the second stage. It is deliberately independent of
        ``random_state``; the default reproduces the previously hard-coded value.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_train, data_val, data_test)``, row subsets of ``data``.
    """
    response = 'vizier'
    y = data[response].values

    # Stage 1: train vs. held-out rows. The splitter only needs the number of rows and y,
    # so a zero placeholder stands in for the feature matrix.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=temp_size, random_state=random_state)
    train_index, temp_index = next(sss.split(np.zeros(len(y)), y))

    # Stage 2: divide the held-out rows; the indices returned here are positions within
    # the held-out subset, not within ``data``.
    y_temp = y[temp_index]
    sss_temp = StratifiedShuffleSplit(n_splits=1, test_size=test_share, random_state=random_state_temp)
    val_index, test_index = next(sss_temp.split(np.zeros(len(y_temp)), y_temp))

    # Split pandas dataframe
    data_train = data.iloc[train_index, :]
    data_temp = data.iloc[temp_index, :]
    data_val = data_temp.iloc[val_index, :]
    data_test = data_temp.iloc[test_index, :]

    return data_train, data_val, data_test

In [14]:
def get_split_stats(data, data_train, data_val, data_test):
    """Compare per-column statistics of the three splits with the full data.

    Parameters
    ----------
    data : pandas.DataFrame
        The complete frame that was split.
    data_train, data_val, data_test : pandas.DataFrame
        The train, validation and test parts of ``data``.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns
        ``count_*`` (column sums), ``mean_*``, ``ratio_*`` (split mean / overall mean),
        ``std_*`` and ``ratio_std_*`` (split std / overall std).
    """
    frames = {'all': data, 'train': data_train, 'val': data_val, 'test': data_test}
    subsets = ['train', 'val', 'test']

    # Index by the columns of the frame that was passed in, not by a notebook global
    df_stat = pd.DataFrame(index=data.columns)

    # get counts
    for name, frame in frames.items():
        df_stat[f'count_{name}'] = frame.sum(axis=0)
    # get means
    for name, frame in frames.items():
        df_stat[f'mean_{name}'] = frame.mean(axis=0)
    # get mean ratios
    for name in subsets:
        df_stat[f'ratio_{name}'] = df_stat[f'mean_{name}'] / df_stat['mean_all']
    # get std
    for name, frame in frames.items():
        df_stat[f'std_{name}'] = frame.std(axis=0)
    # get std ratios
    for name in subsets:
        df_stat[f'ratio_std_{name}'] = df_stat[f'std_{name}'] / df_stat['std_all']

    return df_stat

In [15]:
def analyze_split(df_stat, threshold):
    """Condense the mean ratios of a split into a single Series of summary values.

    For the binary features and, separately, for the continuous features
    ('dyn_num', 'PC1', 'PC2', 'PC3') the function reports per split
    (train / val / test):

    * ``cnt_issue_*`` - number of features whose ratio differs from 1 by more than ``threshold``
    * ``max_*`` / ``min_*`` - largest / smallest ratio

    Entries for the continuous features carry the suffix ``_con``.
    """
    ratio_cols = ['ratio_train', 'ratio_val', 'ratio_test']
    continuous = ['dyn_num', 'PC1', 'PC2', 'PC3']
    split_names = ['train', 'val', 'test']

    def summarize(ratios, suffix):
        # count of out-of-tolerance ratios, then the extremes, each relabelled per split
        deviates = (ratios - 1).abs() > threshold
        parts = {
            'cnt_issue': deviates.sum(axis=0),
            'max': ratios.max(axis=0),
            'min': ratios.min(axis=0),
        }
        for prefix, series in parts.items():
            series.index = [f'{prefix}_{name}{suffix}' for name in split_names]
        return list(parts.values())

    is_continuous = df_stat.index.isin(continuous)
    pieces = summarize(df_stat.loc[~is_continuous, ratio_cols], '')
    pieces += summarize(df_stat.loc[is_continuous, ratio_cols], '_con')

    # concat each stats
    return pd.concat(pieces, ignore_index=False)

In [16]:
def save_split_stat(stats, random_state, threshold):
    """Append the statistics of one split as a single row to the statistics CSV.

    The row starts with ``random_state`` and ``threshold`` and continues with the
    entries of ``stats``. The target file is given by the notebook-level ``path`` and
    ``file_name``; the header is written only when the file does not exist yet.
    """
    target = os.path.join(path, file_name)

    # one-row frame: identifying values first, then the statistics
    prefix = pd.Series({'random_state': random_state, 'threshold': threshold})
    row = pd.DataFrame([pd.concat([prefix, stats], ignore_index=False)])

    # new file -> write with header; existing file -> append the bare row
    already_there = os.path.isfile(target)
    row.to_csv(target, mode='a' if already_there else 'w', header=not already_there, index=False)

### make multiple splits

In [17]:
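# NOTE: the brute-force search over random states (this cell and the next one) is
# intentionally commented out; its results are already stored in `file_name`.
# Uncomment both cells to resume the search after the last stored random_state.
# threshold = 0.5
# random_state_start = pd.read_csv(os.path.join(path, file_name))['random_state'].astype(int).max() + 1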

In [18]:
# for random_state in range(random_state_start, 4294967295):
#     df_train, df_val, df_test = make_split(df_all, random_state)
#     df_stat = get_split_stats(df_all, df_train, df_val, df_test)
#     stats = analyze_split(df_stat, threshold)
#     save_split_stat(stats, random_state, threshold)
#     if random_state % 100 == 0:
#         print(f"random_state {random_state} finished")

### analyze splits

In [19]:
# load the stored statistics: one row per evaluated random_state, as written by save_split_stat
df_splits = pd.read_csv(os.path.join(path, file_name))
print(df_splits.shape)
df_splits.head(2)

(2020772, 20)


Unnamed: 0,random_state,threshold,cnt_issue_train,cnt_issue_val,cnt_issue_test,max_train,max_val,max_test,min_train,min_val,min_test,cnt_issue_train_con,cnt_issue_val_con,cnt_issue_test_con,max_train_con,max_val_con,max_test_con,min_train_con,min_val_con,min_test_con
0,0.0,0.5,0.0,11.0,15.0,1.326678,2.382397,2.421516,0.649423,0.0,0.0,3.0,2.0,3.0,3.046661,1.443271,1.00616,0.999164,-1.405546,-8.981784
1,1.0,0.5,0.0,9.0,9.0,1.428731,1.600971,1.90262,0.810901,0.0,0.0,3.0,3.0,3.0,9.891833,0.997808,10.734861,1.000875,-49.833342,-5.047541


In [20]:
# keep only the splits in which, for the binary features, the smallest mean ratio is above
# threshold_lower and the largest mean ratio is below threshold_upper in all three subsets
threshold_lower = 0.5
threshold_upper = 1.9
min_columns = ['min_train', 'min_val', 'min_test']
max_columns = ['max_train', 'max_val', 'max_test']
mask = (df_splits[min_columns] > threshold_lower).all(axis=1) & (df_splits[max_columns] < threshold_upper).all(axis=1)
df = df_splits.loc[mask, :]
print(df.shape)
df

(2, 20)


Unnamed: 0,random_state,threshold,cnt_issue_train,cnt_issue_val,cnt_issue_test,max_train,max_val,max_test,min_train,min_val,min_test,cnt_issue_train_con,cnt_issue_val_con,cnt_issue_test_con,max_train_con,max_val_con,max_test_con,min_train_con,min_val_con,min_test_con
127523,127523.0,0.5,0.0,2.0,6.0,1.208926,1.667678,1.664792,0.714365,0.500303,0.512244,2.0,3.0,3.0,1.818272,27.133952,65.4119,-18.416989,-6.068679,-5.482295
396530,396530.0,0.5,0.0,6.0,6.0,1.071548,1.795961,1.816137,0.756387,0.606428,0.512244,2.0,3.0,3.0,1.270341,29.658425,13.253284,-7.767004,-3.773871,-1.781551


### make final split

In [21]:
# define number of the best split
# 127523 is one of the two random states that passed the min/max ratio filter above
# NOTE(review): the reason for preferring it over 396530 is not stated in this notebook
threshold = 0.5
random_state = 127523

In [22]:
# perform split according to the best random_state
df_train, df_val, df_test = make_split(df_all, random_state)
df_stat = get_split_stats(df_all, df_train, df_val, df_test)
stats = analyze_split(df_stat, threshold)
# shown as a one-row frame so it can be compared with the matching row of df_splits
pd.DataFrame(stats).transpose()

Unnamed: 0,cnt_issue_train,cnt_issue_val,cnt_issue_test,max_train,max_val,max_test,min_train,min_val,min_test,cnt_issue_train_con,cnt_issue_val_con,cnt_issue_test_con,max_train_con,max_val_con,max_test_con,min_train_con,min_val_con,min_test_con
0,0.0,2.0,6.0,1.208926,1.667678,1.664792,0.714365,0.500303,0.512244,2.0,3.0,3.0,1.818272,27.133952,65.4119,-18.416989,-6.068679,-5.482295


In [23]:
# check splits dimensions
print(f'DIMENSION CHECK')
print(f'{df_train.shape}\tshape of df_train')
print(f'{df_val.shape}\tshape of df_val')
print(f'{df_test.shape}\tshape of df_test')

# check how many viziers ended up in each split
print(f'\nVIZIER COUNT CHECK')
print(f'{df_train["vizier"].sum()}\tcount of viziers in df_train')
print(f'{df_val["vizier"].sum()}\tcount of viziers in df_val')
print(f'{df_test["vizier"].sum()}\tcount of viziers in df_test')

# check that no ID_person (the index) appears in more than one split;
# every intersection is expected to be the empty set
print(f'\nDISTICT ID_person CHECK')
id_train = set(df_train.index.to_list())
id_val = set(df_val.index.to_list())
id_test = set(df_test.index.to_list())
print(f'{id_train.intersection(id_val)}\tintersection of id_train and id_val')
print(f'{id_train.intersection(id_test)}\tintersection of id_train and id_test')
print(f'{id_val.intersection(id_test)}\tintersection of id_val and id_test')

DIMENSION CHECK
(2694, 55)	shape of df_train
(577, 55)	shape of df_val
(578, 55)	shape of df_test

VIZIER COUNT CHECK
56	count of viziers in df_train
12	count of viziers in df_val
12	count of viziers in df_test

DISTICT ID_person CHECK
set()	intersection of id_train and id_val
set()	intersection of id_train and id_test
set()	intersection of id_val and id_test


In [24]:
# print problematic features: rows of df_stat whose mean ratio leaves the band
# [threshold_low, threshold_up] in at least one of the three splits
threshold_low = 0.8
threshold_up = 1.4
columns = ['count_all', 'count_train', 'count_val', 'count_test', 'ratio_train', 'ratio_val', 'ratio_test']
ratios = df_stat[['ratio_train', 'ratio_val', 'ratio_test']]
mask = ((ratios < threshold_low) | (ratios > threshold_up)).any(axis=1)
df_stat.loc[mask, columns]

Unnamed: 0,count_all,count_train,count_val,count_test,ratio_train,ratio_val,ratio_test
HAty-a,95.0,71.0,11.0,13.0,1.067788,0.772398,0.911255
r P nb,23.0,14.0,4.0,5.0,0.869662,1.160124,1.447646
imy-rA izwy (n) Xkr(w) ncwt,21.0,13.0,3.0,5.0,0.884452,0.952959,1.585517
wr 5 (m) pr 9Hwty,17.0,11.0,2.0,4.0,0.924473,0.784789,1.566863
Xry-Hbt,201.0,144.0,34.0,23.0,1.023568,1.128379,0.761995
imy-rA prwy-nbw,20.0,10.0,5.0,5.0,0.714365,1.667678,1.664792
cmcw cnwt,19.0,13.0,2.0,4.0,0.977552,0.70218,1.40193
iwn knmwt,61.0,43.0,11.0,7.0,1.007138,1.202915,0.764167
zA ncwt n Xt.f,31.0,23.0,3.0,5.0,1.060026,0.645553,1.07406
imy iz,23.0,18.0,3.0,2.0,1.118137,0.870093,0.579058


### data postprocessing

#### titles with high correlation with vizier title

In [25]:
# columns of df_all that are not titles: the response and the additional version 2 features
non_title_features = [
 'vizier',
 'father_was_vizier',
 'dyn_num',
 'PC1',
 'PC2',
 'PC3'
]
# every remaining column of df_all is treated as a title
title_all = [col for col in df_all.columns if col not in non_title_features]

In [26]:
# per title: column sum over all persons, over viziers and over non-viziers
df_all_stat = pd.DataFrame([], index=title_all)
df_all_stat['count'] = df_all.loc[:, title_all].sum(axis=0)
df_all_stat['count_vizier'] = df_all.loc[df_all['vizier']==1, title_all].sum(axis=0)
df_all_stat['count_non_vizier'] = df_all.loc[df_all['vizier']!=1, title_all].sum(axis=0)
# ratio = share of the title's total count that belongs to viziers (a share, not a correlation coefficient)
df_all_stat['ratio'] = (df_all_stat['count_vizier'] / df_all_stat['count']).round(4)
df_all_stat = df_all_stat.sort_values('ratio', ascending=False)
df_all_stat.head()

Unnamed: 0,count,count_vizier,count_non_vizier,ratio
imy-rA Hwt-wrt 6,18.0,17.0,1.0,0.9444
wr 5 (m) pr 9Hwty,17.0,15.0,2.0,0.8824
xrp iAwt nbwt nTrwt,17.0,14.0,3.0,0.8235
aA 8wAw,20.0,16.0,4.0,0.8
Htc(?) Inpw,14.0,11.0,3.0,0.7857


**NOTE:**
- "imy-rA Hwt-wrt 6" has a high correlation with the vizier response (17 of its 18 holders are viziers).
- It will be removed.

In [27]:
# drop the title with the highest vizier ratio from df_all
# (df_train / df_val / df_test created earlier still contain this column at this point)
title_name = 'imy-rA Hwt-wrt 6'
print(f'{df_all.shape}\tshape of df_all before removal')
df_all = df_all[[col for col in df_all.columns if col!=title_name]]
print(f'{df_all.shape}\tshape of df_all after removal')

(3849, 55)	shape of df_all before removal
(3849, 54)	shape of df_all after removal


In [28]:
# drop the PC3 column from df_all
# NOTE(review): PC3 is listed among the non-title features, so the variable name title_name is
# misleading here, and the reason for dropping PC3 is not stated in this notebook.
title_name = 'PC3'
print(f'{df_all.shape}\tshape of df_all before removal')
df_all = df_all[[col for col in df_all.columns if col!=title_name]]
print(f'{df_all.shape}\tshape of df_all after removal')

(3849, 54)	shape of df_all before removal
(3849, 53)	shape of df_all after removal


#### problematic viziers

In [29]:
# find viziers that carry (almost) no information besides being a vizier
vizier__35_title_list = sl.read_csv('vizier__35_title_list')
vizier__35_title_list = vizier__35_title_list.set_index('ID_person')
mask_1 = (vizier__35_title_list['vizier']==1)
# the row sum includes the 'vizier' column itself, so "<= 2" leaves room for at most one
# further entry (assuming 0/1 indicator columns - TODO confirm)
mask_2 = (vizier__35_title_list.sum(axis=1)<=2)
vizier_with_zero_titles = vizier__35_title_list.loc[mask_1 &  mask_2, :].index.to_list()
print(f'{vizier_with_zero_titles}\tviziers with zero titles')

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\unpacked\df_person_all\merged\vizier__35_title_list.csv
[481, 856, 2698, 3563, 3600]	viziers with zero titles


In [30]:
# these viziers have only vizier title, nothing else
df_help = vizier__35_title_list.loc[vizier_with_zero_titles, :]
# show only the columns that are non-zero for at least one of the selected persons
df_help.loc[:, df_help.sum(axis=0)>0]

Unnamed: 0_level_0,vizier,tAyty TAty (n) zAb
ID_person,Unnamed: 1_level_1,Unnamed: 2_level_1
481,1,1.0
856,1,1.0
2698,1,1.0
3563,1,1.0
3600,1,1.0


**NOTE:**
- There are viziers who hold only the vizier title and nothing else.
- There is no pattern to model on these persons, so I will remove them.

In [31]:
# remove the selected viziers from df_all
# NOTE: this happens after the split was made; df_train / df_val / df_test are
# brought in line with df_all in the "final analysis" section
print(f'{df_all.shape}\tshape of df_all before removal of viziers')
df_all = df_all.loc[~df_all.index.isin(vizier_with_zero_titles), :]
print(f'{df_all.shape}\tshape of df_all after removal of viziers')

(3849, 53)	shape of df_all before removal of viziers
(3844, 53)	shape of df_all after removal of viziers


### final analysis

In [32]:
# NOTE(review): df_all has already been postprocessed at this point, so under
# 'ORIGINAL SHAPE' only the three splits still show their original shape
print(f'ORIGINAL SHAPE')
print(f'{df_all.shape}\tshape of df_all')
print(f'{df_train.shape}\tshape of df_train')
print(f'{df_val.shape}\tshape of df_val')
print(f'{df_test.shape}\tshape of df_test')

# restrict every split to the persons (index) and columns that remain in df_all
columns_final = df_all.columns
df_train =  df_train.loc[df_train.index.isin(df_all.index), columns_final]
df_val = df_val.loc[df_val.index.isin(df_all.index), columns_final]
df_test = df_test.loc[df_test.index.isin(df_all.index), columns_final]

print(f'\nSHAPE AFTER POSTPROCESSING')
print(f'{df_all.shape}\tshape of df_all')
print(f'{df_train.shape}\tshape of df_train')
print(f'{df_val.shape}\tshape of df_val')
print(f'{df_test.shape}\tshape of df_test')

ORIGINAL SHAPE
(3844, 53)	shape of df_all
(2694, 55)	shape of df_train
(577, 55)	shape of df_val
(578, 55)	shape of df_test

SHAPE AFTER POSTPROCESSING
(3844, 53)	shape of df_all
(2690, 53)	shape of df_train
(577, 53)	shape of df_val
(577, 53)	shape of df_test


In [33]:
# recompute the split statistics on the postprocessed frames
df_stat_new = get_split_stats(df_all, df_train, df_val, df_test)

In [41]:
# print problematic features
# same check as the one applied to df_stat before postprocessing, now on df_stat_new:
# a feature is listed when its mean ratio leaves [threshold_low, threshold_up] in any split
threshold_low = 0.8
threshold_up =  1.4
columns = ['count_all', 'count_train', 'count_val', 'count_test', 'ratio_train', 'ratio_val', 'ratio_test']
mask_1 = (df_stat_new['ratio_train']<threshold_low) | (df_stat_new['ratio_train']>threshold_up)
mask_2 = (df_stat_new['ratio_val']<threshold_low)   | (df_stat_new['ratio_val']>threshold_up)
mask_3 = (df_stat_new['ratio_test']<threshold_low)  | (df_stat_new['ratio_test']>threshold_up)
mask = mask_1 | mask_2 | mask_3
df_stat_new.loc[mask, columns]

Unnamed: 0,count_all,count_train,count_val,count_test,ratio_train,ratio_val,ratio_test
HAty-a,95.0,71.0,11.0,13.0,1.067987,0.771395,0.911648
r P nb,23.0,14.0,4.0,5.0,0.869824,1.158617,1.448271
imy-rA izwy (n) Xkr(w) ncwt,21.0,13.0,3.0,5.0,0.884617,0.951721,1.586201
wr 5 (m) pr 9Hwty,17.0,11.0,2.0,4.0,0.924645,0.78377,1.56754
Xry-Hbt,201.0,144.0,34.0,23.0,1.023759,1.126913,0.762324
imy-rA prwy-nbw,20.0,10.0,5.0,5.0,0.714498,1.665511,1.665511
cmcw cnwt,19.0,13.0,2.0,4.0,0.977734,0.701268,1.402536
iwn knmwt,61.0,43.0,11.0,7.0,1.007325,1.201352,0.764497
zA ncwt n Xt.f,31.0,23.0,3.0,5.0,1.060223,0.644714,1.074523
imy iz,23.0,18.0,3.0,2.0,1.118345,0.868962,0.579308


### save

In [35]:
# define splits for each feature version
features_v1 = [col for col in df_v1.columns if col in df_all.columns]
df_train_v1 =  df_train.loc[:, features_v1]
df_val_v1 = df_val.loc[:, features_v1]
df_test_v1 = df_test.loc[:, features_v1]

features_v2 = [col for col in df_v2.columns if col in df_all.columns]
df_train_v2 =  df_train.loc[:, features_v2]
df_val_v2 = df_val.loc[:, features_v2]
df_test_v2 = df_test.loc[:, features_v2]

In [36]:
# check of version 1
# shapes of the three version 1 splits
print(f'VERSION 1, SHAPES')
print(f'{df_train_v1.shape}\tshape of df_train_v1')
print(f'{df_val_v1.shape}\tshape of df_val_v1')
print(f'{df_test_v1.shape}\tshape of df_test_v1')

# number of viziers left in each split after postprocessing
print(f'\nVERSION 1, VIZIER COUNTS')
print(f'{df_train_v1["vizier"].sum()}\tcount of vizier in df_train_v1')
print(f'{df_val_v1["vizier"].sum()}\tcount of vizier in df_val_v1')
print(f'{df_test_v1["vizier"].sum()}\tcount of vizier in df_test_v1')

VERSION 1, SHAPES
(2690, 44)	shape of df_train_v1
(577, 44)	shape of df_val_v1
(577, 44)	shape of df_test_v1

VERSION 1, VIZIER COUNTS
52	count of vizier in df_train_v1
12	count of vizier in df_val_v1
11	count of vizier in df_test_v1


In [37]:
# check of version 2
# shapes of the three version 2 splits
print(f'VERSION 2, SHAPES')
print(f'{df_train_v2.shape}\tshape of df_train_v2')
print(f'{df_val_v2.shape}\tshape of df_val_v2')
print(f'{df_test_v2.shape}\tshape of df_test_v2')

# number of viziers left in each split after postprocessing
print(f'\nVERSION 2, VIZIER COUNTS')
print(f'{df_train_v2["vizier"].sum()}\tcount of vizier in df_train_v2')
print(f'{df_val_v2["vizier"].sum()}\tcount of vizier in df_val_v2')
print(f'{df_test_v2["vizier"].sum()}\tcount of vizier in df_test_v2')

VERSION 2, SHAPES
(2690, 53)	shape of df_train_v2
(577, 53)	shape of df_val_v2
(577, 53)	shape of df_test_v2

VERSION 2, VIZIER COUNTS
52	count of vizier in df_train_v2
12	count of vizier in df_val_v2
11	count of vizier in df_test_v2


In [38]:
# SAVE
# one file per split and feature version
# NOTE(review): save_index=True presumably writes the ID_person index into the file -
# confirm in supp.support_save
save_df(df_train_v1, 'df_vizier_train_v1', save_index=True)
save_df(df_val_v1, 'df_vizier_val_v1', save_index=True)
save_df(df_test_v1, 'df_vizier_test_v1', save_index=True)

save_df(df_train_v2, 'df_vizier_train_v2', save_index=True)
save_df(df_val_v2, 'df_vizier_val_v2', save_index=True)
save_df(df_test_v2, 'df_vizier_test_v2', save_index=True)

Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_train_v1.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_val_v1.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_test_v1.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_train_v2.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_val_v2.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\df_vizier_test_v2.csv
