# Encoding title to clusters

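**Motivation:** Encode the `35_title` column into clusters: each title is mapped to its category, and each category is aggregated into a coarser cluster.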

In [3]:
import os
import numpy as np
import pandas as pd
import set_path
import supp.support_load as sl
from supp.support_load import read_csv, read_excel
from supp.support_merge import merge, group_to_list
from supp.support_save import save_df
from supp.support_analyzer import make_excel_analysis
from supp.support_get_mapping import get_category_aggregation

In [4]:
# Load the pickled database. Judging by the unpacked names this yields the
# dataframes, their names and the export date (helper lives in supp.support_load).
dfs, dfs_name, dfs_export_date = sl.load_pickle()
# Build the two name lookups; `ntoi` is used further down to find the position of
# 'df_person_title' inside `dfs` (presumably iton = index->name, ntoi = name->index).
iton, ntoi = sl.get_name_dicts(dfs_name)
# Display the export date of the loaded database.
dfs_export_date

Pickle database loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\dfs_complete.pickle
Applied preprocessing: merge_on_jones
Applied preprocessing: remove_jones_duplicates


'2024-10-11'

### Load title map

In [5]:
# Read the title mapping table through the project helper (it resolves the
# 'map_title' name to a CSV file; see the path printed in the cell output).
map_title = read_csv('map_title')
# Report the table size, then preview the first rows.
print(f'{map_title.shape}\tmap_title shape')
map_title.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\data_map\map_title.csv
(2219, 6)	map_title shape


Unnamed: 0,ID_title,title,translation_of_title,type,category,sub_category
0,1,... iry cA-pr,,,,no subcategory
1,2,... n Hwt-wrt,... of the great court/Hall of Justice,administrative title – legal matters,administrative title,legal matters
2,3,... nb.f,,epithet,epithet,no subcategory
3,4,... ncwt,,,,no subcategory
4,5,... pr-aA,... of the Great House,,,no subcategory


In [6]:
# Show every row whose title occurs more than once (all occurrences are kept).
is_repeated_title = map_title['title'].duplicated(keep=False)
map_title.loc[is_repeated_title]

Unnamed: 0,ID_title,title,translation_of_title,type,category,sub_category
388,460,imy-rA gc(w),overseer of a gang/side (phyle/gang of workers),administrative title – organization of labour,,no subcategory
2157,2293,imy-rA gc(w),overseer of leather-workers,,,


In [7]:
# Lookup: title -> category.
# NOTE: titles are not unique in map_title; for a repeated title only the
# category of its last row survives in the dictionary.
title_category = map_title.set_index('title')['category']
category_dict = title_category.to_dict()
len(category_dict)

2218

In [8]:
# Lookup: title category -> cluster, provided by the project helper.
cluster_dict = get_category_aggregation()
# Titles without a category (NaN key) are given an empty-string cluster.
cluster_dict.update({np.nan: ''})
cluster_dict

{'priestly title': 'priest',
 'administrative title': 'administration',
 'epithet': 'epithet',
 'rank title': 'rank',
 'family relation': 'family',
 'privy to the secret': 'privy',
 'honorific title': 'rank',
 'royal affiliation': 'rank',
 'privacy of king': 'privy',
 'craft': 'worker',
 'body care': 'worker',
 'household management': 'worker',
 'entertainment': 'worker',
 'physician': 'worker',
 'privacy of great house': 'privy',
 'religious title': 'priest',
 'epithet of queen': 'epithet',
 'municipal administration': 'administration',
 'education': 'worker',
 'privacy of palace': 'privy',
 'legal matters': 'administration',
 'farming': 'worker',
 'uncertain': 'uncertain',
 'foreign land': 'administration',
 nan: ''}

In [9]:
# Encode title to cluster by chaining the two lookups (title -> category -> cluster).
ttoc = {
    title: cluster_dict[category]
    for title, category in category_dict.items()
}
print(len(ttoc))

2218


### Make pivot table

In [10]:
# Translate each row's category into its cluster and store it as a new column.
cluster_per_row = map_title['category'].map(cluster_dict)
map_title['cluster'] = cluster_per_row
# Preview the resulting mapping table.
map_title.head()

Unnamed: 0,ID_title,title,translation_of_title,type,category,sub_category,cluster
0,1,... iry cA-pr,,,,no subcategory,
1,2,... n Hwt-wrt,... of the great court/Hall of Justice,administrative title – legal matters,administrative title,legal matters,administration
2,3,... nb.f,,epithet,epithet,no subcategory,epithet
3,4,... ncwt,,,,no subcategory,
4,5,... pr-aA,... of the Great House,,,no subcategory,


In [11]:
# Inspect the rows that ended up in the 'uncertain' cluster.
is_uncertain = map_title['cluster'].eq('uncertain')
map_title.loc[is_uncertain]

Unnamed: 0,ID_title,title,translation_of_title,type,category,sub_category,cluster
9,10,...tT,,uncertain,uncertain,no subcategory,uncertain
10,15,[Hm]t.f,his wife,family relation,uncertain,no subcategory,uncertain
2066,2193,imy-rA Hwt mHa,overseer of the house of flax,,uncertain,no subcategory,uncertain


In [12]:
# Any title whose *type* is "family relation" is clustered as "family",
# regardless of the cluster derived from its category.
is_family_relation = map_title['type'].eq('family relation')
map_title.loc[is_family_relation, 'cluster'] = 'family'

In [13]:
# Blank out the 'uncertain' cluster: those titles get a missing (NaN) cluster.
still_uncertain = map_title['cluster'] == 'uncertain'
map_title.loc[still_uncertain, 'cluster'] = np.nan

In [14]:
# Check that no 'uncertain' cluster is left (the result should be an empty frame).
map_title[map_title['cluster'].isin(['uncertain'])]

Unnamed: 0,ID_title,title,translation_of_title,type,category,sub_category,cluster


In [36]:
# Number of titles that carry a non-missing cluster (the empty string counts as a value).
map_title['cluster'].notna().sum()

2217

In [35]:
# Size of each cluster, largest first; missing clusters are not listed.
cluster_sizes = map_title['cluster'].value_counts()
cluster_sizes

cluster
                  654
administration    427
priest            348
epithet           289
family            163
worker            125
privy             117
rank               94
Name: count, dtype: int64

In [14]:
# Load df_person_title from the pickled database.
# The raw frame is kept untouched: de-duplicating in place would modify the
# object stored inside `dfs`, so re-running this cell would report the already
# de-duplicated shape as the "original" one.
df_person_title_raw = dfs[ntoi['df_person_title']]
print(f'{df_person_title_raw.shape} original shape')
# Drop fully identical rows, keeping the first occurrence.
df_person_title = df_person_title_raw.drop_duplicates(keep='first')
print(f'{df_person_title.shape} shape after dropped duplicates')
df_person_title.head()

(14182, 3) original shape
(14119, 3) shape after dropped duplicates


Unnamed: 0,ID_official,ID_title,ID_person
0,748,290.0,888
1,1,247.0,322
2,2,316.0,323
3,2,283.0,323
4,4,316.0,325


In [15]:
# Report the size of df_person_title and how many distinct persons / titles it holds.
unique_person_shape = df_person_title["ID_person"].unique().shape
unique_title_shape = df_person_title["ID_title"].unique().shape
print(f'shape={df_person_title.shape}, df_person_title shape')
print(f'shape={unique_person_shape}, uniques persons shape')
print(f'shape={unique_title_shape}, uniques titles shape')
# Keep only the person/title pair columns (no rows are removed here).
df_titles_all = df_person_title.loc[:, ['ID_person', 'ID_title']]
df_titles_all.head()

shape=(14119, 3), df_person_title shape
shape=(3998,), uniques persons shape
shape=(2184,), uniques titles shape


Unnamed: 0,ID_person,ID_title
0,888,290.0
1,322,247.0
2,323,316.0
3,323,283.0
4,325,316.0


In [16]:
# Attach the title attributes (including the cluster) to every person/title pair.
# The empty-string cluster (titles without a category) is relabelled as 'none';
# clusters that are missing (NaN) are left as they are.
data = (
    df_titles_all
    .merge(map_title, on='ID_title', how='left')
    .assign(cluster=lambda merged: merged['cluster'].replace('', 'none'))
)
print(data.shape)
data.head()

(14119, 8)


Unnamed: 0,ID_person,ID_title,title,translation_of_title,type,category,sub_category,cluster
0,888,290.0,Smcw pr-aA,follower of the Great House,,,no subcategory,none
1,322,247.0,iry xt ncwt,property custodian of the king,rank title,administrative title,legal matters,administration
2,323,316.0,xnty-S,"land-tenant/tenant-landholder, xnty-S official...",,,no subcategory,none
3,323,283.0,pr(y)-aA,Belonging to the Great Court,,rank title,no subcategory,rank
4,325,316.0,xnty-S,"land-tenant/tenant-landholder, xnty-S official...",,,no subcategory,none


In [17]:
# Column whose values become the pivot columns.
encoding_to = 'cluster'
data_job = data[['ID_person', encoding_to]].copy()

# Count table: one row per person, one column per cluster, cell = number of
# titles of that cluster held by the person (0 where there is none).
pivot_table_counts = data_job.pivot_table(index='ID_person', columns=encoding_to, aggfunc='size', fill_value=0)
# save pivot_table_counts
save_df(pivot_table_counts, f'person_title_{encoding_to}_pivot_v2', save_index=True)

# One-hot table: after removing repeated (person, cluster) pairs every cell is
# 1 if the person holds at least one title of that cluster, else 0.
data_job = data_job.drop_duplicates()
pivot_table = data_job.pivot_table(index='ID_person', columns=encoding_to, aggfunc='size', fill_value=0)
# save pivot_table (one-hot encoding); previously the counts table was written
# to this file by mistake
save_df(pivot_table, f'person_title_{encoding_to}_ohe_v2', save_index=True)

Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\person_title_cluster_pivot_v2.csv
Dataframe saved into C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\person_title_cluster_ohe_v2.csv


In [18]:
# Size of the count table (persons x clusters), followed by a preview.
counts_shape = pivot_table_counts.shape
print(counts_shape)
pivot_table_counts.head(n=5)

(3977, 8)


cluster,administration,epithet,family,none,priest,privy,rank,worker
ID_person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,16,12,4,20,24,11,7,6
2,0,3,4,4,3,0,2,0
4,8,4,0,7,3,0,2,0
5,6,4,0,4,0,0,2,0
6,1,4,5,11,3,0,0,1


In [19]:
# Preview the one-hot table built from the de-duplicated (person, cluster) pairs.
pivot_table.head(n=5)

cluster,administration,epithet,family,none,priest,privy,rank,worker
ID_person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1,1,1,1,1,1,1,1
2,0,1,1,1,1,0,1,0
4,1,1,0,1,1,0,1,0
5,1,1,0,1,0,0,1,0
6,1,1,1,1,1,0,0,1


### merge vizier to pivot_table_counts

In [20]:
# Read the 'vizier' table through the project helper; per the displayed output
# it holds one `vizier` value per ID_person.
vizier = read_csv('vizier')
# Report the table size, then preview the first rows.
print(vizier.shape)
vizier.head()

CSV file loaded.
C:\Users\Stoja\OneDrive\Documents\diplomka\scr\data\unpacked\df_person_all\vizier.csv
(4962, 2)


Unnamed: 0,ID_person,vizier
0,322,0
1,323,0
2,324,0
3,325,0
4,326,0


In [21]:
# Inner-join the vizier table with the per-person cluster counts; only persons
# present in both tables are kept.
person_cluster_counts = pivot_table_counts.reset_index()
vizier_pivot_table_counts = vizier.merge(person_cluster_counts, how='inner', on='ID_person')
vizier_pivot_table_counts

Unnamed: 0,ID_person,vizier,administration,epithet,family,none,priest,privy,rank,worker
0,322,0,1,0,0,0,0,0,0,0
1,323,0,0,0,0,1,0,0,1,0
2,325,0,0,0,0,1,0,0,0,0
3,326,0,0,0,0,1,0,0,0,0
4,327,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
3972,4959,0,0,0,0,0,1,0,0,0
3973,4960,0,3,0,0,0,0,0,0,0
3974,4961,0,1,1,0,1,0,0,1,1
3975,4962,0,2,0,0,8,0,2,2,0


In [22]:
# Persons holding at least one title from the 'worker' cluster.
has_worker_title = vizier_pivot_table_counts['worker'].gt(0)
workers = vizier_pivot_table_counts[has_worker_title]
print(workers.shape)
# Among them, the rows with vizier == 1.
workers.loc[workers['vizier'].eq(1)]

(387, 10)


Unnamed: 0,ID_person,vizier,administration,epithet,family,none,priest,privy,rank,worker
117,1,1,16,12,4,20,24,11,7,6
123,56,1,0,2,0,5,0,2,0,1
705,1242,1,5,11,2,9,7,1,6,2
880,117,1,6,6,2,5,6,4,3,1
1116,1686,1,12,6,5,12,11,6,2,2
1152,240,1,4,5,0,7,3,1,3,2
1252,29,1,4,6,1,7,0,3,0,2
1359,1926,1,4,0,0,5,7,1,2,2
1586,2183,1,11,4,2,7,3,6,5,2
1615,70,1,8,6,1,10,5,5,3,2


In [23]:
# Worker-cluster persons with vizier == 0.
workers.loc[workers['vizier'].eq(0)]

Unnamed: 0,ID_person,vizier,administration,epithet,family,none,priest,privy,rank,worker
60,419,0,0,0,0,0,0,0,0,1
61,420,0,0,0,0,1,0,0,0,1
66,430,0,0,0,0,0,0,0,1,1
67,431,0,0,0,0,1,0,0,0,1
74,441,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
3943,4926,0,0,0,1,0,0,0,0,1
3946,315,0,5,0,0,11,2,2,0,1
3950,4932,0,0,0,0,2,0,0,1,1
3974,4961,0,1,1,0,1,0,0,1,1


In [24]:
# Compare the size of the full vizier table with the worker subset.
print(
    f'{vizier.shape}\tshape of vizier',
    f'{workers.shape}\tshape of workers',
    sep='\n',
)

(4962, 2)	shape of vizier
(387, 10)	shape of workers


In [25]:
# Summary statistics of every column for the worker subset.
workers.describe()

Unnamed: 0,ID_person,vizier,administration,epithet,family,none,priest,privy,rank,worker
count,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0,387.0
mean,2210.713178,0.121447,2.914729,1.754522,0.617571,2.997416,1.669251,0.852713,1.198966,1.390181
std,1576.418104,0.327069,3.395435,2.57211,1.004395,3.588625,2.604134,1.445562,1.439343,0.863903
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,520.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2284.0,0.0,2.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0
75%,3605.5,0.0,5.0,2.0,1.0,4.0,2.0,1.0,2.0,1.0
max,4961.0,1.0,23.0,16.0,7.0,21.0,24.0,11.0,7.0,6.0
