In [1]:
from collections import Counter

import pandas as pd
# Read the source CSV into the frame every later cell works on.
whoiswho = pd.read_csv(
    "cross-verified-database.csv",
    encoding='utf-8',
)

In [2]:
# Number of distinct names per main occupation, as a one-column frame.
occupations = (
    whoiswho.groupby("level3_main_occ")["name"]
    .nunique()
    .to_frame()
    .rename(columns={"name": "count"})
)

# Every distinct value of the main-occupation column.
all_occ = whoiswho["level3_main_occ"].drop_duplicates().to_list()

# Labels of the 100 main occupations with the most distinct names.
top_occ = (
    whoiswho.groupby("level3_main_occ")["name"]
    .nunique()
    .nlargest(100)
    .index.to_list()
)

In [3]:
# Most common main occupations first.
occupations.sort_values("count", ascending=False)

Unnamed: 0_level_0,count
level3_main_occ,Unnamed: 1_level_1
politician,270431
football,250439
actor,121761
writer,74709
painter,60175
...,...
fakultet,1
modèle,1
facteur,1
expédition,1


There are almost 5,000 unique occupations listed as the main occupation, the most common being politician, football (…player, …coach etc.) and actor. 

In [4]:
# Distinct birth years per main occupation, restricted to the top-100 occupations.
(
    whoiswho[whoiswho["level3_main_occ"].isin(top_occ)]
    .groupby("level3_main_occ")["birth"]
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
)

Unnamed: 0_level_0,birth
level3_main_occ,Unnamed: 1_level_1
politician,1545
priest,1254
poet,1145
aristocrat,1025
king,962
...,...
judo,116
presenter,111
canoe,108
handball,100


Of the 100 most frequently listed occupations, politicians cover the longest time span: notable politicians were born in 1,545 distinct years. At the tail end: computer scientists and sportspeople.

## Human Swiss army knives

Let's find the people with the highest number of listed occupations.

In [5]:
def nicerOccupations(x):
    """Turn a raw occupation string into a list of occupation tokens.

    Every "D:_" marker is stripped, everything from the first "P:" onwards is
    discarded, and the remainder is split on underscores. Tokens shorter than
    two characters (empty strings, single letters) are dropped.

    Parameters
    ----------
    x : str
        Raw value of the occupation column. Any non-string value (for example
        the NaN pandas uses for an empty CSV cell) yields an empty list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order (duplicates are kept).
    """
    if not isinstance(x, str):
        return []
    tokens = x.replace("D:_", "").split("P:")[0].split("_")
    # Build a new list instead of removing items from the list being iterated:
    # removing in place makes the loop skip the element that follows each
    # removed one, so some short/empty tokens used to survive.
    return [token for token in tokens if len(token) >= 2]

In [6]:
# Parse each raw occupation string into a list of occupation tokens.
whoiswho["all_occ"] = whoiswho["level3_all_occ"].map(nicerOccupations)

In [7]:
# Number of occupation tokens listed for each person.
whoiswho["no_occ"] = whoiswho["all_occ"].apply(len)

In [8]:
# The 20 rows with the highest number of listed occupations.
(
    whoiswho[["name", "no_occ", "all_occ"]]
    .sort_values("no_occ", ascending=False)
    .head(20)
)

Unnamed: 0,name,no_occ,all_occ
1336243,Johann_Wolfgang_von_Goethe,30,"[poet, theatre, botanist, politician, painter,..."
21862,B._R._Ambedkar,26,"[economist, politician, essayist, lawyer, juri..."
598608,Nikolai_Fomenko,24,"[singer, comedian, music, presenter, radio, sk..."
369289,Andrey_Makarevich,23,"[singer, poet, composer, presenter, actor, arc..."
12936,Madonna_(entertainer),21,"[singer, actor, film, composer, film, writer, ..."
485,Benjamin_Franklin,21,"[writer, politician, publisher, printer, theor..."
7302,Alexander_von_Humboldt,20,"[geologist, explorer, botanist, geographer, ge..."
2244795,Alexandra_Jacobi,19,"[man, of, letters, correspondent, journalist, ..."
8463,Boris_Vian,19,"[polymath, singer, poet, trumpet, lyricist, pa..."
143827,Shen_Kuo,19,"[physicist, cartographer, mathematician, astro..."


No surprise here.

## No (wo)men allowed

There are two ways to identify the professions with the largest gender gap. First, we can simply look at the ratio of men to women in each occupation. Second, we can list the most frequent occupations that have no notable women, or no notable men, at all.

In [9]:
# Per-occupation head counts for each gender, broadcast back onto every row.
#
# The counts are attached with Series.map rather than by re-assigning
# `whoiswho` to the result of a merge: mapping overwrites the column in place,
# so the cell can be re-run safely (re-running a merge leaves suffixed
# duplicate columns behind and breaks the cells below), and it keeps the
# original row order and index. Occupations with no person of the given
# gender end up as NaN, which the later cells rely on.
occ_total_men = (
    whoiswho[whoiswho["gender"] == "Male"]
    .groupby("level3_main_occ")
    .size()
    .rename("occ_total_men")
)
whoiswho["occ_total_men"] = whoiswho["level3_main_occ"].map(occ_total_men)

occ_total_women = (
    whoiswho[whoiswho["gender"] == "Female"]
    .groupby("level3_main_occ")
    .size()
    .rename("occ_total_women")
)
whoiswho["occ_total_women"] = whoiswho["level3_main_occ"].map(occ_total_women)

In [10]:
# Men per woman within each person's main occupation (NaN when either count is missing).
whoiswho["gender_ratio"] = whoiswho["occ_total_men"].div(whoiswho["occ_total_women"])

In [11]:
# The ten main occupations with the highest men-to-women ratio.
(
    whoiswho.groupby("level3_main_occ")["gender_ratio"]
    .first()
    .nlargest(10)
    .to_frame()
)

Unnamed: 0_level_0,gender_ratio
level3_main_occ,Unnamed: 1_level_1
prelate,3900.0
leutnant,847.0
organbuilder,758.0
samurai,714.0
landrat,711.0
theologe,522.0
rikishi,259.5
gallantry,223.0
katholischer,220.333333
bischof,218.0


In [12]:
# The ten main occupations with the lowest men-to-women ratio.
(
    whoiswho.groupby("level3_main_occ")["gender_ratio"]
    .first()
    .nsmallest(10)
    .to_frame()
)

Unnamed: 0_level_0,gender_ratio
level3_main_occ,Unnamed: 1_level_1
netball,0.006135
nun,0.006986
camogie,0.008097
actress,0.009036
beauty_pageant,0.010462
feminist,0.013025
abbess,0.013158
mistress,0.016529
wife_of,0.018587
first_lady,0.018868


In [13]:
# Largest main occupations for which no man was counted.
(
    whoiswho[whoiswho["occ_total_men"].isna()]
    .groupby("level3_main_occ")
    .size()
    .nlargest(5)
    .to_frame()
)

Unnamed: 0_level_0,0
level3_main_occ,Unnamed: 1_level_1
schermitrice,149
nobildonna,146
salonnière,86
suffragist,76
principessa,64


In [14]:
# Largest main occupations for which no woman was counted.
(
    whoiswho[whoiswho["occ_total_women"].isna()]
    .groupby("level3_main_occ")
    .size()
    .nlargest(5)
    .to_frame()
)

Unnamed: 0_level_0,0
level3_main_occ,Unnamed: 1_level_1
hurler,1989
generalmajor,1323
flying_ace,905
geheimrat,428
adliger,344


In [15]:
# Number of rows per main occupation, broadcast back onto every row.
# Series.map overwrites the column in place, so re-running the cell is safe
# (re-assigning `whoiswho` to a merge result would leave suffixed duplicate
# columns behind on a second run) and the row order is left untouched.
occ_total = whoiswho.groupby("level3_main_occ").size().rename("occ_total")
whoiswho["occ_total"] = whoiswho["level3_main_occ"].map(occ_total)

## Regionally specific occupations

In [16]:
# Among main occupations with more than 300 rows, the 25 that occur in the
# fewest distinct UN subregions.
(
    whoiswho[whoiswho["occ_total"].gt(300)]
    .groupby("level3_main_occ")["un_subregion"]
    .nunique()
    .nsmallest(25)
)

level3_main_occ
gaelic_football         1
révolution_française    2
hurler                  3
samurai                 3
kloster                 4
lacrosse                4
seiyū                   4
ämbetsman               4
earl                    5
geheimrat               5
graf                    5
nascar                  5
organbuilder            5
résistant               5
abt                     6
landrat                 6
nobile                  6
troubadour              6
adliger                 7
nobre                   7
romanist                7
feudatory               8
fiddler                 8
germanist               8
goldsmith               8
Name: un_subregion, dtype: int64

[Gaelic football](https://en.wikipedia.org/wiki/Gaelic_football) is listed as the main occupation of more than 300 notable individuals, all of whom come from a single subregion.

## My profession is X. Give me some side job ideas!

In [17]:
def side_occ(occupation, frame):
    """Rank the other occupations listed by people whose main occupation is `occupation`.

    Parameters
    ----------
    occupation : str
        Value of ``level3_main_occ`` to select.
    frame : pandas.DataFrame
        Must contain a ``level3_main_occ`` column and an ``all_occ`` column
        holding a list of occupation tokens per row.

    Returns
    -------
    pandas.DataFrame
        One row per distinct side occupation, in a single column
        ``second_occupation``. The index holds the number of times that
        occupation is listed, sorted from most to least frequent. Index values
        repeat when several occupations share the same count.
    """
    selected = frame[frame["level3_main_occ"] == occupation]
    # Count in a single pass. The result used to be stored in a dict keyed by
    # the count, so occupations with equal counts overwrote one another and
    # all but one of them were silently dropped.
    tally = Counter(
        listed
        for listed_occupations in selected["all_occ"]
        for listed in listed_occupations
        if listed != occupation
    )
    ranked = tally.most_common()  # already ordered by descending count
    return pd.DataFrame(
        {"second_occupation": [name for name, _ in ranked]},
        index=[count for _, count in ranked],
    )

In [25]:
# Most frequently listed side occupations of architects.
side_occ(occupation="architect", frame=whoiswho).head(60)

Unnamed: 0,second_occupation
673,academic
475,engineer
469,planner
324,painter
237,designer
160,politician
153,sculptor
124,writer
84,historian
73,art


## Relations between specific fields and professions

In [19]:
# Distinct level-2 main occupations in alphabetical order, without the
# "Missing" placeholder.
level2occs = whoiswho["level2_main_occ"].drop_duplicates().to_list()
level2occs.remove("Missing")
level2occs.sort()

# For every main occupation: the share of its rows that carry each of the
# other level-2 labels as second occupation.
level2_main_occ = {}
for main_label in level2occs:
    main_rows = whoiswho[whoiswho["level2_main_occ"] == main_label]
    n_main = len(main_rows)
    second_labels = main_rows["level2_second_occ"]
    level2_main_occ[main_label] = {
        second_label: (second_labels == second_label).sum() / n_main
        for second_label in level2occs
    }

# Outer keys (main occupation) become columns, inner keys (second occupation) rows.
level2_main_occ = pd.DataFrame(level2_main_occ).sort_index()
level2_main_occ.style.background_gradient(axis=None, cmap='YlOrRd')

Unnamed: 0,Academia,Administration/Law,Corporate/Executive/Business (large),Culture-core,Culture-periphery,Explorer/Inventor/Developer,Family,Military,Nobility,Other,Politics,Religious,Sports/Games,Worker/Business (small)
Academia,4e-06,0.059115,0.040872,0.074289,0.062865,0.155168,0.03965,0.029477,0.008699,0.029494,0.068696,0.113168,0.006462,0.097868
Administration/Law,0.034108,0.0,0.035469,0.009655,0.01958,0.018366,0.022539,0.037118,0.022734,0.054805,0.119819,0.023833,0.004111,0.033515
Corporate/Executive/Business (large),0.024188,0.02388,0.0,0.011638,0.041166,0.104673,0.031187,0.020899,0.007605,0.062967,0.057773,0.005513,0.013489,0.101421
Culture-core,0.103575,0.030344,0.050339,0.0,0.285613,0.043351,0.05161,0.023092,0.018865,0.057686,0.021398,0.049379,0.013537,0.103395
Culture-periphery,0.037794,0.014958,0.036701,0.136831,0.0,0.047471,0.015271,0.006734,0.005257,0.023596,0.026625,0.01079,0.00732,0.0701
Explorer/Inventor/Developer,0.040181,0.004786,0.060066,0.006077,0.017819,0.0,0.018307,0.033738,0.005737,0.01413,0.013308,0.004891,0.005252,0.041367
Family,0.018865,0.014333,0.047421,0.02238,0.022968,0.030157,0.0,0.039409,0.369,0.088758,0.025356,0.041214,0.011187,0.046148
Military,0.021119,0.038304,0.036593,0.009013,0.013848,0.076751,0.067801,0.0,0.073272,0.058029,0.056025,0.016441,0.009125,0.02246
Nobility,0.007878,0.022423,0.018675,0.007053,0.0087,0.00995,0.304048,0.086181,0.0,0.04287,0.02261,0.031708,0.003296,0.026584
Other,0.015467,0.016896,0.055743,0.01254,0.038116,0.02459,0.027967,0.059037,0.022147,0.0,0.020117,0.023736,0.009018,0.027505


(Columns: main occupation; rows: second occupation. Each cell is the share of people with the column's main occupation whose second occupation is the row's.)

In [20]:
def correlation(frame, size):
    top = frame.groupby("level3_main_occ")["name"].nunique().nlargest(size)
    top = top.index.to_list()
    top = sorted(top)
    matrix = {}
    
    for occupation in top:
    
        whoiswho_filtered = frame[frame["level3_main_occ"] == occupation]
        total = whoiswho_filtered.shape[0]
        combinations = whoiswho_filtered["all_occ"].to_list()
    
        side_occ_raw = []
    
        for x in combinations:
            for y in x:
                if y in top:
                    side_occ_raw.append(y)
    
        row = {}
        side_occ_raw = sorted(side_occ_raw)
        
        notanymore = []
        
        for x in side_occ_raw:
            if x not in notanymore:
                count = side_occ_raw.count(x)
                ratio = count / total
                row[x] = ratio
                notanymore.append(x)
    
        matrix[occupation] = row
    
    matrix = pd.DataFrame(matrix).sort_index()
    
    return(matrix.style.background_gradient(axis=None, cmap='YlOrRd'))

In [21]:
# Co-occurrence matrix for the 11 most common main occupations outside Sports/Games.
correlation(whoiswho[whoiswho["level2_main_occ"].ne("Sports/Games")], size=11)

Unnamed: 0,actor,composer,film,journalist,lawyer,music,painter,politician,priest,singer,writer
actor,1.505575,0.012125,0.169485,0.005766,0.001507,0.018771,0.002143,0.000643,0.000178,0.136057,0.011858
composer,0.007037,0.98173,0.008455,0.000277,0.000301,0.093855,0.001578,0.000152,0.000565,0.093353,0.003292
film,0.046482,0.0448,1.141243,0.006348,0.000402,0.007765,0.001445,0.000137,5.9e-05,0.004876,0.008392
journalist,0.005583,0.005749,0.026785,0.983117,0.025798,0.012076,0.00324,0.018239,0.00333,0.00372,0.126934
lawyer,0.000591,0.000892,0.001208,0.002772,0.794519,0.00052,0.000282,0.035821,0.000922,0.000525,0.003801
music,0.010149,0.233514,0.00675,0.004436,0.000528,0.985142,0.001545,0.000218,0.000357,0.07923,0.006705
painter,0.002086,0.003436,0.007282,0.000554,0.000477,0.002446,0.998671,0.000436,0.000476,0.002459,0.007589
politician,0.003054,0.002478,0.005719,0.022261,0.34349,0.002813,0.002343,0.951537,0.011595,0.00269,0.01574
priest,0.000107,0.00294,7.1e-05,0.000499,0.000904,0.00107,0.000332,0.001538,0.970715,0.00021,0.004002
singer,0.056368,0.042553,0.004867,0.000776,0.000402,0.086334,0.00113,0.000189,8.9e-05,1.04365,0.002476
