In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
people_df = pd.read_csv("../output/persons.csv", sep = ";", encoding = "utf8")
jobs_df = pd.read_csv("../output/jobs.csv", sep = ";", encoding = "utf8")
voyages_df = pd.read_csv("../output/voyages.csv", sep = ";", encoding = "utf8")
ships_df = pd.read_csv("../output/ships.csv", sep = ";", encoding = "utf8")

Add a list of ranks ordered according to EIC pay grades

In [3]:
ranks_ordered_list = [
    "apprentice", "purser", "seaman", 
    "capt's servant", "quarter master", "midshipman", "coxswain",
    "lieutenant", "boatswain", "6th mate", "5th mate", 
    "4th mate", "surgeon's mate", "surgeon",
    "3rd mate", "2nd mate", "1st mate", 
    "master", "capt", "passenger"
]

Categorise ranks

In [4]:
rank_category_dict = {
"before_mast": ["apprentice", "purser", "seaman"],
"middle": ["capt's servant", "quarter master", "midshipman", "coxswain", "lieutenant", "boatswain"],
"low_mates": ["6th mate", "5th mate", "4th mate", "surgeon's mate", "surgeon"],
"high_mates": ["3rd mate", "2nd mate", "1st mate"],
"master_capt": ["master", "capt", "passenger"]
}

## Initial exploration

How many jobs are there?

In [5]:
len(jobs_df.index)

42955

How much of that does each rank take up?

In [6]:
job_percentages_df = pd.DataFrame(jobs_df.value_counts("rank"))
job_percentages_df = job_percentages_df.reset_index().rename(columns = {0: "count"})
job_percentages_df["percentage"] = job_percentages_df["count"] / job_percentages_df["count"].sum()
job_percentages_df

Unnamed: 0,rank,count,percentage
0,capt,5585,0.13002
1,surgeon,4890,0.11384
2,2nd mate,4786,0.111419
3,1st mate,4615,0.107438
4,3rd mate,4046,0.094192
5,purser,3923,0.091328
6,4th mate,3815,0.088814
7,midshipman,3306,0.076964
8,5th mate,2930,0.068211
9,6th mate,1730,0.040275


What's the average crew size per voyage?

In [7]:
voyages_ships_df = pd.merge(voyages_df, ships_df,
                           how = "left",
                           on = "ship_id")
voyages_ships_df = voyages_ships_df.loc[voyages_ships_df["crew"].notna()]
voyages_ships_df["crew"].mean(), 1 / voyages_ships_df["crew"].mean()

(87.19080174021131, 0.01146909972200442)

## Careers

How did employees start and end their careers? Since we are interested in career progression, we filter out those careers that only have a single job.

In [8]:
people_with_jobs = list(set(jobs_df["person_id"].tolist()))
first_rank_dict = {}
for rank in ranks_ordered_list:
    first_rank_dict[rank] = 0
for person in people_with_jobs:
    rank_list = jobs_df.loc[jobs_df["person_id"] == person]["rank"].tolist()
    if len(rank_list) > 1:
        first_rank = rank_list[0]
        first_rank_dict[first_rank] += 1
first_rank_df = pd.DataFrame.from_dict(first_rank_dict, orient = "index")
first_rank_df = first_rank_df.reset_index().rename(columns = {"index": "rank", 0: "count"})
first_rank_df = first_rank_df.sort_values(by = "count", ascending = False)

What are the percentages of ranks in the first ranks of careers?

In [9]:
first_rank_df["percentage"] = first_rank_df["count"] / first_rank_df["count"].sum()
first_rank_df.head()

Unnamed: 0,rank,count,percentage
5,midshipman,1043,0.171292
13,surgeon,744,0.122188
2,seaman,678,0.111348
1,purser,598,0.09821
10,5th mate,530,0.087042


In [10]:
last_rank_dict = {}
for rank in ranks_ordered_list:
    last_rank_dict[rank] = 0
for person in people_with_jobs:
    rank_list = jobs_df.loc[jobs_df["person_id"] == person]["rank"].tolist()
    if len(rank_list) > 1:
        last_rank = rank_list[-1]
        last_rank_dict[last_rank] += 1
last_rank_df = pd.DataFrame.from_dict(last_rank_dict, orient = "index")
last_rank_df = last_rank_df.reset_index().rename(columns = {"index": "rank", 0: "count"})
last_rank_df = last_rank_df.sort_values(by = "count", ascending = False)

What are the percentages of ranks in the last ranks of careers?

In [11]:
last_rank_df["percentage"] = last_rank_df["count"] / last_rank_df["count"].sum()
last_rank_df.head()

Unnamed: 0,rank,count,percentage
18,capt,1275,0.209394
16,1st mate,916,0.150435
13,surgeon,887,0.145673
15,2nd mate,851,0.13976
14,3rd mate,641,0.105272


Let's plot how people moved from a first to a last job.

In [12]:
rank_id_dict = {}
counter = 1
for rank in ranks_ordered_list:
    rank_id_dict[rank] = counter
    counter += 1
with open("./jobs/nodes.csv", "w") as file:
    file.write("ID;label;category;count\n")
    for category in rank_category_dict:
        for rank in rank_category_dict[category]:
            count = str(len(jobs_df.loc[jobs_df["rank"] == rank].index))
            file.write(str(rank_id_dict[rank]) + ";" + rank + ";" + category + ";" + count + "\n")

In [13]:
first_last_rank_dict = {}
for first_rank in ranks_ordered_list:
    first_last_rank_dict[first_rank] = {}
    for last_rank in ranks_ordered_list:
        first_last_rank_dict[first_rank][last_rank] = 0
for person in people_with_jobs:
    rank_list = jobs_df.loc[jobs_df["person_id"] == person]["rank"].tolist()
    if len(rank_list) > 1:
        first_rank = rank_list[0]
        last_rank = rank_list[-1]
        first_last_rank_dict[first_rank][last_rank] += 1
with open("./jobs/first_last_edges.csv", "w") as file:
    file.write("Source;Target;Weight;Direction\n")
    for first_rank in first_last_rank_dict:
        for last_rank in first_last_rank_dict[first_rank]:
            if not first_last_rank_dict[first_rank][last_rank] == 0:
                first_rank_id = rank_id_dict[first_rank]
                last_rank_id = rank_id_dict[last_rank]
                file.write(str(first_rank_id) + ";" + str(last_rank_id) + ";" + str(first_last_rank_dict[first_rank][last_rank]) + ";")
                if last_rank_id > first_rank_id:
                    file.write("promotion\n")
                elif last_rank_id == first_rank_id:
                    file.write("unchanged\n")
                else:
                    file.write("demotion\n")
first_last_rank_df = pd.DataFrame.from_dict(first_last_rank_dict, orient = "index")

In [14]:
def get_od_matrix(df, name):
    styled_df = df.style.background_gradient(cmap = "viridis", axis = 1)
    styled_df = styled_df.set_table_styles([
        {'selector': 'table', 'props': [('font-family', 'Helvetica')]},
        {'selector': 'td.data', 'props': [('font-family', 'Helvetica'), ('border', 'none')],},
        {'selector': 'th.col_heading', 'props': [('font-family', 'Helvetica')]},
        {'selector': 'th.row_heading', 'props': [('font-family', 'Helvetica')]}
    ])
    with open("jobs/" + name + ".html", "w") as file:
        file.write(styled_df.render())
    return styled_df
first_last_rank_styled_df = get_od_matrix(first_last_rank_df, "first_last_job")
first_last_rank_styled_df

  file.write(styled_df.render())


Unnamed: 0,apprentice,purser,seaman,capt's servant,quarter master,midshipman,coxswain,lieutenant,boatswain,6th mate,5th mate,4th mate,surgeon's mate,surgeon,3rd mate,2nd mate,1st mate,master,capt,passenger
apprentice,0,1,0,0,0,1,0,2,0,1,2,28,0,2,42,37,45,3,47,0
purser,0,491,1,0,0,0,0,0,0,4,10,8,0,15,14,11,4,2,38,0
seaman,1,6,2,0,0,6,0,1,0,5,7,108,0,2,109,125,130,1,175,0
capt's servant,0,1,0,0,0,0,0,0,0,2,0,27,0,0,29,33,32,0,47,0
quarter master,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
midshipman,0,27,1,0,0,14,0,2,0,25,39,175,0,12,144,205,194,4,201,0
coxswain,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
lieutenant,0,0,0,0,0,0,0,0,0,0,0,3,0,1,1,3,1,0,0,0
boatswain,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6th mate,0,14,0,0,0,1,0,0,0,26,51,58,0,9,46,49,44,0,46,0


Only the ones with more data

In [15]:
first_last_rank_detail_df = first_last_rank_df.drop(columns = {"purser", "surgeon's mate", "surgeon", "master", "passenger", "boatswain", "coxswain", "lieutenant", "quarter master"},
                                          index = {"purser", "surgeon's mate", "surgeon", "master", "passenger", "boatswain", "coxswain", "lieutenant", "quarter master"})
first_last_rank_detail_styled_df = get_od_matrix(first_last_rank_detail_df, "first_last_job_detail")
first_last_rank_detail_styled_df

  file.write(styled_df.render())


Unnamed: 0,apprentice,seaman,capt's servant,midshipman,6th mate,5th mate,4th mate,3rd mate,2nd mate,1st mate,capt
apprentice,0,0,0,1,1,2,28,42,37,45,47
seaman,1,2,0,6,5,7,108,109,125,130,175
capt's servant,0,0,0,0,2,0,27,29,33,32,47
midshipman,0,1,0,14,25,39,175,144,205,194,201
6th mate,0,0,0,1,26,51,58,46,49,44,46
5th mate,0,0,0,0,4,61,86,85,99,80,97
4th mate,0,0,0,0,2,10,56,108,120,91,114
3rd mate,0,0,0,0,1,3,6,49,96,104,95
2nd mate,0,0,0,0,1,1,2,10,60,93,92
1st mate,0,0,0,0,1,0,3,1,4,83,70


And again for all career movements.

In [16]:
careers_dict = {}
for first_rank in ranks_ordered_list:
    careers_dict[first_rank] = {}
    for second_rank in ranks_ordered_list:
        careers_dict[first_rank][second_rank] = 0
for person in people_with_jobs:
    rank_list = jobs_df.loc[jobs_df["person_id"] == person]["rank"].tolist()
    if len(rank_list) > 1:
        for move in range(len(rank_list) - 1):
            first_rank = rank_list[move]
            last_rank = rank_list[move + 1]
            careers_dict[first_rank][last_rank] += 1
with open("./jobs/careers_edges.csv", "w") as file:
    file.write("Source;Target;Weight;Direction\n")
    for first_rank in careers_dict:
        for second_rank in careers_dict[first_rank]:
            if not careers_dict[first_rank][second_rank] == 0:
                first_rank_id = rank_id_dict[first_rank]
                second_rank_id = rank_id_dict[second_rank]
                file.write(str(first_rank_id) + ";" + str(second_rank_id) + ";" + str(careers_dict[first_rank][second_rank]) + ";")
                if second_rank_id > first_rank_id:
                    file.write("promotion\n")
                elif second_rank_id == first_rank_id:
                    file.write("unchanged\n")
                else:
                    file.write("demotion\n")
careers_df = pd.DataFrame.from_dict(careers_dict, orient = "index")

In [17]:
careers_detail_df = careers_df.drop(columns = {"purser", "surgeon's mate", "surgeon", "master", "passenger", "boatswain", "coxswain", "lieutenant", "quarter master"},
                                          index = {"purser", "surgeon's mate", "surgeon", "master", "passenger", "boatswain", "coxswain", "lieutenant", "quarter master"})

careers_styled_df = get_od_matrix(careers_detail_df, "careers")
careers_styled_df

  file.write(styled_df.render())


Unnamed: 0,apprentice,seaman,capt's servant,midshipman,6th mate,5th mate,4th mate,3rd mate,2nd mate,1st mate,capt
apprentice,123,63,2,49,7,11,7,5,37,16,0
seaman,4,763,12,437,74,114,100,43,86,17,2
capt's servant,0,33,81,104,23,26,4,2,5,1,0
midshipman,0,98,13,1566,335,509,400,141,116,23,1
6th mate,1,7,1,4,435,204,333,108,58,18,8
5th mate,0,11,0,5,14,646,647,439,142,41,11
4th mate,0,2,0,6,7,38,982,992,460,82,27
3rd mate,0,4,1,11,7,16,39,1178,1191,332,60
2nd mate,0,16,0,48,12,36,78,120,1804,1060,303
1st mate,1,4,0,8,4,11,18,50,136,2428,648


## The role of the father's occupation

How many fathers' jobs do we have data on and how many unique jobs are there?

In [18]:
people_father_job_df = people_df.loc[people_df["father_job"].notna()]
len(people_father_job_df.index), people_df["father_job"].nunique()

(163, 64)

In [19]:
people_df["father_job"].unique()

array([nan, 'baillie', 'oilman', 'gent', 'plumber', 'hatter', 'Maj-Gen',
       'mariner', 'vicar', 'Esq', 'sailmaker', 'Surgeon',
       'Custom House agent', 'linendraper', 'cooper', 'minister',
       'hop merchant', 'grocer', 'shipmaster', 'wine merchant',
       'labourer', 'Capt', 'attorney', 'British Consul at Madeira',
       'sumgeom', 'silk manufacturer', 'rector', 'Bart', 'biscuit baker',
       'victualler', 'coal merchant', 'dyer', 'solicitor', 'shipbuilder',
       'Jamaica)', 'shipwright', 'brandy merchant', 'schoolmaster',
       'ropemaker', "banker's clerk", 'waterman', 'bricklayer',
       'merchant', 'vieat', 'banker', 'doctor', 'lecturer', 'farmer',
       '4 merchant', 'warehouseman', 'builder', 'tmexchemt',
       'rector of Calne', 'gardener', 'mercer', 'tinier', 'jun',
       'iron merchant', 'brewer', 'surgeon',
       'midshipman NORTHAMPTON (2) 1802/3', 'Proctor in Doctors Commons',
       'maltster', 'timber merchant', 'curate'], dtype=object)

What are the most frequent fathers' jobs?

In [20]:
father_job_df = people_df.loc[people_df["father_job"].notna()]
father_job_df.value_counts("father_job")[:10]

father_job
Esq           46
gent          18
mariner       12
merchant       9
surgeon        4
vicar          3
rector         3
farmer         3
shipwright     3
Capt           3
dtype: int64

Categorise fathers' jobs

In [21]:
father_category_dict = {
"sea":                          ["mariner", "shipmaster", "capt"],
"labourers":                    ["labourer", "plumber", "builder", "waterman", 
                                "oilman", "cooper", "bricklayer", "warehouseman",
                                "gardener", "farmer"],
"artisans":
                               ["linendraper", "ropemaker", "hatter", "tinier", "dyer", "farmer", "maltster",
                                "sailmaker", "brewer", "biscuit baker", "banker's clerk", "shipwright"],
"merchants":
                               ["timber merchant", "hop merchant", "iron merchant", "coal merchant", "victualler",
                                "wine merchant", "brandy merchant", "grocer", "merchant", "silk manufacturer"],
"educated":
                               ["schoolmaster", "rector", "curate", "lecturer", "doctor", "minister",
                               "surgeon", "proctor in doctors commons", "custom house agent", "baillie", "solicitor",
                               "vicar", "attorney", "banker", "maj-gen"],
"gentry":
                               ["gent", "esq"]
}

Count categories

In [22]:
category_count_dict = {}
for category in father_category_dict:
    category_count_dict[category] = 0
for father_job in people_father_job_df["father_job"].tolist():
    for category in father_category_dict:
        if father_job.lower() in father_category_dict[category]:
            category_count_dict[category] += 1
category_count_df = pd.DataFrame.from_dict(category_count_dict, orient = "index")
category_count_df = category_count_df.reset_index().rename(columns = {"index": "category", 0: "count"})
category_count_df

Unnamed: 0,category,count
0,sea,17
1,labourers,13
2,artisans,16
3,merchants,18
4,educated,25
5,gentry,64


Did people with fathers in higher positions start in a higher position themselves?

In [23]:
people_jobs_df = pd.merge(people_father_job_df, jobs_df,
                         how = "left",
                         on = "person_id")
father_first_job_dict = {}
father_last_job_dict = {}
people_with_fathers_and_jobs = list(set(people_jobs_df.loc[people_jobs_df["father_job"].notna()]["person_id"].tolist()))
for category in father_category_dict.keys():
    father_first_job_dict[category] = {}
    father_last_job_dict[category] = {}
    for job in ranks_ordered_list:
        father_first_job_dict[category][job] = 0
        father_last_job_dict[category][job] = 0
for person in people_with_fathers_and_jobs: 
    first_job = people_jobs_df.loc[people_jobs_df["person_id"] == person]["rank"].tolist()[0]
    last_job = people_jobs_df.loc[people_jobs_df["person_id"] == person]["rank"].tolist()[-1]
    father_job = people_jobs_df.loc[people_jobs_df["person_id"] == person]["father_job"].tolist()[0].lower()
    if first_job in ranks_ordered_list:
        for category in father_category_dict.keys():
            if father_job in father_category_dict[category]:
                father_category = category
        father_first_job_dict[father_category][first_job] += 1
    if last_job in ranks_ordered_list:
        for category in father_category_dict.keys():
            if father_job in father_category_dict[category]:
                father_category = category
        father_last_job_dict[father_category][last_job] += 1
father_first_job_count_df = pd.DataFrame.from_dict(father_first_job_dict, orient = "index")
father_last_job_count_df = pd.DataFrame.from_dict(father_last_job_dict, orient = "index")

In [24]:
father_first_job_count_styled_df = get_od_matrix(father_first_job_count_df, "father_first_job")
father_first_job_count_styled_df

  file.write(styled_df.render())


Unnamed: 0,apprentice,purser,seaman,capt's servant,quarter master,midshipman,coxswain,lieutenant,boatswain,6th mate,5th mate,4th mate,surgeon's mate,surgeon,3rd mate,2nd mate,1st mate,master,capt,passenger
sea,0,0,5,2,0,6,0,0,0,1,2,0,0,0,1,0,0,0,0,0
labourers,0,0,3,0,0,3,0,0,0,1,3,0,1,0,0,0,0,0,0,0
artisans,1,1,7,1,0,6,0,0,0,1,2,0,0,0,0,1,0,0,0,0
merchants,1,1,1,1,0,10,0,0,0,4,1,0,0,0,0,0,0,0,0,0
educated,1,0,2,3,0,14,0,0,0,1,4,0,1,0,1,1,0,0,0,0
gentry,0,1,10,4,0,43,0,0,0,5,2,0,0,1,1,0,0,0,0,0


Did they end in a higher position?

In [25]:
father_last_job_count_styled_df = get_od_matrix(father_last_job_count_df, "father_last_job")
father_last_job_count_styled_df

  file.write(styled_df.render())


Unnamed: 0,apprentice,purser,seaman,capt's servant,quarter master,midshipman,coxswain,lieutenant,boatswain,6th mate,5th mate,4th mate,surgeon's mate,surgeon,3rd mate,2nd mate,1st mate,master,capt,passenger
sea,0,0,0,0,0,0,0,0,0,0,1,4,0,0,2,2,3,0,5,0
labourers,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,3,2,0,3,0
artisans,0,2,0,0,0,0,0,0,0,2,1,0,0,0,4,4,4,0,3,0
merchants,0,2,0,0,0,0,0,0,0,1,0,4,0,0,2,3,2,0,5,0
educated,0,0,0,0,0,1,0,0,0,1,6,4,0,1,2,6,2,0,5,0
gentry,0,2,0,0,0,2,0,0,0,5,7,5,0,2,12,15,9,0,8,0
