In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the operations-per-second table from the shared data directory.
ops_per_sec = pd.read_csv('../data/ops_per_second.csv')

In [3]:
# Load the transistor-count table from the shared data directory.
transistors_per_year = pd.read_csv('../data/transistor_count.csv')

In [4]:
# Load the raw life-expectancy table; it is trimmed and renamed in the next cell.
life_exp = pd.read_csv('../data/life_expectancy.csv')

In [5]:
# Discard the columns that are not carried into the merged table and store the
# estimate ('val') under the name 'life_exp'.
life_exp = (
    life_exp
    .drop(columns=['upper', 'lower', 'measure', 'metric', 'age'])
    .rename(columns={'val': 'life_exp'})
)

In [6]:
# Load the raw death-count table from the shared data directory.
death_counts = pd.read_csv('../data/death_counts.csv')

In [7]:
# Restrict to the rows whose 'age' value is exactly 'All ages'.
death_counts = death_counts[death_counts['age'].eq('All ages')]

In [8]:
# Discard the columns that are not carried into the merged table and store the
# estimate ('val') under the name 'death_count'.
death_counts = (
    death_counts
    .drop(columns=['measure', 'metric', 'upper', 'lower', 'age'])
    .rename(columns={'val': 'death_count'})
)

In [9]:
# Load the raw population table from the shared data directory.
population = pd.read_csv('../data/population.csv')

In [10]:
# Restrict to the rows whose 'age' value is exactly 'All ages'.
population = population[population['age'].eq('All ages')]

In [11]:
# Discard the columns that are not carried into the merged table and store the
# estimate ('val') under the name 'population'.
population = (
    population
    .drop(columns=['measure', 'metric', 'upper', 'lower', 'age'])
    .rename(columns={'val': 'population'})
)

In [12]:
# Load the raw death-rate table (file name indicates values are per 100k).
death_rate = pd.read_csv('../data/death_rate(per100k).csv')

In [13]:
# Restrict to the rows whose 'age' value is exactly 'All ages'.
death_rate = death_rate[death_rate['age'].eq('All ages')]

In [14]:
# Discard the columns that are not carried into the merged table ('sex' is
# dropped here as well) and store the estimate ('val') under the name
# 'death rate(per 100k)'.
death_rate = (
    death_rate
    .drop(columns=['measure', 'sex', 'age', 'metric', 'upper', 'lower'])
    .rename(columns={'val': 'death rate(per 100k)'})
)

In [15]:
# Load the raw prevalence-rate table (file name indicates values are per 100k).
# The spelling 'prevelance' is the actual file name on disk and must be kept.
prev_rate = pd.read_csv('../data/prevelance_rate(per100k).csv')

In [16]:
# Restrict to the rows whose 'age' value is exactly 'All ages'.
prev_rate = prev_rate[prev_rate['age'].eq('All ages')]

In [17]:
# Discard the columns that are not carried into the merged table
# ('sex' is dropped here as well).
prev_rate = prev_rate.drop(
    columns=['measure', 'sex', 'age', 'metric', 'upper', 'lower']
)

In [18]:
# Store the estimate ('val') under the name 'prevelance rate(per 100k)'.
# The spelling is kept because later cells reference this exact column label.
prev_rate = prev_rate.rename(
    {'val': 'prevelance rate(per 100k)'},
    axis='columns',
)

In [19]:
# Combine the five prepared tables into one. No keys are given, so each step
# is pandas' default inner join on whatever columns the two frames share.
death_data = (
    life_exp
    .merge(population)
    .merge(death_counts)
    .merge(death_rate)
    .merge(prev_rate)
)

In [20]:
# Lethality = death rate expressed as a percentage of the prevalence rate.
# NOTE(review): a prevalence rate of 0 would give inf (or NaN for 0/0) here;
# confirm whether such rows exist in the source data.
death_data['lethality'] = (
    death_data['death rate(per 100k)']
    .div(death_data['prevelance rate(per 100k)'])
    .mul(100)
)

In [21]:
# Left join: keep every transistor-count row and attach ops-per-second values
# where the shared columns match.
pc_performance = pd.merge(transistors_per_year, ops_per_sec, how='left')

In [22]:
# Replace the long source headers with short column names.
pc_performance = pc_performance.rename(
    columns={
        'Transistors per microprocessor': 'transistor_count(thousand)',
        'Floating-Point Operations per Second (GFLOPS)': 'GFLOPS',
    }
)

In [23]:
# Rescale raw transistor counts to thousands. The cast to int truncates the
# fractional part (it does not round).
pc_performance['transistor_count(thousand)'] = (
    pc_performance['transistor_count(thousand)']
    .div(1000)
    .astype(int)
)

In [24]:
# 'Entity' and 'Code' are not needed in the cleaned table.
pc_performance = pc_performance.drop(columns=['Entity', 'Code'])

In [25]:
# Rows whose recorded death count is exactly zero.
zero_death_count = death_data[death_data['death_count'].eq(0)]

In [26]:
# Keep only rows with a non-zero death count, then discard any row that still
# contains a missing value. Filtering on the column directly (instead of
# NaN-masking the whole frame with DataFrame.isin and relying on dropna to
# remove the blanked rows) selects the same rows while leaving the column
# dtypes untouched, e.g. integer columns are not upcast to float.
death_data = death_data.loc[death_data['death_count'] != 0].dropna()

In [27]:
# Load the cost-of-computing table; the file's first column is used as the index.
cost_per_gflop = pd.read_csv('../data/cost_of_computing.csv', index_col = 0)

In [28]:
# Give the cost columns descriptive names and capitalise 'year' so it matches
# the 'Year' column of the tables it is merged with.
# NOTE(review): the displayed values look like '$0.02', so these columns appear
# to hold strings rather than numbers; convert before doing arithmetic on them.
cost_per_gflop = cost_per_gflop.rename(
    columns={
        'Unadjusted': 'unadjusted_cost_per_gflop',
        '2023[77]': 'inflation_adjusted_cost_per_gflop',
        'year': 'Year',
    }
)

In [29]:
# Left join: keep every cost row and attach the transistor/GFLOPS values where
# the shared columns match.
pc_performance = pd.merge(cost_per_gflop, pc_performance, how='left')

In [30]:
# Load the raw storage-cost table from the shared data directory.
storage_costs = pd.read_csv('../data/storage_costs.csv')

In [31]:
# 'Entity' and 'Code' are not needed in the cleaned table.
storage_costs = storage_costs.drop(columns=['Entity', 'Code'])

In [32]:
# Scale the memory price down by a factor of 1000; the column is labelled
# per-GB in the rename that follows.
# NOTE(review): presumably the source figure is per TB - confirm against the dataset.
storage_costs['Historical price of memory'] = (
    storage_costs['Historical price of memory'].div(1000)
)

In [33]:
# Replace the long source headers with short names that carry the unit.
storage_costs = storage_costs.rename(
    columns={
        'Historical price of memory': 'price_of_memory(GB)',
        'Historical price of flash memory': 'price_of_flash_memory(TB)',
        'Historical price of disk drives': 'price_of_disk_storage(TB)',
        'Historical price of solid-state drives': 'price_of_solid_state_storage(TB)',
    }
)

In [34]:
# Right join: every storage_costs row is kept; performance/cost columns are NaN
# for keys that have no match on the left side.
pc_performance = pd.merge(pc_performance, storage_costs, how='right')

In [35]:
# Show the combined performance / cost / storage table for a visual check.
pc_performance

Unnamed: 0,Year,unadjusted_cost_per_gflop,inflation_adjusted_cost_per_gflop,transistor_count(thousand),GFLOPS,price_of_memory(GB),price_of_flash_memory(TB),price_of_disk_storage(TB),price_of_solid_state_storage(TB)
0,1957,,,,,3.786967e+12,,,
1,1959,,,,,6.032806e+11,,6.747736e+10,
2,1960,,,,,4.588051e+10,,3.150364e+10,
3,1965,,,,,2.170492e+10,,,
4,1970,,,,,4.892936e+09,,1.731185e+09,
...,...,...,...,...,...,...,...,...,...
60,2022,$0.02,$0.02,,,1.861932e+00,,1.263745e+01,39.796032
61,2023,$0.01,$0.01,,,1.088437e+00,,1.061740e+01,25.906467
62,1956,,,,,,,8.759275e+10,
63,1964,,,,,,,2.936028e+10,


In [36]:
# Everything except the two SDI aggregate locations.
death_data_countries = death_data[
    ~death_data['location'].isin(['High SDI', 'Low SDI'])
]

In [37]:
# Only the two SDI aggregate locations ('High SDI' and 'Low SDI').
death_data_economic = death_data[
    death_data['location'].isin(['High SDI', 'Low SDI'])
]

In [38]:
# Show the High/Low SDI subset for a visual check.
death_data_economic

Unnamed: 0,location,sex,year,life_exp,population,cause,death_count,death rate(per 100k),prevelance rate(per 100k),lethality
136416,Low SDI,Both,1990,53.050923,5.013006e+08,Cardiovascular diseases,688215.591377,137.286015,3925.672344,3.497134
136417,Low SDI,Both,1990,53.050923,5.013006e+08,Skin and subcutaneous diseases,6532.615771,1.303133,27015.921674,0.004824
136418,Low SDI,Both,1990,53.050923,5.013006e+08,Transport injuries,126533.089972,25.240962,974.030877,2.591392
136419,Low SDI,Both,1990,53.050923,5.013006e+08,Neoplasms,241051.010315,48.085125,435.079755,11.052025
136420,Low SDI,Both,1990,53.050923,5.013006e+08,Musculoskeletal disorders,2709.371332,0.540468,10568.804436,0.005114
...,...,...,...,...,...,...,...,...,...,...
137755,High SDI,Both,2021,80.221240,1.094048e+09,Other non-communicable diseases,184630.333968,16.875894,64853.005818,0.026022
137756,High SDI,Both,2021,80.221240,1.094048e+09,Skin and subcutaneous diseases,26754.858667,2.445493,29786.095181,0.008210
137757,High SDI,Both,2021,80.221240,1.094048e+09,Transport injuries,108950.726358,9.958498,3440.010109,0.289490
137758,High SDI,Both,2021,80.221240,1.094048e+09,Substance use disorders,129350.838154,11.823144,3775.700152,0.313138


In [39]:
# Show the subset that excludes the High/Low SDI rows for a visual check.
death_data_countries

Unnamed: 0,location,sex,year,life_exp,population,cause,death_count,death rate(per 100k),prevelance rate(per 100k),lethality
0,Somalia,Both,1990,48.246310,7.938721e+06,Neglected tropical diseases and malaria,10062.248368,126.748990,76709.640404,0.165232
1,Somalia,Both,1990,48.246310,7.938721e+06,Chronic respiratory diseases,1950.540183,24.569956,9403.276758,0.261291
2,Somalia,Both,1990,48.246310,7.938721e+06,Transport injuries,1854.778588,23.363696,729.936509,3.200785
3,Somalia,Both,1990,48.246310,7.938721e+06,Unintentional injuries,3809.880275,47.991111,6103.958675,0.786229
4,Somalia,Both,1990,48.246310,7.938721e+06,Neoplasms,3889.030971,48.988132,289.944445,16.895696
...,...,...,...,...,...,...,...,...,...,...
136411,United Republic of Tanzania,Both,2021,63.459646,5.844785e+07,Self-harm and interpersonal violence,6566.657691,11.235071,4329.648973,0.259491
136412,United Republic of Tanzania,Both,2021,63.459646,5.844785e+07,Neglected tropical diseases and malaria,16565.634626,28.342589,32838.703120,0.086308
136413,United Republic of Tanzania,Both,2021,63.459646,5.844785e+07,Chronic respiratory diseases,7014.036120,12.000503,7909.477791,0.151723
136414,United Republic of Tanzania,Both,2021,63.459646,5.844785e+07,Digestive diseases,15852.513919,27.122492,21340.502553,0.127094


In [40]:
# Write the non-SDI subset to the cleaned-data directory, omitting the row index.
death_data_countries.to_csv('../cleaned_data/death_data_countries.csv', index = False)

In [41]:
# Write the High/Low SDI subset to the cleaned-data directory, omitting the row index.
death_data_economic.to_csv('../cleaned_data/death_data_economic.csv', index = False)

In [42]:
# Write the combined performance table to the cleaned-data directory, omitting the row index.
pc_performance.to_csv('../cleaned_data/pc_performance.csv', index = False)

In [100]:
# Number of rows recorded for each location in the non-SDI table, returned as
# a two-column frame (location, count). The dangling trailing '.' that made
# this cell a SyntaxError has been removed.
death_data_countries['location'].value_counts().reset_index()

Unnamed: 0,location,count
