In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
from os.path import isfile, join
from scipy import interpolate

In [2]:
mypath = 'Data'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [3]:
onlyfiles

['smoking-deaths-by-age.csv',
 'share-of-men-who-are-smoking.csv',
 'number-of-deaths-from-tobacco-smoking.csv',
 'number-of-total-daily-smokers.csv',
 'consumption-per-smoker-per-day.csv',
 'share-of-women-who-are-smoking.csv',
 'average-price-of-a-pack-of-cigarettes.csv']

In [4]:
dfs = [pd.read_csv("Data/" + x) for x in onlyfiles]

In [5]:
for df in dfs:
    print(df.columns.values)

['Entity' 'Code' 'Year' '70+ years old (deaths)'
 '15-49 years old (deaths)' '50-69 years old (deaths)']
['Entity' 'Code' 'Year' ' (% of adults)']
['Entity' 'Code' 'Year' 'Tobacco smoking']
['Entity' 'Code' 'Year'
 'Number of daily smokers - both (IHME, GHDx (2012))']
['Entity' 'Code' 'Year'
 'Cigarette consumption per smoker per day (cigarettes)']
['Entity' 'Code' 'Year' ' (% of adults)']
['Entity' 'Code' 'Year'
 'Indicator:Average -  cigarette price in international dollars (international-$)']


In [6]:
df_new = pd.DataFrame(columns= ['Entity','Code',"Year"])
for df in dfs:
    df_new = pd.merge(df_new,df,how = 'outer', on =['Entity','Code','Year'])

In [7]:
df_new

Unnamed: 0,Entity,Code,Year,70+ years old (deaths),15-49 years old (deaths),50-69 years old (deaths),(% of adults)_x,Tobacco smoking,"Number of daily smokers - both (IHME, GHDx (2012))",Cigarette consumption per smoker per day (cigarettes),(% of adults)_y,Indicator:Average - cigarette price in international dollars (international-$)
0,Afghanistan,AFG,1990,1246.238030,2053.685070,3497.160641,,6797.0,652030.0,5.800000,,
1,Afghanistan,AFG,1995,1974.935625,3094.630541,5519.810761,,10589.0,1028533.0,4.500000,,
2,Afghanistan,AFG,2000,2429.371353,3757.903014,6799.480998,,12987.0,1228925.0,3.700000,,
3,Afghanistan,AFG,2005,2848.213811,4537.789004,8483.815984,,15870.0,1504433.0,4.000000,,
4,Afghanistan,AFG,2006,2951.940806,4653.880863,8730.749271,,16337.0,1553745.0,4.100000,,
5,Afghanistan,AFG,2010,3306.430594,4953.975423,9416.096613,,17677.0,1743624.0,4.900000,,
6,Afghanistan,AFG,2016,4057.745075,5888.611941,11005.558639,,20952.0,,,,
7,Albania,ALB,1990,1118.952520,195.664903,976.852299,,2291.0,482850.0,16.400000,,
8,Albania,ALB,1995,1210.307920,207.313356,1059.088298,,2477.0,483593.0,16.100000,,
9,Albania,ALB,2000,1471.491695,288.469209,1164.872762,56.9,2925.0,461078.0,16.799999,11.6,


In [36]:
#trying to rename some of the columns
rename_df = df_new.rename(index= str, columns= {"Tobacco smoking":"Tobacco Smoking Deaths", 
                                                'Number of daily smokers - both (IHME, GHDx (2012))':'Number of Daily Smokers',
                                                'Cigarette consumption per smoker per day (cigarettes)': 'Cigarette Consumption per smoker per day'} )
rename_df

Unnamed: 0,Entity,Code,Year,70+ years old (deaths),15-49 years old (deaths),50-69 years old (deaths),(% of adults)_x,Tobacco Smoking Deaths,Number of Daily Smokers,Cigarette Consumption per smoker per day,(% of adults)_y,Indicator:Average - cigarette price in international dollars (international-$)
0,Afghanistan,AFG,1990,1246.238030,2053.685070,3497.160641,,6797.0,652030.0,5.800000,,
1,Afghanistan,AFG,1995,1974.935625,3094.630541,5519.810761,,10589.0,1028533.0,4.500000,,
2,Afghanistan,AFG,2000,2429.371353,3757.903014,6799.480998,,12987.0,1228925.0,3.700000,,
3,Afghanistan,AFG,2005,2848.213811,4537.789004,8483.815984,,15870.0,1504433.0,4.000000,,
4,Afghanistan,AFG,2006,2951.940806,4653.880863,8730.749271,,16337.0,1553745.0,4.100000,,
5,Afghanistan,AFG,2010,3306.430594,4953.975423,9416.096613,,17677.0,1743624.0,4.900000,,
6,Afghanistan,AFG,2016,4057.745075,5888.611941,11005.558639,,20952.0,,,,
7,Albania,ALB,1990,1118.952520,195.664903,976.852299,,2291.0,482850.0,16.400000,,
8,Albania,ALB,1995,1210.307920,207.313356,1059.088298,,2477.0,483593.0,16.100000,,
9,Albania,ALB,2000,1471.491695,288.469209,1164.872762,56.9,2925.0,461078.0,16.799999,11.6,


In [37]:
#tells us how many missing values there are per columnr
rename_df.isnull().sum()

Entity                                                                                0
Code                                                                                419
Year                                                                                  0
70+ years old (deaths)                                                             5519
15-49 years old (deaths)                                                           5519
50-69 years old (deaths)                                                           5519
 (% of adults)_x                                                                   6286
Tobacco Smoking Deaths                                                             5519
Number of Daily Smokers                                                             876
Cigarette Consumption per smoker per day                                            876
 (% of adults)_y                                                                   6271
Indicator:Average -  cigarette p

In [38]:
#gives the breakdown of the countries
rename_df['Entity'].value_counts()

China                                            36
Ethiopia                                         36
Chile                                            36
Serbia                                           36
Hungary                                          36
Turkey                                           36
Uganda                                           36
Ghana                                            36
Cambodia                                         36
Romania                                          36
Egypt                                            36
Honduras                                         36
Senegal                                          36
Singapore                                        36
Costa Rica                                       36
Pakistan                                         36
Dominican Republic                               36
Bosnia and Herzegovina                           36
France                                           36
Comoros     

In [11]:
##I think our best bet is to try and manually filter the dataset to select the values we want, by combing we skew our data
#and make it hard to decifer

In [39]:
#shows us the rows where we are missing values
rename_df[rename_df['Tobacco Smoking Deaths'].notnull()]

Unnamed: 0,Entity,Code,Year,70+ years old (deaths),15-49 years old (deaths),50-69 years old (deaths),(% of adults)_x,Tobacco Smoking Deaths,Number of Daily Smokers,Cigarette Consumption per smoker per day,(% of adults)_y,Indicator:Average - cigarette price in international dollars (international-$)
0,Afghanistan,AFG,1990,1246.238030,2053.685070,3497.160641,,6797.0,652030.0,5.800000,,
1,Afghanistan,AFG,1995,1974.935625,3094.630541,5519.810761,,10589.0,1028533.0,4.500000,,
2,Afghanistan,AFG,2000,2429.371353,3757.903014,6799.480998,,12987.0,1228925.0,3.700000,,
3,Afghanistan,AFG,2005,2848.213811,4537.789004,8483.815984,,15870.0,1504433.0,4.000000,,
4,Afghanistan,AFG,2006,2951.940806,4653.880863,8730.749271,,16337.0,1553745.0,4.100000,,
5,Afghanistan,AFG,2010,3306.430594,4953.975423,9416.096613,,17677.0,1743624.0,4.900000,,
6,Afghanistan,AFG,2016,4057.745075,5888.611941,11005.558639,,20952.0,,,,
7,Albania,ALB,1990,1118.952520,195.664903,976.852299,,2291.0,482850.0,16.400000,,
8,Albania,ALB,1995,1210.307920,207.313356,1059.088298,,2477.0,483593.0,16.100000,,
9,Albania,ALB,2000,1471.491695,288.469209,1164.872762,56.9,2925.0,461078.0,16.799999,11.6,


In [40]:
tobaccosmoke  = rename_df[['Tobacco Smoking Deaths','Entity','Year']].dropna()

In [None]:
#could pass dropna a subset of columns to drop based on if they are null
#by changing how = all or how = any can dicatate if we want to drop rows based on if all of the columns are null or just 1
df_new.dropna(subset= ['70+ years old (deaths)','15-49 years old (deaths)','50-69 years old (deaths)','Cigarette consumption per smoker per day (cigarettes)'],how = 'all')