# CS109 Project: Additional Data Cleaning
By Phillip Huang & Chris Chen

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
import pickle

from collections import Counter

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Part 1: Clean exercise data
The WHO provides two versions of the exercise data, with little description of the differences. Let's take a preliminary look.

In [2]:
# Both WHO exports are read the same way: skip the first row of the file and
# treat the literal string 'No data' as missing.
who_csv_options = {'skiprows': 1, 'na_values': 'No data'}
exer = pd.read_csv('data/exercise/data.csv', **who_csv_options)
exer_alt = pd.read_csv('data/exercise/alt.csv', **who_csv_options)

# Compare the rendered tables first, then the underlying raw values.
exer.head()
exer_alt.head()
exer.values
exer_alt.values

Unnamed: 0,Country,Year,Age Group,Both sexes,Female,Male,Both sexes.1,Female.1,Male.1
0,Afghanistan,2010,18+ years,,,,,,
1,Albania,2010,18+ years,,,,,,
2,Algeria,2010,18+ years,34.4 [30.5-38.5],41.2 [35.7-46.9],27.7 [22.3-33.6],32.5 [28.6-36.6],39.4 [33.9-45.1],25.8 [20.4-31.7]
3,Andorra,2010,18+ years,26.1 [11.6-66.5],29.7 [8.3-66.1],22.4 [6.9-60.3],30.9 [15.6-72.6],34.7 [10.2-69.8],27.2 [8.9-65.3]
4,Angola,2010,18+ years,,,,,,


Unnamed: 0,Country,Year,Age Group,Both sexes,Female,Male,Both sexes.1,Female.1,Male.1
0,Afghanistan,2010,18+ years,,,,,,
1,Albania,2010,18+ years,,,,,,
2,Algeria,2010,18+ years,34.4 [30.5-38.5],41.2 [35.7-46.9],27.7 [22.3-33.6],32.5 [28.6-36.6],39.4 [33.9-45.1],25.8 [20.4-31.7]
3,Andorra,2010,18+ years,26.1 [11.6-66.5],29.7 [8.3-66.1],22.4 [6.9-60.3],30.9 [15.6-72.6],34.7 [10.2-69.8],27.2 [8.9-65.3]
4,Angola,2010,18+ years,,,,,,


array([['Afghanistan', 2010, '18+  years', ..., nan, nan, nan],
       ['Albania', 2010, '18+  years', ..., nan, nan, nan],
       ['Algeria', 2010, '18+  years', ..., '32.5 [28.6-36.6]',
        '39.4 [33.9-45.1]', '25.8 [20.4-31.7]'],
       ..., 
       ['Yemen', 2010, '18+  years', ..., nan, nan, nan],
       ['Zambia', 2010, '18+  years', ..., '17.7 [4.3-48.7]',
        '20.6 [5.1-52.9]', '14.7 [3.9-46.4]'],
       ['Zimbabwe', 2010, '18+  years', ..., '19.7 [4.9-51]',
        '23.8 [6.1-57.1]', '15.3 [4.1-45.5]']], dtype=object)

array([['Afghanistan', 2010, ' 18+  years', ..., nan, nan, nan],
       ['Albania', 2010, ' 18+  years', ..., nan, nan, nan],
       ['Algeria', 2010, ' 18+  years', ..., '32.5 [28.6-36.6]',
        '39.4 [33.9-45.1]', '25.8 [20.4-31.7]'],
       ..., 
       ['Yemen', 2010, ' 18+  years', ..., nan, nan, nan],
       ['Zambia', 2010, ' 18+  years', ..., '17.7 [4.3-48.7]',
        '20.6 [5.1-52.9]', '14.7 [3.9-46.4]'],
       ['Zimbabwe', 2010, ' 18+  years', ..., '19.7 [4.9-51]',
        '23.8 [6.1-57.1]', '15.3 [4.1-45.5]']], dtype=object)

Looks like no substantial difference, except the standard data seems a bit cleaner. We'll just use that one.

Now, we will see if all the countries match up with the dependent variable Risk/Deaths per 100k.

In [3]:
# Pickle streams are binary: open with 'rb' (text mode can corrupt the stream
# on some platforms) and let the context manager close the handle.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('data/clean/risk.p', 'rb') as out:
    risk = pickle.load(out)
risk.head()

Unnamed: 0_level_0,2012,2000
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,31,33
Albania,19,24
Algeria,22,23
Angola,24,24
Argentina,17,20


In [4]:
# check if all risk countries in exercise data
# Build the lookup once instead of re-listing the column for every country.
exer_countries = set(exer['Country'])
for country in risk.index:
    if country not in exer_countries:
        # single-argument print() behaves the same on Python 2 and 3
        print(country)

In [5]:
# check if all exercise countries in risk data
for country in exer['Country']:
    if country not in risk.index:
        # single-argument print() behaves the same on Python 2 and 3
        print(country)

Andorra
Antigua and Barbuda
Cook Islands
Dominica
Grenada
Kiribati
Marshall Islands
Micronesia (Federated States of)
Monaco
Nauru
Niue
Palau
Saint Kitts and Nevis
Saint Lucia
Saint Vincent and the Grenadines
Samoa
San Marino
Sao Tome and Principe
Seychelles
Tonga
Tuvalu
Vanuatu


In [6]:
# show every country label in the risk data so spellings can be compared by eye
risk.index.tolist()

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 "Cote d'Ivoire",
 'Cabo Verde',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 "Democratic People's Republic of Korea",
 'Democratic Republic of the Congo',
 'Denmark',
 'Djibouti',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Hondu

All countries appear to be spelled the same way as in the risk data. Several countries appear to be excluded from the risk data. We will now drop them.

In [7]:
# Countries that appear in the exercise data but have no row in the risk data.
dropme = [country for country in exer['Country'] if country not in risk.index]

# Keep only the exercise rows whose country is not on the drop list.
clean_exer = exer[~exer['Country'].isin(dropme)]

# check: nothing should be printed if every remaining country is in risk
for country in clean_exer['Country']:
    if country not in risk.index:
        print('Fail')

clean_exer.head()

Unnamed: 0,Country,Year,Age Group,Both sexes,Female,Male,Both sexes.1,Female.1,Male.1
0,Afghanistan,2010,18+ years,,,,,,
1,Albania,2010,18+ years,,,,,,
2,Algeria,2010,18+ years,34.4 [30.5-38.5],41.2 [35.7-46.9],27.7 [22.3-33.6],32.5 [28.6-36.6],39.4 [33.9-45.1],25.8 [20.4-31.7]
4,Angola,2010,18+ years,,,,,,
6,Argentina,2010,18+ years,39.2 [13.5-76.6],42.7 [14.7-78.5],35.8 [13.2-76.1],40.1 [13.9-77],44.1 [15.6-79.3],35.7 [13.2-76]


The first "Both sexes, Female, Male" columns are the age-standardized estimates, while the second set are crude estimates. We should use the age-standardized estimates because they adjust for differences in population age structure, which makes the figures comparable across countries. We should also drop the Age Group and Year columns if they are the same for every row. Finally, we should drop the Male and Female columns because we only have data in the dependent variables for both sexes.

In [8]:
# If each of these shows a single distinct value, the column is constant
# across rows and can be dropped without losing information.
clean_exer['Year'].value_counts()
clean_exer['Age Group'].value_counts()

2010    172
Name: Year, dtype: int64

18+  years    172
Name: Age Group, dtype: int64

In [9]:
# Keep only the country and the first 'Both sexes' column.
# NOTE: this rebinds `exer`, which until this cell held the frame read from
# data/exercise/data.csv; earlier cells that use `exer` will see the reduced
# frame if re-run after this point.
exer = clean_exer[['Country', 'Both sexes']]
exer.head()

Unnamed: 0,Country,Both sexes
0,Afghanistan,
1,Albania,
2,Algeria,34.4 [30.5-38.5]
4,Angola,
6,Argentina,39.2 [13.5-76.6]


We can remove the confidence interval range from the values. If we need them in the future, we can grab them here independently.

In [10]:
def grab_num(full):
    """Extract the leading number from a 'value [low-high]' string.

    Parameters
    ----------
    full : str or null
        A cell such as '34.4 [30.5-38.5]', or a missing value.

    Returns
    -------
    float
        The first whitespace-separated token converted to float, or NaN
        when the input is null or contains no tokens (empty / blank string).

    Raises
    ------
    ValueError
        If the first token is not a valid float literal.
    """
    if pd.isnull(full):
        return np.nan
    tokens = full.split()
    # A blank string has no tokens; treat it as missing instead of failing
    # with IndexError on tokens[0].
    if not tokens:
        return np.nan
    return float(tokens[0])

# Reduce each 'Both sexes' cell to its leading number via grab_num, then
# rebuild the frame with just the country and that numeric column.
percent = exer['Both sexes'].apply(grab_num)
exer = pd.DataFrame({'Country': exer['Country'], 'Percent': percent})
exer.head()

Unnamed: 0,Country,Percent
0,Afghanistan,
1,Albania,
2,Algeria,34.4
4,Angola,
6,Argentina,39.2


Finally, let's make the country the index.

In [11]:
# NOTE: not re-runnable as written -- once 'Country' has become the index it
# is no longer a column, so running this cell a second time raises KeyError.
exer = exer.set_index('Country')
exer.head()

Unnamed: 0_level_0,Percent
Country,Unnamed: 1_level_1
Afghanistan,
Albania,
Algeria,34.4
Angola,
Argentina,39.2


Looks good! We're done with cleaning here.

In [12]:
# Write inside a context manager so the file is flushed and closed even if
# pickling fails part-way through.
with open('data/clean/exer.p', 'wb') as exer_file:
    pickle.dump(exer, exer_file)

### Part 2: Deal with missing data

Now we want to decide what to do with the NaNs. Let's first load in the rest of the data.

In [13]:
# Load the remaining cleaned datasets.  Pickle files are binary, so open them
# with 'rb', and use context managers so every handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('data/clean/deaths_100k.p', 'rb') as out:
    deaths = pickle.load(out)
with open('data/clean/crops.p', 'rb') as out:
    crops = pickle.load(out)
with open('data/clean/meat.p', 'rb') as out:
    meat = pickle.load(out)
with open('data/clean/doctors.p', 'rb') as out:
    doctors = pickle.load(out)
with open('data/clean/beds.p', 'rb') as out:
    beds = pickle.load(out)

In [14]:
# see if entire df contains NaNs
for name, frame in [('risk', risk), ('doctors', doctors), ('beds', beds), ('exer', exer)]:
    # first .any() reduces per column, second reduces to one boolean
    if frame.isnull().any().any():
        # one pre-formatted argument prints identically on Python 2 and 3
        print('{} CONTAINS NaNs'.format(name.upper()))
        # bare expression: shown only because ast_node_interactivity is "all"
        frame.head(2)

DOCTORS CONTAINS NaNs


Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.034844,,,,,0.063428,,,,,...,0.146,0.145,0.175,0.194,0.234,0.225,0.266,,,
Angola,0.067068,,,,,0.076062,,,,,...,,,0.166,,,,,,,


BEDS CONTAINS NaNs


Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Country_corrected,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.170627,,,,,,,,,,...,0.42,0.42,0.4,0.4,,0.5,,,,
Angola,2.061462,,,,,,,,,,...,,,,,,,,,,


EXER CONTAINS NaNs


Unnamed: 0_level_0,Percent
Country,Unnamed: 1_level_1
Afghanistan,
Albania,


In [15]:
# Each group maps item names to DataFrames; report a group as soon as any one
# of its frames contains a NaN, showing that frame as an example.
for group_name, frames in [('deaths', deaths), ('crops', crops), ('meat', meat)]:
    for df in frames.values():
        if df.isnull().any().any():
            # one pre-formatted argument prints identically on Python 2 and 3
            print('{} CONTAINS NaNs'.format(group_name.upper()))
            df.head(2)
            # one example per group is enough
            break

CROPS CONTAINS NaNs


Unnamed: 0_level_0,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bangladesh,0.04,0.03,0.04,0.04,0.04,0.03,0.04,0.04,0.04,0.03,...,0.04,0.04,0.04,0.04,0.04,0.04,0.05,0.05,0.04,0.05
Brazil,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.09,0.06,0.1,...,0.22,0.22,0.22,0.22,0.22,0.21,0.21,0.21,0.21,0.21


MEAT CONTAINS NaNs


Unnamed: 0_level_0,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Canada,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
Chile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


The dataframes DOCTORS, BEDS, and EXER contain NaNs. The dataframe groups CROPS and MEAT contain NaNs. Let's see if we can select a subset of data from each of these objects that is complete and simply drop the rest. Exercise is unusable, as we cannot ignore countries. We can check if any specific year has complete data for doctors and beds. For crops and meat, we can check if any crop or livestock items are individually complete, and then check each item for a specific year that's complete.

In [16]:
# check for complete years
def comp_yrs(df):
    """Return the labels of the columns of `df` that contain no nulls.

    Labels are returned in column order; the list is empty when every
    column has at least one null or when `df` has no columns.
    """
    return [label for label in df if not df[label].isnull().any()]

In [17]:
# complete years in doctors, beds, exer
comp_count = 0
total_count = 0
for label, frame in [('doctors', doctors), ('beds', beds), ('exer', exer)]:

    # increment total column count
    total_count += len(frame.columns)

    # print complete columns (the loop simply does nothing when there are none)
    for col in comp_yrs(frame):
        comp_count += 1
        # one pre-formatted argument prints identically on Python 2 and 3
        print('{} {}'.format(label, col))

# two spaces after the colon reproduce the original comma-separated print
print('Complete Columns in Doctors, Beds, Exercise:  {} / {}'.format(comp_count, total_count))

Complete Columns in Doctors, Beds, Exercise:  0 / 115


In [18]:
# complete dfs in crops, meat
comp_count = 0
total_count = 0
for frames in [crops, meat]:
    # each entry maps an item name to its country-by-year DataFrame
    for name, df in frames.items():
        total_count += len(df.columns)
        result = comp_yrs(df)
        if result:
            comp_count += len(result)
            # layout matches the original output: name, complete years, blank line
            print('{} \n{} \n'.format(name, result))

# two spaces after the colon reproduce the original comma-separated print
print('Complete Columns in Food Data:  {} / {}'.format(comp_count, total_count))

Ricebran Oil 
[1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] 

Millet and products 
[1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] 

Sesame seed 
[2006, 2007, 2008, 2009, 2010, 2011] 

Sugar non-centrifugal 
[1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011] 

Molasses 
[1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013] 

Sugar beet

The number of complete years is quite sparse throughout the datasets. Let's check how many complete rows there are. Perhaps dropping a few countries will make many more years complete.

In [19]:
# running NaN total per country, accumulated across datasets in a later cell
nans_by_country = Counter()

In [20]:
# organize data into lists
# `dicts`: objects iterated with .values() to reach their DataFrames
# `dfs`: objects that are passed to count_nans directly as DataFrames
dicts = [deaths, crops, meat]
dfs = [doctors, beds]

In [21]:
# count up the number of NaNs for each country in the df
def count_nans(df, counter):
    """Add each row's NaN count to `counter`, keyed by the row's index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels identify countries.
    counter : collections.Counter
        Updated in place; every index label in `df` gets an entry, including
        labels whose rows contain no NaNs (added with a count of 0).
    """
    # One vectorised pass over the frame instead of building a Series per
    # row with iterrows() and summing it in Python.
    nan_counts = df.isnull().sum(axis=1)
    for country, nan_count in zip(nan_counts.index, nan_counts.values):
        # int() keeps plain Python integers in the counter
        counter[country] += int(nan_count)

In [22]:
# get total NaN values across all dataframes
# Start from a fresh counter so that re-running this cell does not add the
# same counts on top of the previous run's totals.
nans_by_country = Counter()

for dictionary in dicts:
    # get to the dataframe
    for df in dictionary.values():
        count_nans(df, nans_by_country)

for df in dfs:
    count_nans(df, nans_by_country)

# the 20 countries with the most NaNs overall
nans_by_country.most_common()[:20]

[('Serbia', 4806),
 ('Montenegro', 4613),
 ('Belgium', 4268),
 ('Slovakia', 3650),
 ('Ethiopia', 3601),
 ('Czech Republic', 3559),
 ('The former Yugoslav republic of Macedonia', 3546),
 ('Slovenia', 3523),
 ('Kazakhstan', 3481),
 ('Russian Federation', 3469),
 ('Latvia', 3459),
 ('Republic of Moldova', 3447),
 ('Croatia', 3447),
 ('Lithuania', 3431),
 ('Estonia', 3431),
 ('Ukraine', 3425),
 ('Bosnia and Herzegovina', 3404),
 ('Belarus', 3393),
 ('Armenia', 3355),
 ('Georgia', 3347)]

That's a lot of NaNs. Let's now look at NaNs per country for each dataset, so we can get some granularity on which NaNs are more important.

In [23]:
# get countries with most nans for crops
# tally into a separate counter, then keep the top 20 as (country, count) pairs
crops_nan_counter = Counter()
for df in crops.values():
    count_nans(df, crops_nan_counter)

nans_crops = crops_nan_counter.most_common(20)
nans_crops

[('Serbia', 3290),
 ('Montenegro', 3243),
 ('Belgium', 2993),
 ('Ethiopia', 2592),
 ('Slovakia', 2516),
 ('Czech Republic', 2482),
 ('Slovenia', 2475),
 ('The former Yugoslav republic of Macedonia', 2442),
 ('Kazakhstan', 2442),
 ('Russian Federation', 2409),
 ('Croatia', 2409),
 ('Lithuania', 2376),
 ('Republic of Moldova', 2376),
 ('Estonia', 2376),
 ('Latvia', 2376),
 ('Bosnia and Herzegovina', 2343),
 ('Armenia', 2343),
 ('Ukraine', 2343),
 ('Belarus', 2343),
 ('Georgia', 2343)]

In [24]:
# get countries with most nans for meat
# tally into a separate counter, then keep the top 20 as (country, count) pairs
meat_nan_counter = Counter()
for df in meat.values():
    count_nans(df, meat_nan_counter)

nans_meat = meat_nan_counter.most_common(20)
nans_meat

[('Serbia', 1410),
 ('Montenegro', 1269),
 ('Belgium', 1230),
 ('Ukraine', 1023),
 ('Latvia', 1023),
 ('Slovakia', 1020),
 ('Czech Republic', 1020),
 ('Lithuania', 990),
 ('Slovenia', 990),
 ('Bosnia and Herzegovina', 990),
 ('Russian Federation', 990),
 ('Belarus', 990),
 ('The former Yugoslav republic of Macedonia', 990),
 ('Croatia', 990),
 ('Estonia', 990),
 ('Kazakhstan', 990),
 ('Azerbaijan', 957),
 ('Armenia', 957),
 ('Republic of Moldova', 957),
 ('Georgia', 957)]

In [25]:
# get countries with most nans for doctors
# tally into a separate counter, then keep the top 20 as (country, count) pairs
doctors_nan_counter = Counter()
count_nans(doctors, doctors_nan_counter)
nans_doctors = doctors_nan_counter.most_common(20)
nans_doctors

[('United States of America', 57),
 ('Yemen', 57),
 ('Iran (Islamic Republic of)', 57),
 ('Viet Nam', 57),
 ('Slovakia', 57),
 ('Bolivia (Plurinational State of)', 57),
 ('The former Yugoslav republic of Macedonia', 57),
 ('Republic of Moldova', 57),
 ('Bahamas', 57),
 ("Lao People's Democratic Republic", 57),
 ('Gambia', 57),
 ('United Republic of Tanzania', 57),
 ('Venezuela (Bolivarian Republic of)', 57),
 ('Republic of Korea', 57),
 ('Democratic Republic of the Congo', 57),
 ('United Kingdom of Great Britain and Northern Ireland', 57),
 ('Kyrgyzstan', 57),
 ('South Sudan', 57),
 ('Egypt', 57),
 ("Democratic People's Republic of Korea", 57)]

In [26]:
# get countries with most nans for beds
# tally into a separate counter, then keep the top 20 as (country, count) pairs
beds_nan_counter = Counter()
count_nans(beds, beds_nan_counter)
nans_beds = beds_nan_counter.most_common(20)
nans_beds

[('United States of America', 57),
 ('Yemen', 57),
 ('Iran (Islamic Republic of)', 57),
 ('Viet Nam', 57),
 ('Slovakia', 57),
 ('Bolivia (Plurinational State of)', 57),
 ('The former Yugoslav republic of Macedonia', 57),
 ('Republic of Moldova', 57),
 ('Bahamas', 57),
 ("Lao People's Democratic Republic", 57),
 ('Gambia', 57),
 ('United Republic of Tanzania', 57),
 ('Venezuela (Bolivarian Republic of)', 57),
 ('Republic of Korea', 57),
 ('Democratic Republic of the Congo', 57),
 ('United Kingdom of Great Britain and Northern Ireland', 57),
 ('Kyrgyzstan', 57),
 ('South Sudan', 57),
 ('Egypt', 57),
 ("Democratic People's Republic of Korea", 57)]

In [28]:
# get countries with most nans for exercise
# tally into a separate counter, then keep the top 20 as (country, count) pairs
exer_nan_counter = Counter()
count_nans(exer, exer_nan_counter)
nans_exer = exer_nan_counter.most_common(20)
nans_exer

[('Turkmenistan', 1),
 ('Oman', 1),
 ('Yemen', 1),
 ('Albania', 1),
 ('Azerbaijan', 1),
 ('Tajikistan', 1),
 ('Afghanistan', 1),
 ('Syrian Arab Republic', 1),
 ('Somalia', 1),
 ('Peru', 1),
 ('Cuba', 1),
 ('Montenegro', 1),
 ('Armenia', 1),
 ('Timor-Leste', 1),
 ('Bolivia (Plurinational State of)', 1),
 ('Bahrain', 1),
 ('Belarus', 1),
 ('Angola', 1),
 ('Brunei Darussalam', 1),
 ('Uganda', 1)]

Since exercise, beds, and doctors are really only supplemental data to this project, we should prioritize eliminating the NaNs appearing in the food data. We can drop varying numbers of the countries with the most NaNs in the food data, then check how those drops increase the number of complete columns. In essence, we trade data points for predictors. By trying out different numbers of countries to drop, we can see the tradeoff between these two quantities.

In [59]:
# Result table: row k holds the number of complete (NaN-free) columns, summed
# over all crop frames, after the first k countries in nans_crops are dropped.
cols_vs_dropped_crops = pd.DataFrame()
cols_vs_dropped_crops['complete columns'] = pd.Series()
# copy the container so pruned frames are stored here and the entries of
# `crops` are left as they were (frames are replaced, never edited in place)
crops_clean = crops.copy()
# row label for the result table: number of countries dropped so far
counter = 1
# for each country with the most NaNs
for country, count in nans_crops:
    # track the number of complete columns
    complete_columns = 0
    # for each dataframe in the dictionary
    for key, df in crops_clean.items():
        if country in df.index:
            # drops are cumulative: the pruned frame is stored back, so later
            # iterations start from it
            df = df.drop(country)
            crops_clean[key] = df
        # count the number of complete columns
        complete_columns += sum(pd.isnull(df).sum() == 0)
    cols_vs_dropped_crops.loc[counter] = complete_columns
    counter += 1

In [60]:
# index = number of countries dropped; value = complete columns across crop frames
cols_vs_dropped_crops

Unnamed: 0,complete columns
1,320.0
2,379.0
3,890.0
4,899.0
5,901.0
6,973.0
7,973.0
8,1004.0
9,1006.0
10,1037.0


In [61]:
# Result table: row k holds the number of complete (NaN-free) columns, summed
# over all meat frames, after the first k countries in nans_meat are dropped.
cols_vs_dropped_meat = pd.DataFrame()
cols_vs_dropped_meat['complete columns'] = pd.Series()
# copy the container so pruned frames are stored here and the entries of
# `meat` are left as they were (frames are replaced, never edited in place)
meat_clean = meat.copy()
# row label for the result table: number of countries dropped so far
counter = 1
# for each country with the most NaNs
for country, count in nans_meat:
    # track the number of complete columns
    complete_columns = 0
    # for each dataframe in the dictionary
    for key, df in meat_clean.items():
        if country in df.index:
            # drops are cumulative: the pruned frame is stored back, so later
            # iterations start from it
            df = df.drop(country)
            meat_clean[key] = df
        # count the number of complete columns
        complete_columns += sum(pd.isnull(df).sum() == 0)
    cols_vs_dropped_meat.loc[counter] = complete_columns
    counter += 1

In [62]:
# index = number of countries dropped; value = complete columns across meat frames
cols_vs_dropped_meat

Unnamed: 0,complete columns
1,122.0
2,122.0
3,332.0
4,332.0
5,363.0
6,363.0
7,364.0
8,364.0
9,364.0
10,364.0
