In [23]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt

## CS418 Data Science Final Project

### Names: Abhi Shah, Brian De Villa,  Katherine Misyutina, Matthew Jankowski

#### Dataset Topic:
Child mortality for children under the age of 5. Compare different countries and diseases. Show differences between developed and developing countries.

#### Criteria:
In early April (exact date to be announced) you must submit your progress report. Your progress report must contain the following:

    1) An introduction to your data:
        * Data spec: describe your data. Include the format and any assumptions about your data, size of the dataset
        * A link to your full data in downloadable form, you can keep your data on Google Drive, Box, DropBox, GitHub, or personal website
        * A sample of your data ( n = 10 - 50)
        * A report of your data collection process
            * How did you collect your data
            * How did you clean your data
            * Mention any difficulties you faced in the beginning steps
2) A summary of challenges and observations you have made so far. 
A brief mention of your next steps and what you plan to do with your data as you move into the analysis (If you are already in the analysis phase you can mention that as well)
Group member duties





## Cleaning GDP Dataset

Import the CSV here (the CSV must be stored in the same directory as this .ipynb file)

In [24]:
# Load the raw GDP-per-capita CSV; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv("gdp-per-capita-worldbank.csv")

# Display the frame. A bare `df` renders pandas' truncated view (first and last
# rows), not only the first 5 rows.
df

Unnamed: 0,Entity,Code,Year,GDP per capita (int.-$) (constant 2011 international $)
0,Afghanistan,AFG,2002,1063.635574
1,Afghanistan,AFG,2003,1099.194507
2,Afghanistan,AFG,2004,1062.249360
3,Afghanistan,AFG,2005,1136.123214
4,Afghanistan,AFG,2006,1161.124889
...,...,...,...,...
6402,Zimbabwe,ZWE,2013,1929.765001
6403,Zimbabwe,ZWE,2014,1925.138698
6404,Zimbabwe,ZWE,2015,1912.280261
6405,Zimbabwe,ZWE,2016,1879.628119


In [25]:
# Shorten the verbose source headers. "Code" and "Year" already have the
# desired names, so only the two columns that actually change are listed.
df = df.rename(
    columns={
        "Entity": "Country",
        "GDP per capita (int.-$) (constant 2011 international $)": "GDP per Capita",
    }
)

df

Unnamed: 0,Country,Code,Year,GDP per Capita
0,Afghanistan,AFG,2002,1063.635574
1,Afghanistan,AFG,2003,1099.194507
2,Afghanistan,AFG,2004,1062.249360
3,Afghanistan,AFG,2005,1136.123214
4,Afghanistan,AFG,2006,1161.124889
...,...,...,...,...
6402,Zimbabwe,ZWE,2013,1929.765001
6403,Zimbabwe,ZWE,2014,1925.138698
6404,Zimbabwe,ZWE,2015,1912.280261
6405,Zimbabwe,ZWE,2016,1879.628119


In [26]:
# Drop the country-code column; it is not used in the analysis.
# errors="ignore" keeps this cell idempotent: re-running it after 'Code' has
# already been removed is a no-op instead of raising KeyError.
df = df.drop(columns=['Code'], errors='ignore')
df

Unnamed: 0,Country,Year,GDP per Capita
0,Afghanistan,2002,1063.635574
1,Afghanistan,2003,1099.194507
2,Afghanistan,2004,1062.249360
3,Afghanistan,2005,1136.123214
4,Afghanistan,2006,1161.124889
...,...,...,...
6402,Zimbabwe,2013,1929.765001
6403,Zimbabwe,2014,1925.138698
6404,Zimbabwe,2015,1912.280261
6405,Zimbabwe,2016,1879.628119


In [21]:
# Saves Dataset (Change directory when needed)
# df.to_csv (r'C:\Users\DrNoodles\Desktop\GitHub_Stuff\CS-418-Project\pnyb_files\Dataset Cleaning\gdp_per_capita_clean.csv', index = False)


In [22]:
# Testing the Imported CSV
#df = pd.read_csv("gdp_per_capita_clean.csv")
#df

Unnamed: 0,Country,Year,GDP per Capita
0,Afghanistan,2002,1063.635574
1,Afghanistan,2003,1099.194507
2,Afghanistan,2004,1062.249360
3,Afghanistan,2005,1136.123214
4,Afghanistan,2006,1161.124889
...,...,...,...
6402,Zimbabwe,2013,1929.765001
6403,Zimbabwe,2014,1925.138698
6404,Zimbabwe,2015,1912.280261
6405,Zimbabwe,2016,1879.628119


In [35]:

def createDataFrameForYear(myYear, data=None):
    """Return the rows of a GDP table whose 'Year' equals ``myYear``.

    Parameters
    ----------
    myYear
        Value compared with ``==`` against the 'Year' column.
    data : pandas.DataFrame, optional
        Table to filter; it must contain a 'Year' column. When omitted, the
        notebook-level ``df`` is used, so existing one-argument calls behave
        exactly as before.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index labels are
        kept). Empty when no row matches.
    """
    # Resolve the source lazily so the global `df` is only touched when the
    # caller did not supply a table explicitly.
    source = df if data is None else data
    yearMask = source['Year'] == myYear
    # .copy() detaches the result from `source`, so later edits cannot write
    # back into the original table.
    return source.loc[yearMask].copy()


# Snapshot years under study: 2000, 2005, 2010 and 2015
year1, year2, year3, year4 = (
    createDataFrameForYear(focusYear) for focusYear in (2000, 2005, 2010, 2015)
)

# Stack the snapshots pairwise, then merge the two halves
year1andYear2 = pd.concat([year1, year2], ignore_index=True)
year3andYear4 = pd.concat([year3, year4], ignore_index=True)

# All observed years in a single frame (still grouped year by year)
observedYearsUnsorted = pd.concat(
    [year1andYear2, year3andYear4],
    ignore_index=True,
)

# Order the rows by country, then year, with a fresh 0..n-1 index
observedYears = observedYearsUnsorted.sort_values(
    by=['Country', 'Year'],
    ignore_index=True,
)
observedYears



Unnamed: 0,Country,Year,GDP per Capita
0,Afghanistan,2005,1136.123214
1,Afghanistan,2010,1614.255001
2,Afghanistan,2015,1809.016488
3,Albania,2000,5668.574777
4,Albania,2005,7733.006933
...,...,...,...
933,Zambia,2015,3627.202041
934,Zimbabwe,2000,2583.490990
935,Zimbabwe,2005,1662.485685
936,Zimbabwe,2010,1474.877128


In [89]:

# Check Observed Years for Specified Countries
def createDataFrameForCountry(myCountry, data=None):
    """Return the rows of a GDP table whose 'Country' equals ``myCountry``.

    Parameters
    ----------
    myCountry
        Value compared with ``==`` against the 'Country' column.
    data : pandas.DataFrame, optional
        Table to filter; it must contain a 'Country' column. When omitted, the
        notebook-level ``observedYears`` is used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index labels are
        kept). Empty when no row matches.
    """
    # Resolve the source lazily so the global `observedYears` is only touched
    # when the caller did not supply a table explicitly.
    source = observedYears if data is None else data
    countryMask = source['Country'] == myCountry   # Observe specified country
    # .copy() detaches the result from `source`, so later edits cannot write
    # back into the original table.
    return source.loc[countryMask].copy()

# Observed Countries
countryList = ['India', 'Philippines', 'Bangladesh', 'China', 'Saudi Arabia', 'Poland', 'Russia', 'Germany', 'Ukraine', 'Serbia', 'Albania', 'Ecuador', 'Colombia', 'Brazil', 'Chile', 'Uganda', 'Kenya', 'Ethiopia', 'Morocco', 'South Africa', 'Nigeria', 'Burundi', 'United States', 'Canada', 'Mexico', 'Dominican Republic', 'Guatemala', 'Haiti', 'Australia', 'New Zealand', 'Solomon Islands', 'Fiji']

# Build one frame per country and concatenate them in a single call.
# Growing a frame with pd.concat inside a loop re-copies every accumulated row
# on each pass, and seeding it with an empty DataFrame relies on concat's
# handling of empty entries, which recent pandas versions deprecate.
countryFrames = [createDataFrameForCountry(country) for country in countryList]
observedCountriesDF = pd.concat(countryFrames, ignore_index=True)

# Sort the data by country and year (resetting the index)
observedCountriesDF = observedCountriesDF.sort_values(by=['Country', 'Year'], ignore_index=True)
observedCountriesDF


Unnamed: 0,Country,Year,GDP per Capita
0,Albania,2000,5668.574777
1,Albania,2005,7733.006933
2,Albania,2010,9927.181841
3,Albania,2015,10970.452245
4,Australia,2000,35377.729552
...,...,...,...
123,Ukraine,2015,7464.939834
124,United States,2000,45986.052710
125,United States,2005,49762.237901
126,United States,2010,49374.178885


In [91]:
# Save Curated Dataset
# Written with a relative path (resolved against the kernel's working directory)
# instead of a machine-specific absolute path, so the cell runs for every
# collaborator. index=False keeps the row index out of the CSV.
observedCountriesDF.to_csv("gdp_per_capita_clean.csv", index=False)
