## Data Cleaning

Below CSV files cleaned and merged: 

* battle_deaths_by_country.csv
* gdp.csv
* POP.csv
* refugees_by_country_of_asylum.csv
* refugees_by_country_of_origin.csv

In [378]:
!ls

aid.csv
asylum_yr16.csv
battle_deaths_by_country.csv
battle_deaths_yr16.csv
countries.csv
data_cleaning_for_info_table.ipynb
gdp.csv
gdpClean.csv
origin_yr16.csv
POP.csv
population.csv
rawD
refugees_by_country_of_asylum.csv
refugees_by_country_of_origin.csv


In [379]:
import pandas as pd
import numpy as np

## Countries Data 

In [380]:
countries =  pd.read_csv("countries.csv", encoding='utf-8')
countries.head()

Unnamed: 0,0
0,Albania
1,Algeria
2,American Samoa
3,Andorra
4,Angola


## Population Data Clean Up

In [381]:
population =  pd.read_csv("POP.csv" ,encoding = "cp1252")
population.head()

Unnamed: 0.1,Unnamed: 0,Population 2016,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,,Ranking,,Economy,(thousands),,,,,,,
3,,,,,,,,,,,,
4,CHN,1,,China,1378665,,,,,,,


In [382]:
df = population.drop(['Unnamed: 0', 'Population 2016', 'Unnamed: 2', 'Unnamed: 5', 'Unnamed: 6', 
                 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], 
                axis =1, inplace = True)

In [383]:
df = population.dropna()
df.head()

Unnamed: 0,Unnamed: 3,Unnamed: 4
2,Economy,(thousands)
4,China,1378665
5,India,1324171
6,United States,323128
7,Indonesia,261115


In [384]:
new_df = df[:-12]
new_df.head()

Unnamed: 0,Unnamed: 3,Unnamed: 4
2,Economy,(thousands)
4,China,1378665
5,India,1324171
6,United States,323128
7,Indonesia,261115


In [385]:
new_df = new_df.drop([2])

In [386]:
new_df.index = np.arange(0, len(new_df))

In [387]:
new_df.rename(columns={'Unnamed: 3': 'Country Name', 'Unnamed: 4': 'Population [YR2016]'}, inplace=True)
new_df.head()

Unnamed: 0,Country Name,Population [YR2016]
0,China,1378665
1,India,1324171
2,United States,323128
3,Indonesia,261115
4,Brazil,207653


In [388]:
new_df.to_csv("population.csv", encoding='utf-8', index=False)

In [389]:
read = pd.read_csv("population.csv")
read.head()

Unnamed: 0,Country Name,Population [YR2016]
0,China,1378665
1,India,1324171
2,United States,323128
3,Indonesia,261115
4,Brazil,207653


## GDP Data Clean Up

In [390]:
gdp =  pd.read_csv("gdp.csv" ,encoding = "cp1252")
gdp.head()

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2015 [YR2015]
0,GDP (current US$),NY.GDP.MKTP.CD,Afghanistan,AFG,19215562178.9798
1,GDP (current US$),NY.GDP.MKTP.CD,Albania,ALB,11335264966.561
2,GDP (current US$),NY.GDP.MKTP.CD,Algeria,DZA,165874330876.321
3,GDP (current US$),NY.GDP.MKTP.CD,American Samoa,ASM,659000000.0
4,GDP (current US$),NY.GDP.MKTP.CD,Andorra,AND,2811489408.89431


In [391]:
gdp_df = gdp.dropna()
gdp_df.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,2015 [YR2015]
0,GDP (current US$),NY.GDP.MKTP.CD,Afghanistan,AFG,19215562178.9798


In [392]:
gdp_df.drop(['Series Name', 'Series Code', 'Country Code'], axis =1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [393]:
gdp_df.head()

Unnamed: 0,Country Name,2015 [YR2015]
0,Afghanistan,19215562178.9798
1,Albania,11335264966.561
2,Algeria,165874330876.321
3,American Samoa,659000000.0
4,Andorra,2811489408.89431


In [394]:
gdp_df.rename(columns={'2015 [YR2015]': 'GDP [YR2015]'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [395]:
gdp_df.head()

Unnamed: 0,Country Name,GDP [YR2015]
0,Afghanistan,19215562178.9798
1,Albania,11335264966.561
2,Algeria,165874330876.321
3,American Samoa,659000000.0
4,Andorra,2811489408.89431


In [396]:
gdp_df.to_csv("gdpClean.csv", encoding='utf-8', index=False)

## Clean the Battle Data

In [397]:
!ls

aid.csv
asylum_yr16.csv
battle_deaths_by_country.csv
battle_deaths_yr16.csv
countries.csv
data_cleaning_for_info_table.ipynb
gdp.csv
gdpClean.csv
origin_yr16.csv
POP.csv
population.csv
rawD
refugees_by_country_of_asylum.csv
refugees_by_country_of_origin.csv


In [398]:
read_battle = pd.read_csv("battle_deaths_by_country.csv")
read_battle.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],...,2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017]
0,Battle-related deaths (number of people),VC.BTL.DETH,Afghanistan,AFG,1478,3302,4276,4071,8937,5499,...,5552,6341,6864,7405,7719,8056,12285,17273,17980,..


In [399]:
read_battle.drop(['Series Name', 'Series Code', 'Country Code', '1990 [YR1990]', '1991 [YR1991]', 
                 '1992 [YR1992]', '1993 [YR1993]', '1994 [YR1994]', '1995 [YR1995]', '1996 [YR1996]',
                 '1997 [YR1997]', '1998 [YR1998]', '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', 
                 '2002 [YR2002]', '2003 [YR2003]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]', 
                 '2003 [YR2003]', '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
                 '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]', '2011 [YR2011]',
                 '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2017 [YR2017]'], 
                axis =1, inplace = True)

In [400]:
read_battle.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


In [401]:
read_battle_df = read_battle.dropna()
read_battle_df.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


In [402]:
read_battle_df.rename(columns={'2016 [YR2016]': 'Battle Deaths [YR2016]'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [403]:
read_battle_df = df.loc[0:216]
read_battle_df.head()

Unnamed: 0,Unnamed: 3,Unnamed: 4
2,Economy,(thousands)
4,China,1378665
5,India,1324171
6,United States,323128
7,Indonesia,261115


In [404]:
read_battle_df.to_csv("battle_deaths_yr16.csv", encoding='utf-8', index=False)

In [405]:
read_b_d_yr16 = pd.read_csv("battle_deaths_yr16.csv")
read_b_d_yr16.head()

Unnamed: 0,Unnamed: 3,Unnamed: 4
0,Economy,(thousands)
1,China,1378665
2,India,1324171
3,United States,323128
4,Indonesia,261115


## Clean Asylum Data

In [406]:
!ls

aid.csv
asylum_yr16.csv
battle_deaths_by_country.csv
battle_deaths_yr16.csv
countries.csv
data_cleaning_for_info_table.ipynb
gdp.csv
gdpClean.csv
origin_yr16.csv
POP.csv
population.csv
rawD
refugees_by_country_of_asylum.csv
refugees_by_country_of_origin.csv


In [407]:
read_asylum = pd.read_csv("refugees_by_country_of_asylum.csv")
read_asylum.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],...,2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017]
0,Refugee population by country or territory of ...,SM.POP.REFG,Afghanistan,AFG,50,38,60025,32132,19131,19605,...,37,37,6434,3009,16187,16863,300423,257554,59770,..


In [408]:
read_asylum.drop(['Series Name', 'Series Code', 'Country Code', '1990 [YR1990]', '1991 [YR1991]', 
                 '1992 [YR1992]', '1993 [YR1993]', '1994 [YR1994]', '1995 [YR1995]', '1996 [YR1996]',
                 '1997 [YR1997]', '1998 [YR1998]', '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', 
                 '2002 [YR2002]', '2003 [YR2003]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]', 
                 '2003 [YR2003]', '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
                 '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]', '2011 [YR2011]',
                 '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2017 [YR2017]'], 
                axis =1, inplace = True)

In [409]:
read_asylum.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,59770
1,Albania,111
2,Algeria,94220
3,American Samoa,..
4,Andorra,..


In [410]:
read_asylum_df = read_battle.dropna()
read_asylum_df.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


In [411]:
read_asylum_df.rename(columns={'2016 [YR2016]': 'Asylum [YR2016]'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [412]:
read_asylum_df = read_asylum_df.loc[0:216]
read_asylum_df.head()

Unnamed: 0,Country Name,Asylum [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


In [413]:
read_asylum_df.to_csv("asylum_yr16.csv", encoding='utf-8', index=False)

In [414]:
read_asylum_yr16 = pd.read_csv("asylum_yr16.csv")
read_asylum_yr16.head()

Unnamed: 0,Country Name,Asylum [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


## Clean Origin Data

In [415]:
!ls

aid.csv
asylum_yr16.csv
battle_deaths_by_country.csv
battle_deaths_yr16.csv
countries.csv
data_cleaning_for_info_table.ipynb
gdp.csv
gdpClean.csv
origin_yr16.csv
POP.csv
population.csv
rawD
refugees_by_country_of_asylum.csv
refugees_by_country_of_origin.csv


In [416]:
read_origin= pd.read_csv("refugees_by_country_of_origin.csv")
read_origin.head(1)

Unnamed: 0,Series Name,Series Code,Country Name,Country Code,1990 [YR1990],1991 [YR1991],1992 [YR1992],1993 [YR1993],1994 [YR1994],1995 [YR1995],...,2008 [YR2008],2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],2017 [YR2017]
0,Refugee population by country or territory of ...,SM.POP.REFG.OR,Afghanistan,AFG,6339095,6306301,4552153,3374576,2731169,2679133,...,2833128,2887123,3054709,2664436,2586152,2556502,2596270,2666305,2501410,..


In [417]:
read_origin.drop(['Series Name', 'Series Code', 'Country Code', '1990 [YR1990]', '1991 [YR1991]', 
                 '1992 [YR1992]', '1993 [YR1993]', '1994 [YR1994]', '1995 [YR1995]', '1996 [YR1996]',
                 '1997 [YR1997]', '1998 [YR1998]', '1999 [YR1999]', '2000 [YR2000]', '2001 [YR2001]', 
                 '2002 [YR2002]', '2003 [YR2003]', '2000 [YR2000]', '2001 [YR2001]', '2002 [YR2002]', 
                 '2003 [YR2003]', '2003 [YR2003]', '2004 [YR2004]', '2005 [YR2005]', '2006 [YR2006]',
                 '2007 [YR2007]', '2008 [YR2008]', '2009 [YR2009]', '2010 [YR2010]', '2011 [YR2011]',
                 '2012 [YR2012]', '2013 [YR2013]', '2014 [YR2014]', '2015 [YR2015]', '2017 [YR2017]'], 
                axis =1, inplace = True)

In [418]:
read_origin.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,2501410
1,Albania,11050
2,Algeria,3675
3,American Samoa,..
4,Andorra,..


In [419]:
read_origin_df = read_origin.dropna()
read_origin_df.head()

Unnamed: 0,Country Name,2016 [YR2016]
0,Afghanistan,2501410
1,Albania,11050
2,Algeria,3675
3,American Samoa,..
4,Andorra,..


In [420]:
read_origin_df.rename(columns={'2016 [YR2016]': 'Origin [YR2016]'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [421]:
read_origin_df = read_origin_df.loc[0:216]
read_origin_df.head()

Unnamed: 0,Country Name,Origin [YR2016]
0,Afghanistan,2501410
1,Albania,11050
2,Algeria,3675
3,American Samoa,..
4,Andorra,..


In [422]:
read_origin_df.to_csv("origin_yr16.csv", encoding='utf-8', index=False)

In [423]:
read_origin_yr16 = pd.read_csv("origin_yr16.csv")
read_origin_yr16.head()

Unnamed: 0,Country Name,Origin [YR2016]
0,Afghanistan,2501410
1,Albania,11050
2,Algeria,3675
3,American Samoa,..
4,Andorra,..


## Mergin Data
* GDP
* Population 
* Refugee Received 
* Refugee Origin

In [424]:
!ls

aid.csv
asylum_yr16.csv
battle_deaths_by_country.csv
battle_deaths_yr16.csv
countries.csv
data_cleaning_for_info_table.ipynb
gdp.csv
gdpClean.csv
origin_yr16.csv
POP.csv
population.csv
rawD
refugees_by_country_of_asylum.csv
refugees_by_country_of_origin.csv


In [425]:
read_origin = pd.read_csv("origin_yr16.csv")
read_origin.head()

Unnamed: 0,Country Name,Origin [YR2016]
0,Afghanistan,2501410
1,Albania,11050
2,Algeria,3675
3,American Samoa,..
4,Andorra,..


In [426]:
read_asylum = pd.read_csv("asylum_yr16.csv")
read_asylum.head()

Unnamed: 0,Country Name,Asylum [YR2016]
0,Afghanistan,17980
1,Albania,..
2,Algeria,86
3,American Samoa,..
4,Andorra,..


In [427]:
read_gdp = pd.read_csv("gdpClean.csv")
read_gdp.head()

Unnamed: 0,Country Name,GDP [YR2015]
0,Afghanistan,19215562178.9798
1,Albania,11335264966.561
2,Algeria,165874330876.321
3,American Samoa,659000000.0
4,Andorra,2811489408.89431


In [428]:
read_ppl = pd.read_csv("population.csv")
read_ppl.head()

Unnamed: 0,Country Name,Population [YR2016]
0,China,1378665
1,India,1324171
2,United States,323128
3,Indonesia,261115
4,Brazil,207653


In [429]:
country_info = pd.merge(read_gdp, read_ppl, on='Country Name')
country_info.head()

Unnamed: 0,Country Name,GDP [YR2015],Population [YR2016]
0,Afghanistan,19215562178.9798,34656
1,Albania,11335264966.561,2876
2,Algeria,165874330876.321,40606
3,American Samoa,659000000.0,56
4,Andorra,2811489408.89431,77


In [431]:
country_info_asylum = pd.merge(country_info, read_asylum, on='Country Name')
country_info_asylum.head()

Unnamed: 0,Country Name,GDP [YR2015],Population [YR2016],Asylum [YR2016]
0,Afghanistan,19215562178.9798,34656,17980
1,Albania,11335264966.561,2876,..
2,Algeria,165874330876.321,40606,86
3,American Samoa,659000000.0,56,..
4,Andorra,2811489408.89431,77,..


In [433]:
country_info_asylum_origin = pd.merge(country_info_asylum, read_origin, on='Country Name')
country_info_asylum_origin.head()

Unnamed: 0,Country Name,GDP [YR2015],Population [YR2016],Asylum [YR2016],Origin [YR2016]
0,Afghanistan,19215562178.9798,34656,17980,2501410
1,Albania,11335264966.561,2876,..,11050
2,Algeria,165874330876.321,40606,86,3675
3,American Samoa,659000000.0,56,..,..
4,Andorra,2811489408.89431,77,..,..


In [434]:
country_info_asylum_origin.to_csv("country_info.csv", encoding='utf-8', index=False)

In [435]:
read_country = pd.read_csv("country_info.csv")
read_country.head()

Unnamed: 0,Country Name,GDP [YR2015],Population [YR2016],Asylum [YR2016],Origin [YR2016]
0,Afghanistan,19215562178.9798,34656,17980,2501410
1,Albania,11335264966.561,2876,..,11050
2,Algeria,165874330876.321,40606,86,3675
3,American Samoa,659000000.0,56,..,..
4,Andorra,2811489408.89431,77,..,..
