In [1]:
from flask import Flask, render_template, jsonify, redirect
import pymongo
from flask_pymongo import PyMongo
from pymongo import MongoClient
import numpy as np
import pandas as pd
import datetime as dt
import pandas as pd
from pandas import read_html
import json
import pprint

# Reflect Tables into SQLAlchemy ORM

In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
import sqlite3

# Connecting to the relational database
## Source: sqlite database from Kaggle Website

In [3]:
# Path to sqlite
database_path = "../Data/wdi_kaggle.sqlite"
engine = create_engine(f"sqlite:///{database_path}")
conn=engine.connect()

#get table names from database
for table_name in inspect(engine).get_table_names():
   print(table_name)

Country
CountryNotes
Footnotes
Indicators
Series
SeriesNotes


## Tables and exporting them to a Pandas DataFrame

In [4]:
Country_df=pd.read_sql('SELECT CountryCode, Region, IncomeGroup FROM Country',conn)
Indicators_df=pd.read_sql('SELECT * FROM Indicators',conn)
Series_df=pd.read_sql('SELECT SeriesCode, Topic, LongDefinition, AggregationMethod, LimitationsAndExceptions, Source, StatisticalConceptAndMethodology FROM Series',conn)

#### We realized that there are two codes (IndicatorCode in Indicator table and SeriesCode in Series table). We needed to confirm that these two codes are exactly the same and that there is no difference between them (i.e., diff_Ind_Series is Null), then we merge Series and Indicator tables based on this common column.

In [5]:
#find number of indicator and series codes
Indicators_df["IndicatorCode"].nunique()
Series_df["SeriesCode"].nunique()

1345

In [6]:
#confirm that there are no differences between indicator and series codes from both tables
series = set(Series_df.SeriesCode)
diff_Ind_Series = [x for x in Indicators_df.IndicatorCode if x not in series]
diff_Ind_Series

[]

### Now, we merge three DataFrames

In [7]:
Ind_Country=Indicators_df.merge(Country_df, left_on='CountryCode', right_on='CountryCode')

In [8]:
Ind_Country_Series=Ind_Country.merge(Series_df, left_on='IndicatorCode', right_on='SeriesCode')

####### Other option: Indictors = engine.execute('SELECT * FROM Indicators join Country on Indicators.CountryCode=Country.CountryCode').fetchall()

In [9]:
Ind_Country_Series.drop(['SeriesCode'],axis=1)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,Region,IncomeGroup,Topic,LongDefinition,AggregationMethod,LimitationsAndExceptions,Source,StatisticalConceptAndMethodology
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,1.335609e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
1,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1961,1.341644e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
2,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1962,1.348610e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
3,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1963,1.345048e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
4,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1964,1.341035e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
5,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1965,1.335682e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
6,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1966,1.326774e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
7,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1967,1.316725e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
8,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1968,1.292034e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...
9,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1969,1.267538e+02,,,Health: Reproductive health,Adolescent fertility rate is the number of bir...,Weighted average,,"United Nations Population Division, World Popu...",Reproductive health is a state of physical and...


## Move DataFrame to Mongo DB

#### First, we tried to directly send the dataframe as a dictionary to the Mongodb. However, we faced the memory issure (MemoryError below). So, we decided to turn the original Pandas Dataframe into 'ns' chuncks and feed them to the Mongodb

In [10]:
#change pandas dataframe to dictionary
Ind_Country_Series.to_dict("records")

[{'CountryName': 'Arab World',
  'CountryCode': 'ARB',
  'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)',
  'IndicatorCode': 'SP.ADO.TFRT',
  'Year': 1960,
  'Value': 133.56090740552298,
  'Region': '',
  'IncomeGroup': '',
  'SeriesCode': 'SP.ADO.TFRT',
  'Topic': 'Health: Reproductive health',
  'LongDefinition': 'Adolescent fertility rate is the number of births per 1,000 women ages 15-19.',
  'AggregationMethod': 'Weighted average',
  'LimitationsAndExceptions': '',
  'Source': 'United Nations Population Division, World Population Prospects.',
  'StatisticalConceptAndMethodology': 'Reproductive health is a state of physical and mental well-being in relation to the reproductive system and its functions and processes. Means of achieving reproductive health include education and services during pregnancy and childbirth, safe and effective contraception, and prevention and treatment of sexually transmitted diseases. Complications of pregnancy and childb

In [11]:
#connect to mongodb database

#app = Flask(__name__)
#mongo = PyMongo(app, uri="mongodb://localhost:27017/WDI")

client = MongoClient('mongodb://localhost:27017/')
dbmongo = client.World_Development_Indicator

In [12]:
fn=0
ln=len(Ind_Country_Series)

In [13]:
ln

5656458

In [14]:
# import to Mongo DB in chunks 

Ind_Country_Series_section=Ind_Country_Series[fn:ln]
nc=100

def chunk(df,x):
    return [ df[i::x] for i in range(x) ]
 
chunks = chunk(Ind_Country_Series_section, nc)

In [15]:
chunks

[                                            CountryName CountryCode  \
 0                                            Arab World         ARB   
 100                              Caribbean small states         CSS   
 200             East Asia & Pacific (all income levels)         EAS   
 300                                           Euro area         EMU   
 400             Europe & Central Asia (developing only)         ECA   
 500            Fragile and conflict affected situations         FCS   
 600              Heavily indebted poor countries (HIPC)         HPC   
 700                                High income: nonOECD         NOC   
 800       Latin America & Caribbean (all income levels)         LCN   
 900        Least developed countries: UN classification         LDC   
 1000                                         Low income         LIC   
 1100     Middle East & North Africa (all income levels)         MEA   
 1200       Middle East & North Africa (developing only)        

In [16]:
col=dbmongo['WDI_general']

#b=col.insert_many(chunks[x].to_dict(orient='records') for x in range(nc))
for count,x in enumerate(range(nc)):
    a=chunks[x].to_dict(orient='records') 
    col.insert_many(a)
    print(f"chunk={count}")

chunk=0
chunk=1
chunk=2
chunk=3
chunk=4
chunk=5
chunk=6
chunk=7
chunk=8
chunk=9
chunk=10
chunk=11
chunk=12
chunk=13
chunk=14
chunk=15
chunk=16
chunk=17
chunk=18
chunk=19
chunk=20
chunk=21
chunk=22
chunk=23
chunk=24
chunk=25
chunk=26
chunk=27
chunk=28
chunk=29
chunk=30
chunk=31
chunk=32
chunk=33
chunk=34
chunk=35
chunk=36
chunk=37
chunk=38
chunk=39
chunk=40
chunk=41
chunk=42
chunk=43
chunk=44
chunk=45
chunk=46
chunk=47
chunk=48
chunk=49
chunk=50
chunk=51
chunk=52
chunk=53
chunk=54
chunk=55
chunk=56
chunk=57
chunk=58
chunk=59
chunk=60
chunk=61
chunk=62
chunk=63
chunk=64
chunk=65
chunk=66
chunk=67
chunk=68
chunk=69
chunk=70
chunk=71
chunk=72
chunk=73
chunk=74
chunk=75
chunk=76
chunk=77
chunk=78
chunk=79
chunk=80
chunk=81
chunk=82
chunk=83
chunk=84
chunk=85
chunk=86
chunk=87
chunk=88
chunk=89
chunk=90
chunk=91
chunk=92
chunk=93
chunk=94
chunk=95
chunk=96
chunk=97
chunk=98
chunk=99


### At this point, we concluded that thte Jupyter notebook cannot export the very large size dataframe into the Mongodb. Rather, we started transferring the whole code into a explicit Python file (Scrape_WDI.py).

## Extracting and Transforming from second Data Source

In [17]:
# Storing filepath in a variable
second_data = "../Data/API_EN.csv"

In [18]:
# Reading the data
second_data_df = pd.read_csv(second_data, skiprows=4)
second_data_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Aruba,ABW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,24.670529,24.505835,13.155542,8.351294,8.408363,,,,,
1,Afghanistan,AFG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.04606,0.053604,0.073765,0.074233,0.086292,0.101467,...,0.293837,0.412017,0.350371,0.315602,0.299445,,,,,
2,Angola,AGO,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,0.201003,0.191528,...,1.243406,1.252789,1.330843,1.254617,1.291328,,,,,
3,Albania,ALB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,1.111742,1.166099,...,1.578574,1.803715,1.692908,1.749211,1.978763,,,,,
4,Andorra,AND,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,6.122595,5.86713,5.916597,5.900753,5.83217,,,,,


In [19]:
# Dropping columns
new_table_df = second_data_df.drop(columns=['Unnamed: 63'])
new_table_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,ABW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,25.915833,24.670529,24.505835,13.155542,8.351294,8.408363,,,,
1,Afghanistan,AFG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.04606,0.053604,0.073765,0.074233,0.086292,0.101467,...,0.241723,0.293837,0.412017,0.350371,0.315602,0.299445,,,,
2,Angola,AGO,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,0.201003,0.191528,...,1.232495,1.243406,1.252789,1.330843,1.254617,1.291328,,,,
3,Albania,ALB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,1.111742,1.166099,...,1.4956,1.578574,1.803715,1.692908,1.749211,1.978763,,,,
4,Andorra,AND,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,6.121652,6.122595,5.86713,5.916597,5.900753,5.83217,,,,


In [20]:
new_table_df.columns

Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018'],
      dtype='object')

In [21]:
# Merging tables
result_one = pd.merge(Country_df, new_table_df, left_on='CountryCode', right_on="Country Code")
result_one.drop(columns="Country Code")

Unnamed: 0,CountryCode,Region,IncomeGroup,Country Name,Indicator Name,Indicator Code,1960,1961,1962,1963,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,AFG,South Asia,Low income,Afghanistan,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.046060,0.053604,0.073765,0.074233,...,0.241723,0.293837,0.412017,0.350371,0.315602,0.299445,,,,
1,ALB,Europe & Central Asia,Upper middle income,Albania,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,...,1.495600,1.578574,1.803715,1.692908,1.749211,1.978763,,,,
2,DZA,Middle East & North Africa,Upper middle income,Algeria,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.553764,0.531810,0.484954,0.452824,...,3.423011,3.299704,3.291376,3.460266,3.507310,3.717410,,,,
3,ASM,East Asia & Pacific,Upper middle income,American Samoa,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,,,,,,,,,,
4,AGO,Sub-Saharan Africa,Upper middle income,Angola,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,...,1.232495,1.243406,1.252789,1.330843,1.254617,1.291328,,,,
5,ATG,Latin America & Caribbean,High income: nonOECD,Antigua and Barbuda,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.662643,0.849084,1.796794,1.446821,...,5.446757,5.539568,5.363407,5.418447,5.360453,5.377649,,,,
6,ARB,,,Arab World,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.643689,0.685151,0.760855,0.874941,...,4.542151,4.615758,4.537755,4.813631,4.650474,4.860234,,,,
7,ARG,Latin America & Caribbean,High income: nonOECD,Argentina,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,2.367473,2.442616,2.522392,2.316356,...,4.410890,4.558500,4.600291,4.569384,4.462904,4.746797,,,,
8,ARM,Europe & Central Asia,Lower middle income,Armenia,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,1.509412,1.465622,1.710071,1.976060,1.899712,1.902759,,,,
9,ABW,Latin America & Caribbean,High income: nonOECD,Aruba,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,25.915833,24.670529,24.505835,13.155542,8.351294,8.408363,,,,


In [22]:
# Saving the result in json
import json
result_one_dict = json.loads(result_one.to_json()).values()

In [23]:
result_one_dict

dict_values([{'0': 'AFG', '1': 'ALB', '2': 'DZA', '3': 'ASM', '4': 'AGO', '5': 'ATG', '6': 'ARB', '7': 'ARG', '8': 'ARM', '9': 'ABW', '10': 'AUS', '11': 'AUT', '12': 'AZE', '13': 'BHR', '14': 'BGD', '15': 'BRB', '16': 'BLR', '17': 'BEL', '18': 'BLZ', '19': 'BEN', '20': 'BMU', '21': 'BTN', '22': 'BOL', '23': 'BIH', '24': 'BWA', '25': 'BRA', '26': 'BRN', '27': 'BGR', '28': 'BFA', '29': 'BDI', '30': 'CPV', '31': 'KHM', '32': 'CMR', '33': 'CAN', '34': 'CSS', '35': 'CYM', '36': 'CAF', '37': 'CEB', '38': 'TCD', '39': 'CHI', '40': 'CHL', '41': 'CHN', '42': 'COL', '43': 'COM', '44': 'COG', '45': 'CRI', '46': 'CIV', '47': 'HRV', '48': 'CUB', '49': 'CUW', '50': 'CYP', '51': 'CZE', '52': 'PRK', '53': 'DNK', '54': 'DJI', '55': 'DMA', '56': 'DOM', '57': 'EAS', '58': 'EAP', '59': 'ECU', '60': 'EGY', '61': 'SLV', '62': 'GNQ', '63': 'ERI', '64': 'EST', '65': 'ETH', '66': 'EMU', '67': 'ECS', '68': 'ECA', '69': 'EUU', '70': 'FRO', '71': 'FJI', '72': 'FIN', '73': 'FCS', '74': 'FRA', '75': 'PYF', '76': 'G

## Loading second data in MongoDB

In [24]:
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [25]:
# Defining the database  and in Mongo
db = client.World_Development_Indicator
# Declaring the collection
carbon_dioxide_col = db.carbon_dioxide

In [26]:
# Inserting data
carbon_dioxide_col.insert_many(result_one.to_dict('records'))
client.close()

## Loading third data in MongoDB from csv. 
#### Original data was scraped from two tables in WDI and transformed using Pandas and finally loaded in MongoDB. 

### Scrape  websites for data

In [27]:
# Scrape from WDI website for global unemployment data

url="http://wdi.worldbank.org/table/2.5#"
tables = pd.read_html(url)
print(f"There are {len(tables)} dataframes in the tables")

There are 3 dataframes in the tables


In [28]:
# Check there dataframes, the first one is empty
tables[0]

Unnamed: 0,0,1,2,3,4
0,,,,World Development Indicators:,


In [29]:
# The second one 
tables[1]

Unnamed: 0_level_0,Unnamed: 0_level_0,"Unemployment, male (% of male labor force) (modeled ILO estimate)","Unemployment, male (% of male labor force) (modeled ILO estimate)","Unemployment, female (% of female labor force) (modeled ILO estimate)","Unemployment, female (% of female labor force) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)",Unemployment with basic education (% of total labor force with basic education),Unemployment with intermediate education (% of total labor force with intermediate education),Unemployment with advanced education (% of total labor force with advanced education)
Unnamed: 0_level_1,Unnamed: 0_level_1.1,2000,2017,2000,2017,2000,2017,2000,2017,2014-17,2014-17,2014-17


In [31]:
# We will use tables[1] as dataframe header
columns = [list(i) for i in list(tables[1])]
columns

[['Unnamed: 0_level_0', 'Unnamed: 0_level_1'],
 ['Unemployment, male (% of male labor force) (modeled ILO estimate)', '2000'],
 ['Unemployment, male (% of male labor force) (modeled ILO estimate)', '2017'],
 ['Unemployment, female (% of female labor force) (modeled ILO estimate)',
  '2000'],
 ['Unemployment, female (% of female labor force) (modeled ILO estimate)',
  '2017'],
 ['Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)',
  '2000'],
 ['Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate)',
  '2017'],
 ['Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)',
  '2000'],
 ['Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate)',
  '2017'],
 ['Unemployment with basic education (% of total labor force with basic education)',
  '2014-17'],
 ['Unemployment with intermediate education (% of total labor force with intermediate education)',
  '2014-17'],
 ['Un

In [32]:
# Moderate the columns
columns[0] ="country name"
for i in range(1,len(columns)):
    columns[i] = ",".join(columns[i])

In [33]:
# The third dataframe
tables[2].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,Afghanistan,3.8,1.1,3.0,2.5,5.6,2.1,5.2,3.8,12.3,16.2,15.5
1,Albania,17.4,14.6,18.2,12.6,30.0,33.8,26.6,27.5,13.8,20.4,19.1
2,Algeria,29.8,10.0,29.7,21.1,51.3,26.6,47.1,45.5,..,..,..
3,American Samoa,..,..,..,..,..,..,..,..,..,..,..
4,Andorra,..,..,..,..,..,..,..,..,..,..,..


In [34]:
# Create unemplyment_df 
unemplyment_df= tables[2]
unemplyment_df.columns=columns

In [35]:
# Scrape table(Education Input) from WDI website
url = "http://wdi.worldbank.org/table/2.7"
tables = pd.read_html(url)
print(f"There are {len(tables)} dataframes in the tables")

There are 3 dataframes in the tables


In [36]:
# The first dataset is empty
tables[0]

Unnamed: 0,0,1,2,3,4
0,,,,World Development Indicators:,


In [37]:
# The second dataset 
tables[1] 

Unnamed: 0_level_0,Unnamed: 0_level_0,Government expenditure per student,Government expenditure per student,Government expenditure per student,Government expenditure per student,Government expenditure per student,Government expenditure per student,Government expenditure on education,Government expenditure on education,Trained teachers,Trained teachers,Pupil-teacher ratio,Pupil-teacher ratio
Unnamed: 0_level_1,Unnamed: 0_level_1,Primary,Primary,Secondary,Secondary,Tertiary,Tertiary,Unnamed: 7_level_1,Unnamed: 8_level_1,Primary,Secondary,Primary,Secondary
Unnamed: 0_level_2,Unnamed: 0_level_2,% of GDP per capita,% of GDP per capita,% of GDP per capita,% of GDP per capita,% of GDP per capita,% of GDP per capita,% of GDP,% of total government expenditure,% of total,% of total,pupils per teacher,pupils per teacher
Unnamed: 0_level_3,Unnamed: 0_level_3.1,1999,2017,1999,2017,1999,2017,2017,2017,2017,2017,2017,2017


In [40]:
# We will use tables[1] as dataframe header
columns = [list(i) for i in list(tables[1])]


In [41]:
# Modefiled the columns
columns[0] ="country name"
columns[7].remove(columns[7][1])
columns[8].remove(columns[8][1])
for i in range(1,len(columns)):
    columns[i] = ",".join(columns[i])


In [42]:
# The third dataframes 
tables[2].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Afghanistan,..,9.7,..,10.7,..,..,3.9,15.7,..,..,44,39
1,Albania,..,29.8,..,9.8,..,14.8,4.0,13.6,..,..,18,12
2,Algeria,12.2,..,21.2,..,..,..,..,..,100.0,..,24,..
3,American Samoa,..,..,..,..,..,..,..,..,..,..,..,..
4,Andorra,..,12.7,..,13.6,..,23.7,3.2,..,100.0,100.0,11,8


In [43]:
# Create education_input_df
education_input_df = tables[2]
education_input_df.columns=columns

In [44]:
education_input_df.head()

Unnamed: 0,country name,"Government expenditure per student,Primary,% of GDP per capita,1999","Government expenditure per student,Primary,% of GDP per capita,2017","Government expenditure per student,Secondary,% of GDP per capita,1999","Government expenditure per student,Secondary,% of GDP per capita,2017","Government expenditure per student,Tertiary,% of GDP per capita,1999","Government expenditure per student,Tertiary,% of GDP per capita,2017","Government expenditure on education,% of GDP,2017","Government expenditure on education,% of total government expenditure,2017","Trained teachers,Primary,% of total,2017","Trained teachers,Secondary,% of total,2017","Pupil-teacher ratio,Primary,pupils per teacher,2017","Pupil-teacher ratio,Secondary,pupils per teacher,2017"
0,Afghanistan,..,9.7,..,10.7,..,..,3.9,15.7,..,..,44,39
1,Albania,..,29.8,..,9.8,..,14.8,4.0,13.6,..,..,18,12
2,Algeria,12.2,..,21.2,..,..,..,..,..,100.0,..,24,..
3,American Samoa,..,..,..,..,..,..,..,..,..,..,..,..
4,Andorra,..,12.7,..,13.6,..,23.7,3.2,..,100.0,100.0,11,8


In [45]:
# Combine two dataset
education_unemployment_df = education_input_df.merge(unemplyment_df, on='country name',how="outer")
education_unemployment_df.head()

Unnamed: 0,country name,"Government expenditure per student,Primary,% of GDP per capita,1999","Government expenditure per student,Primary,% of GDP per capita,2017","Government expenditure per student,Secondary,% of GDP per capita,1999","Government expenditure per student,Secondary,% of GDP per capita,2017","Government expenditure per student,Tertiary,% of GDP per capita,1999","Government expenditure per student,Tertiary,% of GDP per capita,2017","Government expenditure on education,% of GDP,2017","Government expenditure on education,% of total government expenditure,2017","Trained teachers,Primary,% of total,2017",...,"Unemployment, male (% of male labor force) (modeled ILO estimate),2017","Unemployment, female (% of female labor force) (modeled ILO estimate),2000","Unemployment, female (% of female labor force) (modeled ILO estimate),2017","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate),2000","Unemployment, youth male (% of male labor force ages 15-24) (modeled ILO estimate),2017","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate),2000","Unemployment, youth female (% of female labor force ages 15-24) (modeled ILO estimate),2017","Unemployment with basic education (% of total labor force with basic education),2014-17","Unemployment with intermediate education (% of total labor force with intermediate education),2014-17","Unemployment with advanced education (% of total labor force with advanced education),2014-17"
0,Afghanistan,..,9.7,..,10.7,..,..,3.9,15.7,..,...,1.1,3.0,2.5,5.6,2.1,5.2,3.8,12.3,16.2,15.5
1,Albania,..,29.8,..,9.8,..,14.8,4.0,13.6,..,...,14.6,18.2,12.6,30.0,33.8,26.6,27.5,13.8,20.4,19.1
2,Algeria,12.2,..,21.2,..,..,..,..,..,100.0,...,10.0,29.7,21.1,51.3,26.6,47.1,45.5,..,..,..
3,American Samoa,..,..,..,..,..,..,..,..,..,...,..,..,..,..,..,..,..,..,..,..
4,Andorra,..,12.7,..,13.6,..,23.7,3.2,..,100.0,...,..,..,..,..,..,..,..,..,..,..


In [46]:
# Save dataframe as csv files
education_unemployment_df.to_csv("../Data/education_unemployment.csv", index=False)

In [48]:
#import remaining dependencies 
import csv
import sys, getopt, pprint

#CSV to JSON Conversion
csvfile = open("../Data/education_unemployment.csv", 'r')
reader = csv.DictReader( csvfile )

# add education_unemployment to the existing World_Development_Indicators
conn = 'mongodb://localhost:27017'
mongo_client = MongoClient(conn) 
db = client.World_Development_Indicator 

# Create a collection" unemployment_educationinput" in database
db.unemployment_educationinput.drop()
education_unemployment_df_columns = list(education_unemployment_df)
header= education_unemployment_df_columns
for each in reader:
    row={}
    for field in header:
        row[field]=each[field]

    db.unemployment_educationinput.insert_one(row)