# To create the table, the following steps were taken:

In [22]:
# Dependencies
import json
import pprint

import pandas as pd
import pymongo
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy import create_engine, inspect, text

# Extracting and transforming the first data source (SQLite database)

In [23]:
# Relative path to the SQLite database file; interpolated into the engine URL
database_path = "../Data/wdi_kaggle.sqlite"

In [24]:
# Build the SQLAlchemy engine for the SQLite file, then print every table
# name the inspector reports, one per line.
engine = create_engine(f"sqlite:///{database_path}")
inspector = inspect(engine)
table_names = inspector.get_table_names()
for table_name in table_names:
    print(table_name)

Country
CountryNotes
Footnotes
Indicators
Series
SeriesNotes


In [25]:
# Open a connection on the engine; the pd.read_sql calls below pass it as
# their connection argument
conn = engine.connect()

In [26]:
# Read every column of the Country table into a DataFrame and preview it
country_query = "SELECT * FROM Country"
country_data = pd.read_sql(country_query, conn)
country_data.head(n=5)

Unnamed: 0,CountryCode,ShortName,TableName,LongName,Alpha2Code,CurrencyUnit,SpecialNotes,Region,IncomeGroup,Wb2Code,...,GovernmentAccountingConcept,ImfDataDisseminationStandard,LatestPopulationCensus,LatestHouseholdSurvey,SourceOfMostRecentIncomeAndExpenditureData,VitalRegistrationComplete,LatestAgriculturalCensus,LatestIndustrialData,LatestTradeData,LatestWaterWithdrawalData
0,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,Fiscal year end: March 20; reporting period fo...,South Asia,Low income,AF,...,Consolidated central government,General Data Dissemination System (GDDS),1979,"Multiple Indicator Cluster Survey (MICS), 2010/11","Integrated household survey (IHS), 2008",,2013/14,,2013.0,2000.0
1,ALB,Albania,Albania,Republic of Albania,AL,Albanian lek,,Europe & Central Asia,Upper middle income,AL,...,Budgetary central government,General Data Dissemination System (GDDS),2011,"Demographic and Health Survey (DHS), 2008/09",Living Standards Measurement Study Survey (LSM...,Yes,2012,2011.0,2013.0,2006.0
2,DZA,Algeria,Algeria,People's Democratic Republic of Algeria,DZ,Algerian dinar,,Middle East & North Africa,Upper middle income,DZ,...,Budgetary central government,General Data Dissemination System (GDDS),2008,"Multiple Indicator Cluster Survey (MICS), 2012","Integrated household survey (IHS), 1995",,,2010.0,2013.0,2001.0
3,ASM,American Samoa,American Samoa,American Samoa,AS,U.S. dollar,,East Asia & Pacific,Upper middle income,AS,...,,,2010,,,Yes,2007,,,
4,ADO,Andorra,Andorra,Principality of Andorra,AD,Euro,,Europe & Central Asia,High income: nonOECD,AD,...,,,2011. Population data compiled from administra...,,,Yes,,,2006.0,


In [27]:
# Read only the three columns needed downstream: country code, region and
# income group
country_columns_query = "SELECT CountryCode, Region, IncomeGroup FROM Country"
country_columns = pd.read_sql(country_columns_query, conn)
country_columns.head(n=5)

Unnamed: 0,CountryCode,Region,IncomeGroup
0,AFG,South Asia,Low income
1,ALB,Europe & Central Asia,Upper middle income
2,DZA,Middle East & North Africa,Upper middle income
3,ASM,East Asia & Pacific,Upper middle income
4,ADO,Europe & Central Asia,High income: nonOECD


In [28]:
# Extracting the three Country columns as a list of raw rows.
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0, so the
# query runs on a short-lived connection that the `with` block closes, and the
# SQL string is wrapped in text() as textual statements require there.
country_columns_sql = text('SELECT CountryCode, Region, IncomeGroup FROM Country')
with engine.connect() as connection:
    country_columns_data = connection.execute(country_columns_sql).fetchall()
[*country_columns_data]

[('AFG', 'South Asia', 'Low income'),
 ('ALB', 'Europe & Central Asia', 'Upper middle income'),
 ('DZA', 'Middle East & North Africa', 'Upper middle income'),
 ('ASM', 'East Asia & Pacific', 'Upper middle income'),
 ('ADO', 'Europe & Central Asia', 'High income: nonOECD'),
 ('AGO', 'Sub-Saharan Africa', 'Upper middle income'),
 ('ATG', 'Latin America & Caribbean', 'High income: nonOECD'),
 ('ARB', '', ''),
 ('ARG', 'Latin America & Caribbean', 'High income: nonOECD'),
 ('ARM', 'Europe & Central Asia', 'Lower middle income'),
 ('ABW', 'Latin America & Caribbean', 'High income: nonOECD'),
 ('AUS', 'East Asia & Pacific', 'High income: OECD'),
 ('AUT', 'Europe & Central Asia', 'High income: OECD'),
 ('AZE', 'Europe & Central Asia', 'Upper middle income'),
 ('BHR', 'Middle East & North Africa', 'High income: nonOECD'),
 ('BGD', 'South Asia', 'Lower middle income'),
 ('BRB', 'Latin America & Caribbean', 'High income: nonOECD'),
 ('BLR', 'Europe & Central Asia', 'Upper middle income'),
 ('BEL

In [29]:
# Wrap the fetched rows in a DataFrame (one DataFrame row per fetched row)
# and preview the first five
country_columns_data_df = pd.DataFrame(data=country_columns_data)
country_columns_data_df.head(n=5)

Unnamed: 0,0,1,2
0,AFG,South Asia,Low income
1,ALB,Europe & Central Asia,Upper middle income
2,DZA,Middle East & North Africa,Upper middle income
3,ASM,East Asia & Pacific,Upper middle income
4,ADO,Europe & Central Asia,High income: nonOECD


In [30]:
# Renaming columns
# Labels are assigned by position instead of renaming 0/1/2: a rename keyed on
# integer labels silently does nothing when the frame's columns are not
# 0, 1, 2 (e.g. if pandas picked up field names from the fetched rows),
# which would leave the expected 'Country Code' column missing.
# Work on a copy so the source frame is left untouched, as rename() did.
new_country_columns_df = country_columns_data_df.copy()
new_country_columns_df.columns = ['Country Code', 'Region', 'Income Group']
new_country_columns_df.head()

Unnamed: 0,Country Code,Region,Income Group
0,AFG,South Asia,Low income
1,ALB,Europe & Central Asia,Upper middle income
2,DZA,Middle East & North Africa,Upper middle income
3,ASM,East Asia & Pacific,Upper middle income
4,ADO,Europe & Central Asia,High income: nonOECD


# Extracting and transforming the second data source (CSV file)

In [31]:
# Relative path to the CSV file that serves as the second data source
second_data = "../Data/API_EN.csv"

In [32]:
# Load the CSV, skipping its first four lines so parsing starts at the
# header row, then preview the result
second_data_df = pd.read_csv(filepath_or_buffer=second_data, skiprows=4)
second_data_df.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,Unnamed: 63
0,Aruba,ABW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,24.670529,24.505835,13.155542,8.351294,8.408363,,,,,
1,Afghanistan,AFG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.04606,0.053604,0.073765,0.074233,0.086292,0.101467,...,0.293837,0.412017,0.350371,0.315602,0.299445,,,,,
2,Angola,AGO,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,0.201003,0.191528,...,1.243406,1.252789,1.330843,1.254617,1.291328,,,,,
3,Albania,ALB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,1.111742,1.166099,...,1.578574,1.803715,1.692908,1.749211,1.978763,,,,,
4,Andorra,AND,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,6.122595,5.86713,5.916597,5.900753,5.83217,,,,,


In [33]:
# Remove the trailing 'Unnamed: 63' column (shown as empty in the preview
# above) and keep the result under a new name
new_table_df = second_data_df.drop(labels='Unnamed: 63', axis='columns')
new_table_df.head(n=5)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Aruba,ABW,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,25.915833,24.670529,24.505835,13.155542,8.351294,8.408363,,,,
1,Afghanistan,AFG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.04606,0.053604,0.073765,0.074233,0.086292,0.101467,...,0.241723,0.293837,0.412017,0.350371,0.315602,0.299445,,,,
2,Angola,AGO,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,0.201003,0.191528,...,1.232495,1.243406,1.252789,1.330843,1.254617,1.291328,,,,
3,Albania,ALB,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,1.111742,1.166099,...,1.4956,1.578574,1.803715,1.692908,1.749211,1.978763,,,,
4,Andorra,AND,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,,,...,6.121652,6.122595,5.86713,5.916597,5.900753,5.83217,,,,


In [34]:
# Join region/income-group info onto the indicator table by country code.
# The join is an inner join (the pandas default), so codes present in only
# one of the two frames are dropped.
# NOTE(review): the previews show Andorra as 'ADO' in the Country table but
# 'AND' in the CSV, so it appears to fall out of this join — confirm whether
# the codes need normalising first.
result_one = new_country_columns_df.merge(new_table_df, on='Country Code', how='inner')
result_one

Unnamed: 0,Country Code,Region,Income Group,Country Name,Indicator Name,Indicator Code,1960,1961,1962,1963,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,AFG,South Asia,Low income,Afghanistan,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.046060,0.053604,0.073765,0.074233,...,0.241723,0.293837,0.412017,0.350371,0.315602,0.299445,,,,
1,ALB,Europe & Central Asia,Upper middle income,Albania,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1.258195,1.374186,1.439956,1.181681,...,1.495600,1.578574,1.803715,1.692908,1.749211,1.978763,,,,
2,DZA,Middle East & North Africa,Upper middle income,Algeria,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.553764,0.531810,0.484954,0.452824,...,3.423011,3.299704,3.291376,3.460266,3.507310,3.717410,,,,
3,ASM,East Asia & Pacific,Upper middle income,American Samoa,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,,,,,,,,,,
4,AGO,Sub-Saharan Africa,Upper middle income,Angola,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.097472,0.079038,0.201289,0.192535,...,1.232495,1.243406,1.252789,1.330843,1.254617,1.291328,,,,
5,ATG,Latin America & Caribbean,High income: nonOECD,Antigua and Barbuda,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.662643,0.849084,1.796794,1.446821,...,5.446757,5.539568,5.363407,5.418447,5.360453,5.377649,,,,
6,ARB,,,Arab World,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,0.643689,0.685151,0.760855,0.874941,...,4.542151,4.615758,4.537755,4.813631,4.650474,4.860234,,,,
7,ARG,Latin America & Caribbean,High income: nonOECD,Argentina,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,2.367473,2.442616,2.522392,2.316356,...,4.410890,4.558500,4.600291,4.569384,4.462904,4.746797,,,,
8,ARM,Europe & Central Asia,Lower middle income,Armenia,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,1.509412,1.465622,1.710071,1.976060,1.899712,1.902759,,,,
9,ABW,Latin America & Caribbean,High income: nonOECD,Aruba,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,,,,,...,25.915833,24.670529,24.505835,13.155542,8.351294,8.408363,,,,


In [35]:
# Serialise the merged frame to a JSON string, parse it back into plain
# Python objects, and keep only the values of the resulting top-level mapping.
# NOTE(review): with to_json()'s default orient the top-level keys are column
# names, so this holds one mapping per column rather than one per row; no
# later cell visible in this notebook reads it — confirm it is still needed.
result_one_json = result_one.to_json()
result_one_dict = json.loads(result_one_json).values()

# Loading into MongoDB

In [36]:
# Connecting to MongoDB
# The URI gets its own name so `conn` keeps referring to the SQLAlchemy
# connection opened earlier; rebinding it to a string made the pd.read_sql
# cells fail when re-run after this cell.
mongo_uri = 'mongodb://localhost:27017'
client = pymongo.MongoClient(mongo_uri)

In [37]:
# Select the `carbon_dioxide_DB` database on the client via attribute access
db = client.carbon_dioxide_DB

In [38]:
# Handle to the `carbon_dioxide` collection of `db`; used as the insert target
carbon_dioxide_col = db.carbon_dioxide

In [39]:
# Inserting data
# Each row of the merged frame becomes one document (to_dict('records')).
# The client is closed in `finally` so the connection is released even when
# the insert raises; previously a failed insert skipped client.close().
try:
    carbon_dioxide_col.insert_many(result_one.to_dict('records'))
finally:
    client.close()