In [14]:
import os

import pandas as pd
import pymongo
import requests
import sqlalchemy
from bs4 import BeautifulSoup as bs
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# Team SIGMA

# Extract information

## How much time is spent on Education vs. PISA Score by Country

In [3]:
# Load the time-use workbook (time spent per activity category and country)
excel_file = "Resources/Time-Use-in-OECD-Countries-OECD.xlsx"
time_use_df = pd.read_excel(io=excel_file)
time_use_df.head(5)

Unnamed: 0,Country,Category,Time (hours)
0,Australia,Paid work,211.14663
1,Austria,Paid work,279.532268
2,Belgium,Paid work,194.476452
3,Canada,Paid work,268.66061
4,Denmark,Paid work,199.771596


In [4]:
# Distinct activity categories present in the time-use data
time_use_df.loc[:, "Category"].unique()

array(['Paid work', 'Education', 'Care for household members ',
       'Housework', 'Shopping', 'Other unpaid work & volunteering',
       'Sleep', 'Eating and drinking', 'Personal care', 'Sports',
       'Attending events', 'Seeing friends', 'TV and Radio',
       'Other leisure activities'], dtype=object)

In [5]:
# Distinct country labels present in the time-use data
time_use_df.loc[:, "Country"].unique()

array(['Australia', 'Austria', 'Belgium', 'Canada', 'Denmark', 'Estonia',
       'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland',
       'Italy ', 'Japan', 'Korea', 'Latvia', 'Lithuania', 'Luxembourg',
       'Mexico', 'Netherlands', 'New Zealand', 'Norway ', 'Poland',
       'Portugal', 'Slovenia', 'Spain', 'Sweden', 'Turkey', 'UK', 'USA',
       'China', 'India', 'South Africa'], dtype=object)

In [6]:
# Load the PISA science-scale mean score per country and year (educational score)
csv_file = "Resources/pisa-test-score-mean-performance-on-the-science-scale.csv"
pisa_df = pd.read_csv(filepath_or_buffer=csv_file)
pisa_df.head(5)

Unnamed: 0,Entity,Code,Year,PISA: Mean performance on the science scale
0,Albania,ALB,2000,376.453214
1,Albania,ALB,2009,390.695746
2,Albania,ALB,2012,397.373214
3,Albania,ALB,2015,427.225
4,Algeria,DZA,2015,375.7451


## Alcohol Consumption vs. Time Spent Eating and Drinking by Country

In [20]:
# Load alcohol consumption by country (litres of pure alcohol), one column per year
xmart_csv = "Resources/xmart.csv"
xmart_df = pd.read_csv(filepath_or_buffer=xmart_csv)
xmart_df.head(5)

Unnamed: 0,Country,Data Source,Beverage Types,2018,2017,2016,2015,2014,2013,2012,2011,2010
0,Afghanistan,Data source,All types,0.01,0.01,0.02,0.0,0.01,0.01,0.01,0.01,0.01
1,Afghanistan,Data source,Beer,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.01,0.01
2,Afghanistan,Data source,Wine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Afghanistan,Data source,Spirits,0.01,0.01,0.01,0.0,0.01,0.0,0.0,0.0,0.0
4,Afghanistan,Data source,Other alcoholic beverages,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Transform

## How much time is spent on Education vs. PISA Score by Country

In [7]:
# Restrict the PISA scores to 2015, the most recent year of score data we use
year = pisa_df["Year"].eq(2015)
pisa_2015_df = pisa_df[year]
pisa_2015_df.head(5)

Unnamed: 0,Entity,Code,Year,PISA: Mean performance on the science scale
3,Albania,ALB,2015,427.225
4,Algeria,DZA,2015,375.7451
9,Argentina,ARG,2015,432.2262
15,Australia,AUS,2015,509.9939
21,Austria,AUT,2015,495.0375


In [8]:
# Select the "Education" rows: time spent on education, one row per country
education = time_use_df["Category"].eq("Education")
time_use_education_df = time_use_df[education]
time_use_education_df.head(5)

Unnamed: 0,Country,Category,Time (hours)
33,Australia,Education,27.018763
34,Austria,Education,26.897949
35,Belgium,Education,41.277172
36,Canada,Education,35.980637
37,Denmark,Education,25.044998


In [9]:
# Merge the education time and the 2015 PISA scores into one frame, by country.
# The time-use country labels are stripped first: some carry trailing
# whitespace (e.g. 'Italy ', 'Norway '), which would never equal the PISA
# "Entity" value and would silently drop those countries from the inner join.
# NOTE(review): labels such as 'UK', 'USA' and 'Korea' may be spelled
# differently in the PISA file - confirm and map them if they are missing.
education_df = (
    time_use_education_df
    .assign(Country=lambda frame: frame["Country"].str.strip())
    .merge(pisa_2015_df, left_on="Country", right_on="Entity")
)
education_df.head()

Unnamed: 0,Country,Category,Time (hours),Entity,Code,Year,PISA: Mean performance on the science scale
0,Australia,Education,27.018763,Australia,AUS,2015,509.9939
1,Austria,Education,26.897949,Austria,AUT,2015,495.0375
2,Belgium,Education,41.277172,Belgium,BEL,2015,501.9997
3,Canada,Education,35.980637,Canada,CAN,2015,527.7047
4,Denmark,Education,25.044998,Denmark,DNK,2015,501.9369


In [10]:
# Remove the columns we don't need.
# The result is rebound instead of dropped in place, and missing columns are
# ignored, so re-running this cell is harmless (the in-place version raised
# KeyError on the second run because the columns were already gone).
education_df = education_df.drop(columns=['Entity', 'Code', 'Category'], errors="ignore")
education_df.head()

Unnamed: 0,Country,Time (hours),Year,PISA: Mean performance on the science scale
0,Australia,27.018763,2015,509.9939
1,Austria,26.897949,2015,495.0375
2,Belgium,41.277172,2015,501.9997
3,Canada,35.980637,2015,527.7047
4,Denmark,25.044998,2015,501.9369


In [11]:
# Show the column labels that remain in the merged education frame
education_df.columns

Index(['Country', 'Time (hours)', 'Year',
       'PISA: Mean performance on the science scale'],
      dtype='object')

## Alcohol Consumption vs. Time Spent Eating and Drinking by Country

In [18]:
# Select the "Eating and drinking" rows: one row per country
eat_var = time_use_df["Category"].eq('Eating and drinking')
eat_drink_time_df = time_use_df[eat_var]
eat_drink_time_df.head(5)

Unnamed: 0,Country,Category,Time (hours)
231,Australia,Eating and drinking,89.061849
232,Austria,Eating and drinking,78.836406
233,Belgium,Eating and drinking,98.516408
234,Canada,Eating and drinking,64.819713
235,Denmark,Eating and drinking,119.15341


In [19]:
# Keep only the rows that report consumption across all beverage types.
# Labels are stripped before comparing, so the filter matches whether or not
# the file pads them with whitespace (the previous literal was ' All types',
# which only matched when the leading space was present).
all_var = xmart_df["Beverage Types"].str.strip() == "All types"
# .copy() yields an independent frame, so columns can be added to it later
# without writing into a slice of xmart_df.
consumption_all_df = xmart_df.loc[all_var].copy()
consumption_all_df.head()

NameError: name 'xmart_df' is not defined

In [None]:
# Add "Avg": the per-country mean of the yearly values for 2010-2015.
# .assign builds a new frame rather than setting a column on a filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
avg_years = ["2010", "2011", "2012", "2013", "2014", "2015"]
consumption_all_df = consumption_all_df.assign(
    Avg=consumption_all_df[avg_years].mean(axis=1)
)
consumption_all_df.head()

In [None]:
# Merge eating/drinking time with alcohol consumption into one frame, by country.
# Country labels are stripped on both sides first: the time-use data contains
# labels with trailing whitespace (e.g. 'Italy ', 'Norway '), and an inner join
# on the raw strings would silently drop those countries.
# NOTE(review): short labels such as 'UK', 'USA' and 'Korea' may be spelled
# differently in the alcohol file - confirm and map them if they are missing.
consumption_df = (
    eat_drink_time_df
    .assign(Country=lambda frame: frame["Country"].str.strip())
    .merge(
        consumption_all_df.assign(Country=lambda frame: frame["Country"].str.strip()),
        on="Country",
    )
)
consumption_df.head()

In [None]:
# Show the column labels of the merged consumption frame
consumption_df.columns

In [None]:
# Build one document per country with only the fields to be stored:
# "Country", "time_spent" (from "Time (hours)") and "avg_consumption" (from "Avg").
# Renaming and selecting the columns and exporting them as records replaces the
# row-by-row iterrows() loop with a single vectorised call.
consumption_list = (
    consumption_df
    .rename(columns={"Time (hours)": "time_spent", "Avg": "avg_consumption"})
    [["Country", "time_spent", "avg_consumption"]]
    .to_dict("records")
)

# Load

In [12]:
# PostgreSQL connection.
# Connection settings are read from environment variables so real credentials
# do not have to be committed with the notebook; each fallback is the previous
# hard-coded local value, so behaviour is unchanged when nothing is set.
project_db = os.environ.get("ETL_PG_DATABASE", "etl_project_db")  # set this to the name of the database we create
pg_user = os.environ.get("ETL_PG_USER", "postgres")
pg_password = os.environ.get("ETL_PG_PASSWORD", "1234")  # local development default only
pg_host = os.environ.get("ETL_PG_HOST", "localhost")
pg_port = os.environ.get("ETL_PG_PORT", "5432")
rds_connection_string = f"{pg_user}:{pg_password}@{pg_host}:{pg_port}/{project_db}"
engine = create_engine(f'postgresql://{rds_connection_string}')
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector returns the same list of table names.
sqlalchemy.inspect(engine).get_table_names()

['country', 'time_use', 'category', 'education', 'alcohol']

In [16]:
# Create an automap base and have it reflect the existing tables through the engine
Base = automap_base()
# NOTE(review): newer SQLAlchemy releases spell this
# Base.prepare(autoload_with=engine) - confirm the installed version before changing.
Base.prepare(engine, reflect=True)
# List the names of the classes automap generated
Base.classes.keys()

['country', 'time_use', 'category', 'education', 'alcohol']

In [17]:
# Open an ORM session bound to the PostgreSQL engine
session = Session(bind=engine)

In [19]:
# Set up the PyMongo client used to talk to the local MongoDB server
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(host=conn)

In [27]:
# Select the MongoDB database that receives the documents
db = client["etl_project_db"]

# Empty list that will hold the documents to insert
education_list = list()

In [28]:
# Build one document per country containing only the relevant fields.
# A fresh list is assigned (instead of appending to a list created in another
# cell), so re-running this cell no longer duplicates every record, and the
# name `education` is no longer rebound from the boolean mask to a dict.
education_fields = [
    "Country",
    "Time (hours)",
    "PISA: Mean performance on the science scale",
]
education_list = education_df[education_fields].to_dict("records")

In [29]:
# Insert every education document into the score_time_education collection
db["score_time_education"].insert_many(education_list)

<pymongo.results.InsertManyResult at 0x2113610d440>

In [None]:
# Insert every consumption document into the time_alcohol_consumption collection
db["time_alcohol_consumption"].insert_many(consumption_list)