In [None]:
# ETL project Bootcamp
# Estefanía González
# Paolo Vega
# 20-may-2020 Version 1.0.1

In [1]:
# Modules needed
import pandas as pd
import json
from pandas.io.json import json_normalize
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
import os
import glob

In [2]:
# Path of the JSON file that holds the category definitions
file = 'Resources/DataSources/json/CA_category_id.json'
# Open the JSON file; the context manager closes the handle as soon as parsing is done
with open(file) as json_file:
    data = json.load(json_file)
# Normalize the 'items' element inside the JSON tree (where our information is)
items_df = json_normalize(data['items'])
# Keep only the columns we need
items_df = items_df[["id","snippet.title"]].copy()
items_df.head()

Unnamed: 0,id,snippet.title
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [3]:
# Map the JSON field names onto the column names used for the categories
column_names = {"id": "CategoryID", "snippet.title": "CategoryName"}
items_df = items_df.rename(columns=column_names)
# Setting the index to the ID column is currently disabled
#items_df = items_df.set_index("CategoryID")
items_df.head()

Unnamed: 0,CategoryID,CategoryName
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [4]:
# Load the connection settings from the local credentials module
from credentials import db, dialect, host, port, pwd, usr

# Assemble the connection URL and create the engine from it
connection_url = f'{dialect}://{usr}:{pwd}@{host}:{port}/{db}'
engine = create_engine(connection_url)

# Reflect the existing database tables into automapped classes
Base = automap_base()
Base.prepare(engine, reflect=True)
# Show the names of the reflected tables/classes
display(Base.classes.keys())

# Keep a reference to each mapped table
Category, Country, Video = (
    Base.classes.Category,
    Base.classes.Country,
    Base.classes.Video,
)

# Open a session bound to the engine
session = Session(engine)

['Category', 'Video', 'Country']

In [20]:
# Load the current contents of every table to decide which task each one needs
category_data, country_data, video_data = (
    pd.read_sql_table(table_name, engine)
    for table_name in ("Category", "Country", "Video")
)
# Dispose of the engine's connection pool once the reads are finished
engine.dispose()

In [19]:
# Insert strategy (empty table): stage one Category row per entry of items_df
if len(category_data) == 0:
    print("Table empty. \n Inserting values...")
    for category_id, category_name in zip(items_df["CategoryID"], items_df["CategoryName"]):
        # Trim surrounding whitespace and capitalize the string
        # (str.capitalize also lower-cases everything after the first character)
        categoryNameTemp = category_name.strip().capitalize()
        # Create the new Category object and stage it for insertion
        session.add(Category(CategoryID=category_id, CategoryName=categoryNameTemp))
    print("Done!")
# Update strategy (table with existing rows)
else:
    print("Table not empty")
    # TODO: update the existing rows

# Commit the changes; roll back on failure so the session stays usable, then re-raise
try:
    session.commit()
except Exception:
    session.rollback()
    raise

Table empty. 
 Inserting values...
Done!


In [22]:
# Read the Category table back from the database to check the stored rows
category_data = pd.read_sql_table(table_name="Category", con=engine)
category_data

Unnamed: 0,CategoryID,CategoryName
0,1,Film & animation
1,2,Autos & vehicles
2,10,Music
3,15,Pets & animals
4,17,Sports
5,18,Short movies
6,19,Travel & events
7,20,Gaming
8,21,Videoblogging
9,22,People & blogs


In [None]:
#----- CSV columns to load
columns = [
    'video_id', 'trending_date', 'title', 'channel_title', 'category_id',
    'views', 'likes', 'dislikes', 'comment_count',
]
#----- Country codes of every dataset that is used
countries = ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']

In [None]:
#------ Read the CSV file of every country in `countries`
#------ latin1 encoding is used (chosen by the authors to read the special characters)
# NOTE(review): this path starts with DataSources/Resources, while the JSON file is read
# from Resources/DataSources - confirm which directory order is correct.
dataframes = []

for country in countries:
    csv_path = f'./DataSources/Resources/csv/{country}videos.csv'
    # Each entry is a one-element list wrapping the country's DataFrame
    df = [pd.read_csv(csv_path, usecols=columns, header=0, index_col=None, encoding='latin1')]
    dataframes.append(df)

In [None]:
#------ One DataFrame per available country, in the same order as `countries`
(ca, de, fr, gb, in_, jp, kr, mx, ru, us) = (
    dataframes[position][0] for position in range(10)
)

#----- Tag each DataFrame with its CountryID (1-based position in the tuple below)
for country_id, country_frame in enumerate((ca, de, fr, gb, in_, jp, kr, mx, ru, us), start=1):
    country_frame['CountryID'] = country_id

In [None]:
#----- Creating the DF with all the countries
# Every country frame is listed exactly once so that no country's rows are duplicated
# in the combined frame. Row labels of the individual frames are kept as they are.
# NOTE(review): this rebinds `Video`, which earlier referred to the automapped Video
# table class - confirm the ORM class is no longer needed after this point.

Video = pd.concat([ca, de, fr, gb, in_, jp, kr, mx, ru, us])

In [None]:
#------ Rename the CSV columns to the names used when exporting into the created database
db_column_names = {
    'video_id': 'VideoID',
    'trending_date': 'TrendingDate',
    'title': 'Title',
    'channel_title': 'Channel',
    'category_id': 'CategoryID',
    'views': 'Views',
    'likes': 'Likes',
    'dislikes': 'Dislikes',
    'comment_count': 'Comments',
}
# The rename is applied in place on the combined frame
Video.rename(columns=db_column_names, inplace=True)

In [None]:
#------- Build the table of country IDs: 1-based IDs in the order of `countries`
# NOTE(review): this rebinds `Country`, which earlier referred to the automapped Country
# table class - confirm the ORM class is no longer needed after this point.

index = [position for position, _ in enumerate(countries, start=1)]
Country = pd.DataFrame({'CountryID': index, 'CountryName': countries}).set_index('CountryID')