In [None]:
# ETL project Bootcamp
# Estefanía González
# Paolo Vega
# 20-may-2020 Version 1.0.1

In [1]:
# Modules needed
import pandas as pd
import json
from pandas.io.json import json_normalize
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
import os
import glob
from datetime import datetime

In [2]:
# Read JSON file
file = 'Resources/DataSources/json/CA_category_id.json'
# Opemn JSON File
data = json.load(open(file))
# Normalize 'items' element inside json tree (where our information is)
items_df = json_normalize(data['items'])
# Remove unwanted columns
items_df = items_df[["id","snippet.title"]].copy()
items_df.head()

Unnamed: 0,id,snippet.title
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [3]:
# Rename columns
items_df = items_df.rename(columns={"id":"CategoryID","snippet.title":"CategoryName"})
# Set index to ID column
#items_df = items_df.set_index("CategoryID")
items_df.head()

Unnamed: 0,CategoryID,CategoryName
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [4]:
# import variables from config file
from credentials import host
from credentials import pwd
from credentials import usr
from credentials import dialect
from credentials import port
from credentials import db

engine = create_engine(f'{dialect}://{usr}:{pwd}@{host}:{port}/{db}')

# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)
# display tables/classes
display(Base.classes.keys())

# Save references to each table
Category = Base.classes.Category
Country = Base.classes.Country
Video = Base.classes.Video

# Start session
session = Session(engine)

['Category', 'Country', 'Video']

In [5]:
# Read Tables to determine what task to perform in each
category_data = pd.read_sql_table("Category",engine)
country_data = pd.read_sql_table("Country",engine)
video_data = pd.read_sql_table("Video",engine)
engine.dispose()

In [6]:
# insert strategy (empty table)
items_df.head()
if len(category_data) == 0:
    print("Table empty. \n Inserting values...")
    for index, row in items_df.iterrows():
        # remove leading spaces and capitalize the string
        categoryNameTemp = row["CategoryName"].strip().capitalize()
        # create a new object (category)
        newCategory = Category(CategoryID = row["CategoryID"], CategoryName = (categoryNameTemp))
        # insert into DB
        session.add(newCategory)
    print("Done!")
# update Strategy (table with existing rows)
else:
    print("Table not empty")
    # TO DO

# Commit changes
try:
    session.commit()
    print("Commit!")
except:
    session.rollback()
    print("Rollback :(")

Table not empty
Commit!


In [7]:
# Validate data into DB
category_data = pd.read_sql_table("Category",engine)
category_data

Unnamed: 0,CategoryID,CategoryName
0,1,Film & animation
1,2,Autos & vehicles
2,10,Music
3,15,Pets & animals
4,17,Sports
5,18,Short movies
6,19,Travel & events
7,20,Gaming
8,21,Videoblogging
9,22,People & blogs


In [8]:
#----- Columns to use in the csv file

columns = [
    'video_id',
    'trending_date',
    'title',
    'channel_title',
    'category_id',
    'views',
    'likes',
    'dislikes',
    'comment_count'
]
#----- All the countries that are used
countries = ['CA',
            'DE',
            'FR',
            'GB',
            'IN',
            'JP',
            'KR',
            'MX',
            'RU',
            'US'
]

In [9]:
#------ Readind all the csv files in the folder
#------ Changing the encoding to latin1 in order to read the special characters
dataframes = []

for country in countries:
    df = [pd.read_csv(f'./Resources/DataSources/csv/{country}videos.csv',encoding='latin1',index_col=None, header=0, 
                     usecols=columns)]
    dataframes.append(df)

In [10]:
#------ Datarframes of all countries availables 

ca = dataframes[0][0]
de = dataframes[1][0]
fr = dataframes[2][0]
gb = dataframes[3][0]
in_ = dataframes[4][0]
jp = dataframes[5][0]
kr = dataframes[6][0]
mx = dataframes[7][0]
ru = dataframes[8][0]
us = dataframes[9][0]

#----- Adding the CountryID to each DF
ca['CountryID'] = 1
de['CountryID'] = 2
fr['CountryID'] = 3
gb['CountryID'] = 4
in_['CountryID'] = 5
jp['CountryID'] = 6
kr['CountryID'] = 7
mx['CountryID'] = 8
ru['CountryID'] = 9
us['CountryID'] = 10

In [11]:
#----- Creating the DF with all the countries

Video_df = ca.append([ca,de,fr,gb,in_,jp,kr,mx,ru,us])
#Video_df = ca.append([mx])
Video_df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,views,likes,dislikes,comment_count,CountryID
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. BeyoncÃ©,EminemVEVO,10,17158579,787425,43420,125882,1
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,1014651,127794,1688,13030,1
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,3191434,146035,5339,8181,1
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2095828,132239,1989,17518,1
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,33523622,1634130,21082,85067,1


In [12]:
#------ Renaming the columns to export the data into the created Database

Video_df.rename(columns={
    'video_id' :'VideoID', 
    'trending_date':'TrendingDate',
    'title':'Title', 
    'channel_title':'Channel', 
    'category_id':'CategoryID',
    'views':'Views', 
    'likes':'Likes', 
    'dislikes' :'Dislikes', 
    'comment_count':'Comments'
}, inplace = True)

In [13]:
#------- Creating the table with the countriesID

index=list(range(1,len(countries)+1))
Country_df = pd.DataFrame({'CountryID':index,'CountryName':countries})


In [None]:
Country_df

In [14]:
# insert strategy (empty table)

if len(country_data) == 0:
    print("Table empty. \n Inserting values...")
    for index, row in Country_df.iterrows():
        # remove leading spaces and capitalize the string
        countryNameTemp = row["CountryName"].strip().upper()
        # create a new object (category)
        newCountry = Country(CountryID = row['CountryID'], CountryName = (countryNameTemp))
        # insert into DB
        session.add(newCountry)
    print("Done!")
# update Strategy (table with existing rows)
else:
    print("Table not empty")
    # TO DO

# Commit changes
try:
    session.commit()
    print("Commit!")
except:
    session.rollback()
    print("Rollback :(")

Table not empty
Commit!


In [15]:

if len(video_data) == 0:
    print("Table empty. \n Inserting values...")
    for index, row in Video_df.iterrows():
        if(not isinstance(row["VideoID"],str)):
            continue
        # remove leading spaces and capitalize the string
        videoIdTemp = row["VideoID"].strip().capitalize()
        trendingDateTemp = f'20{row["TrendingDate"]}'
        trendingDateTemp = trendingDateTemp.replace(".","-")
        trendingDateTemp = datetime.strptime(trendingDateTemp,"%Y-%d-%m")
        # Get only 200 characteres as defined in the DB
        titleTemp = row["Title"].strip().capitalize()[:200]
        channelTemp = row["Channel"].strip().capitalize()
        viewsTemp = row["Views"]
        likesTemp =  row['Likes']
        dislikesTemp= row["Dislikes"]
        commentsTemp = row["Comments"]
        countryIDTemp = row["CountryID"]
        
        # Compare to the existing dategory in the DB (integrity)
        categoryIDTemp = row["CategoryID"]
        flag = False
        # Search and compare the complete categories in DB
        for index, row in category_data.iterrows():
            if(categoryIDTemp == row["CategoryID"]):
                # Category found!
                flag = True
                break
        if(flag):
            # create a new object (category)
            newVideo = Video(
            VideoID = videoIdTemp, TrendingDate = trendingDateTemp,
            Title= titleTemp, Channel=channelTemp, CategoryID = categoryIDTemp,
            Views = viewsTemp, Likes = likesTemp, Dislikes = dislikesTemp,
            Comments = commentsTemp, CountryID = countryIDTemp)
            # insert into DB
            session.add(newVideo)
    print("Done!")
    # Commit changes
    try:
        session.commit()
        print("Commit!")
    except:
        session.rollback()
        print("Rollback :(")
# update Strategy (table with existing rows)
else:
    print("Table not empty")
    # TO DO



Table empty. 
 Inserting values...
Done!
Commit!
