# Trending YouTube Video Statistics

ETL project Bootcamp
<br>Estefanía González
<br>Paolo Vega
<br>23-may-2020 Version 2

## Finding Data

The data for this project was dowloaded from [Kaggle](https://www.kaggle.com/datasnaek/youtube-new/data?select=MX_category_id.json).
And the files are presented as:
- CSV: each region’s data is in a separate file. Data includes the video title, channel title, publish time, tags, views, likes and dislikes, description, and comment count.
- Json: the categories for a specific video. This file is included for each of the five regions in the dataset.

In [1]:
# Modules needed
import pandas as pd
import json
from pandas.io.json import json_normalize
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
import os
import glob
from datetime import datetime

## Data Cleanup & Analysis

### Extract

In [2]:
# Read JSON file
file = 'Resources/DataSources/json/CA_category_id.json'
# Opemn JSON File
data = json.load(open(file))
# Normalize 'items' element inside json tree (where our information is)
items_df = json_normalize(data['items'])
# Remove unwanted columns
items_df = items_df[["id","snippet.title"]].copy()
items_df.head()

Unnamed: 0,id,snippet.title
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports


In [3]:
#----- Columns to use in the csv file

columns = [
    'video_id',
    'trending_date',
    'title',
    'channel_title',
    'category_id',
    'views',
    'likes',
    'dislikes',
    'comment_count'
]
#----- All the countries that are used
countries = ['CA',
            'DE',
            'FR',
            'GB',
            'IN',
            'JP',
            'KR',
            'MX',
            'RU',
            'US',
                        
]

#------ Readind all the csv files in the folder
#------ Changing the encoding to latin1 in order to read the special characters
dataframes = []

for country in countries:
    df = [pd.read_csv(f'./Resources/DataSources/csv/{country}videos.csv',encoding='latin1',index_col=None, header=0, 
                     usecols=columns)]
    dataframes.append(df)

### Transform

In [4]:
# Rename columns
items_df = items_df.rename(columns={"id":"CategoryID","snippet.title":"CategoryName"})
# Set index to ID column
#items_df = items_df.set_index("CategoryID")
items_df

Unnamed: 0,CategoryID,CategoryName
0,1,Film & Animation
1,2,Autos & Vehicles
2,10,Music
3,15,Pets & Animals
4,17,Sports
5,18,Short Movies
6,19,Travel & Events
7,20,Gaming
8,21,Videoblogging
9,22,People & Blogs


### Database set up
Preparing the necessary credential to use Postgres

In [5]:
# import variables from config file
from credentials import host
from credentials import pwd
from credentials import usr
from credentials import dialect
from credentials import port
from credentials import db

engine = create_engine(f'{dialect}://{usr}:{pwd}@{host}:{port}/{db}')

# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)
# display tables/classes
display(Base.classes.keys())

# Save references to each table
Category = Base.classes.Category
Country = Base.classes.Country
Video = Base.classes.Video

# Start session
session = Session(engine)

['Category', 'Country', 'Video']

In [23]:
# Read Tables to determine what task to perform in each
category_data = pd.read_sql_table("Category",engine)
country_data = pd.read_sql_table("Country",engine)
video_data = pd.read_sql_table("Video",engine)
engine.dispose()

### Transform

In [7]:
# Load information
try:
    new_items_df.to_sql('Category',con=engine,index=False,if_exists='append',chunksize=len(new_items_df))
    print("Success!")
except:
    print("Error inserting into DB")


Error inserting into DB


In [8]:
#Transform
#------ Datarframes of all countries availables 

ca = dataframes[0][0]
de = dataframes[1][0]
fr = dataframes[2][0]
gb = dataframes[3][0]
in_ = dataframes[4][0]
jp = dataframes[5][0]
kr = dataframes[6][0]
mx = dataframes[7][0]
ru = dataframes[8][0]
us = dataframes[9][0]


#----- Adding the CountryID to each DF
ca['CountryID'] = 1
de['CountryID'] = 2
fr['CountryID'] = 3
gb['CountryID'] = 4
in_['CountryID'] = 5
jp['CountryID'] = 6
kr['CountryID'] = 7
mx['CountryID'] = 8
ru['CountryID'] = 9
us['CountryID'] = 10




In [9]:
#----- Creating the DF with all the countries

Video_df = ca.append([de,fr,gb,in_,jp,kr,mx,ru,us])

In [10]:
#------ Renaming the columns to export the data into the created Database

Video_df.rename(columns={
    'video_id' :'VideoID', 
    'trending_date':'TrendingDate',
    'title':'Title', 
    'channel_title':'Channel', 
    'category_id':'CategoryID',
    'views':'Views', 
    'likes':'Likes', 
    'dislikes' :'Dislikes', 
    'comment_count':'Comments'
}, inplace = True)

Video_df['Title'] = Video_df['Title'].str.encode('latin1').str.decode('utf8')
Video_df['Channel'] = Video_df['Channel'].str.encode('latin1').str.decode('utf8')

In [11]:
#-----  Removing leading spaces and capitalize the string
Video_df["VideoID"] = Video_df["VideoID"].str.strip().str.capitalize()
Video_df['TrendingDate']= Video_df['TrendingDate'].str.replace('.','-')
Video_df['TrendingDate']=pd.to_datetime(Video_df['TrendingDate'], format ="%y-%d-%m")
Video_df["Title"] = Video_df["Title"].str.strip().str.capitalize()
Video_df["Channel"]=Video_df["Channel"].str.strip().str.capitalize()

idx = 0
new_col = list(range(1,len(Video_df["VideoID"])+1))  # can be a list, a Series, an array or a scalar   
Video_df.insert(loc=idx, column='VideoSRID', value=new_col)

In [12]:
# Extract
#------- Creating the table with the countriesID

index=list(range(1,len(countries)+1))
Country_df = pd.DataFrame({'CountryID':index,'CountryName':countries})

In [13]:
# Transform
# insert Countrty (empty table)
Country_df["CountryID"] = Country_df["CountryID"].astype("int")
Country_df.applymap(lambda x: x.strip().capitalize() if isinstance(x, str) else x)


Unnamed: 0,CountryID,CountryName
0,1,Ca
1,2,De
2,3,Fr
3,4,Gb
4,5,In
5,6,Jp
6,7,Kr
7,8,Mx
8,9,Ru
9,10,Us


In [14]:
# Validate data into DB
category_data = pd.read_sql_table("Category",engine)
category_data.head()

Unnamed: 0,CategoryID,CategoryName


### Load

In [15]:
# Load information

# Cleaunup rules for dataframe columns (improve performance)
# remove leading spaces and capitalize the string
items_df["CategoryID"] = items_df["CategoryID"].astype("int")
items_df.applymap(lambda x: x.strip().capitalize() if isinstance(x, str) else x)

new_items_df = items_df.merge(category_data, how="left", on="CategoryID")

new_items_df = new_items_df[new_items_df['CategoryName_y'].isnull()]
new_items_df = new_items_df[['CategoryID','CategoryName_x']]
new_items_df = new_items_df.rename(columns={"CategoryName_x":"CategoryName"})
new_items_df.head()


try:
    new_items_df.to_sql('Category',con=engine,index=False,if_exists='append',chunksize=len(new_items_df))
    print("Success!")
except:
    print("Error inserting into DB")


Success!


In [16]:

new_countries_df = Country_df.merge(country_data, how="left", on="CountryID")

new_countries_df = new_countries_df[new_countries_df['CountryName_y'].isnull()]
new_countries_df = new_countries_df[['CountryID','CountryName_x']]
new_countries_df = new_countries_df.rename(columns={"CountryName_x":"CountryName"})
new_countries_df.head()
# Load

try:
    new_countries_df.to_sql('Country', con=engine, index=False, if_exists='append',chunksize=len(new_countries_df))
    print("Success!")
except:
    print("Error inserting into the DB")

Success!


In [19]:
#---- Extracting the CategoryID from the DF Video_df, in order to see if all categories are inclueded

category_data2=pd.DataFrame(data=Video_df['CategoryID'].drop_duplicates())
category_data2.reset_index(inplace = True)
category_data2.drop(columns=['index'],inplace = True)


In [25]:
#----- Merging with the previous categories that are loaded in the database

categorias_df= pd.merge(category_data2,category_data, on='CategoryID', how='left')


In [26]:
categorias_df

Unnamed: 0,CategoryID,CategoryName
0,10,Music
1,23,Comedy
2,24,Entertainment
3,25,News & Politics
4,22,People & Blogs
5,26,Howto & Style
6,1,Film & Animation
7,28,Science & Technology
8,20,Gaming
9,17,Sports


In [27]:
#----- Selecting the ones that are not included in the database
new_cat= categorias_df[categorias_df['CategoryName'].isnull()]
#------ Adding the New category to format purposes
new_cat.fillna("New Category", inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  **kwargs


In [28]:
#Adding the new Categories into the table Category
for index, row in new_cat.iterrows():
        # create a new object (category)
        newCategory = Category(CategoryID = row["CategoryID"], CategoryName = row["CategoryName"])
        # insert into DB
        session.add(newCategory)
        print("Done!")
# Commit changes
try:
    session.commit()
    print("Commit!")
except:
    session.rollback()
    print("Rollback :(")


Done!
Commit!


In [29]:
Video_df.to_sql('Video',con=engine,index=False,if_exists='append', chunksize = 1000)
        