## YouTube Trending Project
Analyzing data from the YouTube trending page in the US
over the span of a few days (10/23-27/2020)

Goal: 
* To understand common characteristics of trending videos in different countries

* To predict engagement (likes or comments) on a video in english speaking countries

## Table of Contents:
* 1. Data Overview
    * 1.1 Data Analysis 
* 2. Cleaning
    * 2.1 Feature Engineering
    * 2.2 Column Dropping
* 3. Modeling

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

# Encoding and Data Split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
# import category_encoders as ce

# Plotting Modules
import matplotlib.pyplot as plt

# Reading the stitched data
trend_data = pd.read_csv("../YouTube-Trending/Data/US_10.23-28.20.csv")
# Set seed for reproducibility
np.random.seed(0)

df = trend_data.copy()
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,duration,country
0,bPiofmZGb8o,Second 2020 Presidential Debate between Donald...,2020-10-23T02:49:33Z,UCb--64Gl51jIEVE-GLDAVTg,C-SPAN,25,20.23.10,C-SPAN|CSPAN|2020|Donald Trump|Republican|Whit...,6641600,94601,6209,59293,https://i.ytimg.com/vi/bPiofmZGb8o/default.jpg,False,False,President Donald Trump and former Vice Preside...,1H59M15S,US
1,tcYodQoapMg,Ariana Grande - positions (official video),2020-10-23T04:00:10Z,UC0VOyT2OCBKdQhF3BAbZ-1g,ArianaGrandeVevo,10,20.23.10,ariana grande positions|positions ariana grand...,7516529,1485130,10810,140549,https://i.ytimg.com/vi/tcYodQoapMg/default.jpg,False,False,The official “positions” music video by Ariana...,2M58S,US
2,np9Ub1LilKU,Jack Harlow - Tyler Herro [Official Video],2020-10-22T19:00:14Z,UC6vZl7Qj7JglLDmN_7Or-ZQ,Jack Harlow,10,20.23.10,jack harlow|jack rapper|harlow rapper|private ...,1499338,153028,2006,11013,https://i.ytimg.com/vi/np9Ub1LilKU/default.jpg,False,False,Jack Harlow - Tyler HerroListen now: https://J...,3M,US
3,5S4bm3bAt9Y,SURPRISING BEST FRIEND WITH BORAT!!,2020-10-21T19:56:24Z,UCef29bYGgUSoJjVkqhcAPkw,David Dobrik Too,22,20.23.10,[none],5320147,596894,7044,33648,https://i.ytimg.com/vi/5S4bm3bAt9Y/default.jpg,False,False,Thank you Borat for coming over!! I like youWa...,5M55S,US
4,GuEkHIgR46k,Bryson Tiller - Always Forever (Official Video),2020-10-22T16:00:08Z,UCwhe-6skwaZxLomc-U6Wy1w,BrysonTillerVEVO,10,20.23.10,Bryson Tiller 2020|Bryson Tiller Serenity|Brys...,862087,82059,657,4459,https://i.ytimg.com/vi/GuEkHIgR46k/default.jpg,False,False,A N N I V E R S A R Y OUT NOW!Stream/Download:...,2M59S,US


### 2. Cleaning
* 2.1 Feature Engineering

In [2]:
# df[df['title'].str.contains("transit")]
# df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%d').dt.tz_localize(None)
# df['trending_date'] = pd.to_datetime(df['trending_date'], format="%y.%d.%m")
# # df['publishedAt'] = df['publishedAt'].dt.date
# df.head()
# df.dtypes
np.shape(df)

(1200, 18)

In [3]:
# Adding Like/Dislike Ratio Column
df['likeRatio'] = (df['likes']-df['dislikes'])/(df['likes']+df['dislikes'])

# Changing the 'publishedAt' and 'trending_date' type from string to datetime type
df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%d').dt.tz_localize(None)
df['new_date_published'] = df['publishedAt'].dt.date
df['trending_date'] = pd.to_datetime(df['trending_date'], format="%y.%d.%m")
df['new_date_trending'] = df['trending_date'].dt.date
# df['publishedAt'] = df['publishedAt'].dt.date
df['days_lapse'] = df['new_date_trending'] - df['new_date_published']

# Breaking down 'duration' into Hour, Minutes, and Seconds
df['durationHr'] = df['duration'].str.extract('(\d+)H').fillna(0).astype(int)
df['durationMin'] = df['duration'].str.extract('(\d+)M').fillna(0).astype(int)
df['durationSec'] = df['duration'].str.extract('M(\d+)S').fillna(0).astype(int)

# Adding 'titleLength' Column
df['titleLength'] = df['title'].apply(lambda x: len(str(x)))

# Adding 'tagCount' Column
df.loc[df['tags'].str.count("\|") != 0, 'tagCount'] = df['tags'].str.count("\|") + 1
df.loc[df['tags'].str.count("\|") == 0, 'tagCount'] = 0
df['tagCount'] = df['tagCount'].astype(int)


In [4]:
df = df.drop_duplicates(subset=['video_id'], keep='last')
df[df.video_id == 'tcYodQoapMg']

In [6]:
# Dropping unneeded columns
df.drop(columns=['channelTitle', 'channelId', 'video_id', 'title', 'description',
'channelTitle', 'tags', 'thumbnail_link', 'comments_disabled', 'duration',
'ratings_disabled'], axis=0, inplace=True)

df = df.dropna()
df.isnull().sum()

publishedAt           0
categoryId            0
trending_date         0
view_count            0
likes                 0
dislikes              0
comment_count         0
country               0
likeRatio             0
new_date_published    0
new_date_trending     0
days_lapse            0
durationHr            0
durationMin           0
durationSec           0
titleLength           0
tagCount              0
dtype: int64

In [7]:
df.head()

Unnamed: 0,publishedAt,categoryId,trending_date,view_count,likes,dislikes,comment_count,country,likeRatio,new_date_published,new_date_trending,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount
108,2020-10-20 12:15:11,17,2020-10-23,488217,8930,486,2639,US,0.896771,2020-10-20,2020-10-23,3 days,0,15,20,83,21
136,2020-10-20 02:00:01,24,2020-10-23,586347,12870,125,818,US,0.980762,2020-10-20,2020-10-23,3 days,0,5,50,98,32
138,2020-10-19 04:51:39,17,2020-10-23,152450,2198,110,571,US,0.904679,2020-10-19,2020-10-23,4 days,0,5,40,93,35
139,2020-10-18 20:35:18,2,2020-10-23,769352,7123,398,705,US,0.894163,2020-10-18,2020-10-23,5 days,0,2,1,27,5
142,2020-10-19 14:31:30,24,2020-10-23,154603,6559,88,1791,US,0.973522,2020-10-19,2020-10-23,4 days,0,31,32,60,37


In [8]:
df.dtypes

publishedAt            datetime64[ns]
categoryId                      int64
trending_date          datetime64[ns]
view_count                      int64
likes                           int64
dislikes                        int64
comment_count                   int64
country                        object
likeRatio                     float64
new_date_published             object
new_date_trending              object
days_lapse            timedelta64[ns]
durationHr                      int64
durationMin                     int64
durationSec                     int64
titleLength                     int64
tagCount                        int64
dtype: object

In [9]:
df.to_csv("../YouTube-Trending/Data/Clean_10.23-27.20.csv", index=False)

### One Hot Encoding Country
Mapping each date to a vector consisting of 0s and 1s 
that denote the absence or presence of the feature

### Dictionary:
* 0 - Canada
* 1 - Great Britain
* 2 - United States

In [10]:
encoder = OneHotEncoder()

enc_df = pd.DataFrame(encoder.fit_transform(df[['country']]).toarray().astype(int))

df = df.join(enc_df)

df.drop(columns=['country'], axis=0, inplace=True)
df.head()

Unnamed: 0,publishedAt,categoryId,trending_date,view_count,likes,dislikes,comment_count,likeRatio,new_date_published,new_date_trending,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount,0
108,2020-10-20 12:15:11,17,2020-10-23,488217,8930,486,2639,0.896771,2020-10-20,2020-10-23,3 days,0,15,20,83,21,1.0
136,2020-10-20 02:00:01,24,2020-10-23,586347,12870,125,818,0.980762,2020-10-20,2020-10-23,3 days,0,5,50,98,32,1.0
138,2020-10-19 04:51:39,17,2020-10-23,152450,2198,110,571,0.904679,2020-10-19,2020-10-23,4 days,0,5,40,93,35,1.0
139,2020-10-18 20:35:18,2,2020-10-23,769352,7123,398,705,0.894163,2020-10-18,2020-10-23,5 days,0,2,1,27,5,1.0
142,2020-10-19 14:31:30,24,2020-10-23,154603,6559,88,1791,0.973522,2020-10-19,2020-10-23,4 days,0,31,32,60,37,1.0
