## YouTube Trending Project
* ### Machine Learning Models

### Table of Contents:
* 1.Exploratory Data Analysis
* 2.Data Cleaning
* 3.Modeling
    * 3.1 Predicting Likes
        * 3.1.1 Train-Test Split (80:20)
        * 3.1.2 Linear Regreission
        * 3.1.3 Decision Trees
        * 3.1.4 Random Forest
    * 3.2 Predicting Views
        * 3.2.1 Train-Test Split (80:20)
        * 3.2.2 Linear Regreission
        * 3.2.3 Decision Trees
        * 3.2.4 Random Forest
    * 3.3 Predicting Comment Count
        * 3.3.1 Train-Test Split (80:20)
        * 3.3.2 Linear Regreission
        * 3.3.3 Decision Trees
        * 3.3.4 Random Forest

### 3. Machine Learning Models
##### Loading Data and Libraries

In [8]:
import helpers
import pandas as pd
import numpy as np


# Encoding and Data Split
# import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Modeling
from sklearn import metrics
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# Reading the stitched data
df = helpers.load_df("../YouTube-Trending/Data/Curated_US_Data.csv")

df.head()

Unnamed: 0,categoryId,likeRatio,likes_log,views_log,dislikes_log,comment_log,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount
0,25,0.876818,11.457423,15.708863,8.733755,10.990247,0.0,1,59,15,66,12
1,10,0.985548,14.211013,15.832615,9.288227,11.853311,0.0,0,2,58,42,22
2,10,0.974122,11.938376,14.220534,7.603898,9.306832,1440.0,0,3,0,42,26
3,22,0.976673,13.299495,15.487011,8.859931,10.423709,2880.0,0,5,55,35,0
4,10,0.984114,11.315194,13.667111,6.487684,8.40268,1440.0,0,2,59,47,22


### 3.1 Predicting Likes
#### 3.1.1 Train-Test Split (80:20)
Splitting the data into train and test sets in a 80:20 ratio

In [9]:
X = df.drop(columns=['likes_log'])
y = df['likes_log']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [13]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['durationHr','durationMin','durationSec'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['durationHr','durationMin','durationSec']

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numeric_features),
        ('categorical', OneHotEncoder(handle_unknown = "ignore"), categorical_features)])

y

0       11.457423
1       14.211013
2       11.938376
3       13.299495
4       11.315194
          ...    
2151     9.270588
2152    10.858884
2153    12.763085
2154     8.556222
2155    11.763357
Name: likes_log, Length: 2156, dtype: float64

In [14]:
print('Numeric Features:', numeric_features)
print('Categorical Features:', categorical_features)

Numeric Features: Index(['categoryId', 'likeRatio', 'views_log', 'dislikes_log', 'comment_log',
       'days_lapse', 'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec']


In [None]:
# Setting up encoding pipeline
# encoding_pipeline = ColumnTransformer(
#     transformers=[
#     ('encode_category', ce.HashingEncoder(cols=['categoryId'])),
#     ('encode_other', ce.OneHotEncoder(cols=['durationHr','durationMin','durationSec'])),
# ])


# encoding_pipeline = Pipeline([
#     ('encode_category', ce.HashingEncoder(cols=['categoryId'])),
#     ('encode_other', ce.OneHotEncoder(cols=['durationHr','durationMin','durationSec'])),
# ])

In [31]:
randomForest = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor())])
param_grid = { 
    'regressor__n_estimators' : [100, 200, 400, 600],
    'regressor__max_depth' : [10,20,50,80],
    'regressor__min_samples_leaf': [1,2,5]
}

CV = GridSearchCV(randomForest, param_grid, n_jobs=1,verbose=2)
                  
CV.fit(X, y)  
print(CV.best_params_)    
print(CV.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100, total=   2.1s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100, total=   2.1s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100, total=   2.1s
[CV] regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100 
[CV]  regressor__max_depth=10, regressor__min_samples_leaf=1, regressor__n_estimators=100, total=   2.

KeyboardInterrupt: 

In [28]:
regressors = [
        LinearRegression(),
        DecisionTreeRegressor(max_depth=20, min_samples_leaf=1),
        RandomForestRegressor(max_depth=20, min_samples_leaf=1),
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    print("Model Score: %.3f" % pipe.score(X_test, y_test))

LinearRegression()
Model Score: 0.896
DecisionTreeRegressor(max_depth=20)
Model Score: 0.972
RandomForestRegressor(max_depth=20)
Model Score: 0.989


In [None]:
# X_train_encoded = encoding_pipeline.fit_transform(X_train, y_train)
# X_test_encoded = encoding_pipeline.transform(X_test)

In [None]:
# print(X_train_encoded.shape)
# print(X_test_encoded.shape)

(1724, 126)
(432, 126)


In [None]:
# X_test_encoded.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,likeRatio,views_log,...,durationSec_53,durationSec_54,durationSec_55,durationSec_56,durationSec_57,durationSec_58,durationSec_59,durationSec_60,titleLength,tagCount
0,0,0,0,0,0,1,0,0,0.940481,15.186243,...,0,0,0,0,0,0,0,0,28,42
1,0,0,0,1,0,0,0,0,0.946957,13.52447,...,0,0,0,0,0,0,0,0,50,30
2,0,1,0,0,0,0,0,0,0.985512,13.084913,...,0,0,0,0,0,0,0,0,30,8
3,0,0,0,0,0,0,0,1,0.823195,14.050648,...,0,0,0,0,0,0,0,0,47,13
4,1,0,0,0,0,0,0,0,0.973181,14.605644,...,0,0,0,0,0,0,0,0,54,13


#### 3.1.2 Linear Regression

In [None]:
# linreg = LinearRegression()
# linreg.fit(X_train_encoded,y_train)

LinearRegression()

In [None]:
# y_pred = linreg.predict(X_test_encoded)

# compare = pd.DataFrame({'Actual': y_test.values.flatten(), 'Predicted': y_pred.flatten()})

# compare.head(10)

Unnamed: 0,Actual,Predicted
0,11.820785,12.343211
1,9.087721,9.531987
2,10.354053,9.807254
3,10.271389,10.715927
4,11.637185,11.568882
5,12.09092,12.322163
6,13.084304,12.963527
7,10.272047,10.624679
8,11.998384,12.120579
9,9.833119,9.958843


In [None]:
# mae = metrics.mean_absolute_error(y_test,linreg.predict(X_test_encoded))
# mse = metrics.mean_squared_error(y_test,linreg.predict(X_test_encoded))
# rmse = np.sqrt(metrics.mean_squared_error(y_test,linreg.predict(X_test_encoded)))
# r2 = metrics.r2_score(y_test, linreg.predict(X_test_encoded))

# print("mae: ", mae)
# print("mse: ", mse)
# print("rmse: ", rmse)
# print("r2: ", r2)

mae:  0.3263428780810554
mse:  0.19419509497509377
rmse:  0.44067572542073813
r2:  0.911371069230615


In [None]:
# list(zip(X_train_encoded.columns, linreg.coef_))

[('col_0', 0.22184571598911446),
 ('col_1', 0.0065803460718928735),
 ('col_2', 2.0941581801992015e-14),
 ('col_3', -0.3582221714491499),
 ('col_4', -0.15229629932217592),
 ('col_5', 0.17889798930344197),
 ('col_6', -5.88418203051333e-15),
 ('col_7', 0.10319441940732564),
 ('likeRatio', 4.093813056202636),
 ('views_log', 0.3235043492018989),
 ('dislikes_log', 0.3039073016278344),
 ('comment_log', 0.38223137502520166),
 ('days_lapse', -1.6283598569533098e-06),
 ('durationHr_1', -0.03190396241085808),
 ('durationHr_2', -0.003325169058483346),
 ('durationHr_3', 0.035229131469342645),
 ('durationMin_1', -0.2630330522427655),
 ('durationMin_2', -0.31781049298437974),
 ('durationMin_3', 0.004572717895430678),
 ('durationMin_4', 0.058755485181771205),
 ('durationMin_5', 0.008297439392097927),
 ('durationMin_6', -0.025828444376245874),
 ('durationMin_7', 0.17364040398650366),
 ('durationMin_8', -0.11188650296069486),
 ('durationMin_9', 0.09752170767805785),
 ('durationMin_10', 0.015167120095856

#### 3.1.3 Decisions Tree

In [None]:
# decTreeReg = DecisionTreeRegressor()
# decTreeReg.fit(X_train_encoded,y_train)

DecisionTreeRegressor()

In [None]:
# mae = metrics.mean_absolute_error(y_test, decTreeReg.predict(X_test_encoded))
# mse = metrics.mean_squared_error(y_test, decTreeReg.predict(X_test_encoded))
# rmse = np.sqrt(metrics.mean_squared_error(y_test, decTreeReg.predict(X_test_encoded)))
# r2 = metrics.r2_score(y_test, decTreeReg.predict(X_test_encoded))

# print("mae: ", mae)
# print("mse: ", mse)
# print("rmse: ", rmse)
# print("r2: ", r2)

mae:  0.14736859069668257
mse:  0.06389254735978099
rmse:  0.2527697516709248
r2:  0.9708400041857085


#### 3.1.4 Random Forest

In [None]:
# randomForestReg = RandomForestRegressor()
# randomForestReg.fit(X_train_encoded,y_train)

RandomForestRegressor()

In [None]:
# mae = metrics.mean_absolute_error(y_test, randomForestReg.predict(X_test_encoded))
# mse = metrics.mean_squared_error(y_test, randomForestReg.predict(X_test_encoded))
# rmse = np.sqrt(metrics.mean_squared_error(y_test, randomForestReg.predict(X_test_encoded)))
# r2 = metrics.r2_score(y_test, randomForestReg.predict(X_test_encoded))

# print("mae: ", mae)
# print("mse: ", mse)
# print("rmse: ", rmse)
# print("r2: ", r2)

mae:  0.09771746932085933
mse:  0.024868897567348565
rmse:  0.15769875575713516
r2:  0.9886500542091954


#### 3.1.4.1 Feature Importance

In [None]:
# pd.DataFrame({'Feature':X_test_encoded.columns, 
#               'Importance':randomForestReg.feature_importances_}).sort_values(by='Importance',ascending=False)

Unnamed: 0,Feature,Importance
11,comment_log,6.467589e-01
8,likeRatio,1.248983e-01
9,views_log,1.185276e-01
10,dislikes_log,8.975992e-02
124,titleLength,5.403227e-03
...,...,...
44,durationMin_29,2.798308e-07
63,durationMin_48,2.100153e-08
61,durationMin_46,1.255587e-08
6,col_6,0.000000e+00


#### 3.1.4.2 Feature Standardization

In [None]:
# df.describe()

Unnamed: 0,categoryId,likeRatio,likes_log,views_log,dislikes_log,comment_log,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount
count,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0,2156.0
mean,18.556586,0.942988,10.947442,14.085067,6.96927,8.488577,5428.719852,0.026438,9.983766,28.29731,51.350186,19.162338
std,7.490631,0.094645,1.412178,1.130908,1.369583,1.376559,2948.657838,0.196835,9.511123,18.19911,20.291342,12.662845
min,1.0,-0.292719,6.126869,10.412021,3.044522,1.791759,0.0,0.0,0.0,0.0,4.0,0.0
25%,10.0,0.939105,10.011411,13.266977,6.000795,7.60887,2880.0,0.0,3.0,13.0,38.0,9.0
50%,22.0,0.964657,10.950533,13.99876,6.826545,8.425078,5760.0,0.0,8.0,28.0,48.0,19.0
75%,24.0,0.979908,11.841588,14.829074,7.865955,9.360827,7200.0,0.0,14.0,44.0,62.25,28.0
max,29.0,0.998719,14.925165,17.582435,11.693854,13.542816,27360.0,2.0,59.0,59.0,100.0,60.0


In [None]:
scaler = StandardScaler()
# scaler.fit(df)

StandardScaler()

In [None]:
# pipeline = Pipeline([
#     ('hash', ce.HashingEncoder(cols=['categoryId'])),
#     ('onehot', ce.OneHotEncoder(cols=['durationHr','durationMin','durationSec'])),
#     ('scale', helpers.scaling(X,y)),
#     ('forest',randomForestReg)
# ])

#### 3.1.4.2 Parameter Optimization
##### GridSearch

In [None]:
# parameters = {'forest__max_depth':[10,20],
#               'forest__min_samples_leaf':[25,50]}

In [None]:
# gs = GridSearchCV(pipeline,parameters)

# gs.fit(X_train,y_train)

# scores = pd.DataFrame(gs.cv_results_).filter(regex='param_+|mean_test_score').sort_values('mean_test_score',
#     ascending=False).reset_index().drop(['index'],axis=1)