## YouTube Trending Project
* ### Machine Learning Models

### Table of Contents:
* 1.Exploratory Data Analysis
* 2.Data Cleaning
* 3.Modeling
    * 3.1 Predicting Likes
        * 3.1.1 Pre-processing Data
            * 3.1.1.1 Train-Test Split (80:20)
            * 3.1.1.2 Initializing Pre-processing Pipeline
        * 3.1.2 Hyperparameter Tuning (Gridsearch)
        * 3.1.3 Regressors
            * 3.1.3.1 Linear Regression
            * 3.1.3.2 Random Forest
            * 3.1.3.3 XGBoost
        * 3.1.4 Random Forest
            * 3.1.4.1 Feature Importance
        * 3.1.5 Likes Evaluation
    * 3.2 Predicting Views
        * 3.2.1 Pre-processing Data
            * 3.2.1.1 Train-Test Split (80:20)
            * 3.2.1.2 Initializing Pre-processing Pipeline
        * 3.2.2 Hyperparameter Tuning (Gridsearch)
        * 3.2.3 Regressors
            * 3.2.3.1 Linear Regression
            * 3.2.3.2 Random Forest
            * 3.2.3.3 XGBoost
        * 3.2.4 Random Forest
            * 3.2.4.1 Feature Importance
        * 3.2.5 Views Evaluation
    * 3.3 Predicting Comment Count
        * 3.3.1 Pre-processing Data
            * 3.3.1.1 Train-Test Split (80:20)
            * 3.3.1.2 Initializing Pre-processing Pipeline
        * 3.3.2 Hyperparameter Tuning (Gridsearch)
        * 3.3.3 Regressors
            * 3.3.3.1 Linear Regression
            * 3.3.3.2 Decision Trees
            * 3.3.3.3 Random Forest
        * 3.3.4 Random Forest
            * 3.1.4.1 Feature Importance

### 3. Machine Learning Models
##### Loading Data and Libraries

In [1]:
import helpers
import pandas as pd
import numpy as np
import seaborn as sns


# Encoding and Data Split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Modeling
from sklearn import metrics
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Tuning
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Reading the stitched data
df = helpers.load_df("Data/Curated_US_Data.csv")

df.head()

Unnamed: 0,categoryId,likeRatio,likes_log,views_log,dislikes_log,comment_log,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount
0,25,0.876818,11.457423,15.708863,8.733755,10.990247,0.0,1,59,15,66,12
1,10,0.985548,14.211013,15.832615,9.288227,11.853311,0.0,0,2,58,42,22
2,10,0.974122,11.938376,14.220534,7.603898,9.306832,1.0,0,3,0,42,26
3,22,0.976673,13.299495,15.487011,8.859931,10.423709,2.0,0,5,55,35,0
4,10,0.984114,11.315194,13.667111,6.487684,8.40268,1.0,0,2,59,47,22


### 3.1 Predicting Likes
#### 3.1.1 Preprocessing Data
##### 3.1.1.1 Train-Test Split (80:20)
Splitting the data into train and test sets in a 80:20 ratio

In [2]:
X = df.drop(columns=['likes_log'])
y = df['likes_log']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

##### 3.1.1.2 Initializing Preprocessing Pipeline
Scaling numercal data and encoding categorical data

In [4]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['durationHr','durationMin','durationSec', 'categoryId'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['durationHr','durationMin','durationSec', 'categoryId']

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numeric_features),
        ('categorical', OneHotEncoder(handle_unknown = "ignore"), categorical_features)])

y

0       11.457423
1       14.211013
2       11.938376
3       13.299495
4       11.315194
          ...    
2731    12.388952
2732    10.343966
2733    13.291704
2734    10.807706
2735     8.847791
Name: likes_log, Length: 2736, dtype: float64

In [5]:
print('Numeric Features:', numeric_features)
print('Categorical Features:', categorical_features)

Numeric Features: Index(['likeRatio', 'views_log', 'dislikes_log', 'comment_log', 'days_lapse',
       'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec', 'categoryId']


#### 3.1.2 Hyperparameter Tuning (Optuna)
Using bayesian hyperparameter optimization to find optimal parameters

In [6]:
# Define Objective Function to be Maximized
def rfObjective(trial):
    # Define Random Forest Parameters
    rfParams={
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500),
        'max_depth' : trial.suggest_int('max_depth', 1, 50),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1,15),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2,15)
    }
    
    # Establish Random Forest Regressor Pipeline
    rfPipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(
        **rfParams
    ))])
    
    for step in range(100):
        rfPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = rfPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
rfStudy = optuna.create_study(direction='maximize')
rfStudy.optimize(rfObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

rfTrial = rfStudy.best_trial

print('Number of finished trials: ', len(rfStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(rfTrial.value))
print("Best hyperparameters: {}".format(rfTrial.params))

[32m[I 2021-01-26 15:01:08,499][0m A new study created in memory with name: no-name-08d5cfc9-fa97-436b-a65f-9adf79506ce7[0m
[32m[I 2021-01-26 15:01:09,024][0m Trial 0 finished with value: 0.4961520342512937 and parameters: {'n_estimators': 138, 'max_depth': 1, 'min_samples_leaf': 14, 'min_samples_split': 9}. Best is trial 0 with value: 0.4961520342512937.[0m
[32m[I 2021-01-26 15:01:25,278][0m Trial 1 finished with value: 0.9891695699623447 and parameters: {'n_estimators': 259, 'max_depth': 23, 'min_samples_leaf': 1, 'min_samples_split': 3}. Best is trial 1 with value: 0.9891695699623447.[0m
[32m[I 2021-01-26 15:01:32,174][0m Trial 2 finished with value: 0.9685201435338865 and parameters: {'n_estimators': 265, 'max_depth': 31, 'min_samples_leaf': 11, 'min_samples_split': 13}. Best is trial 1 with value: 0.9891695699623447.[0m
[32m[I 2021-01-26 15:01:38,220][0m Trial 3 finished with value: 0.9574312850365042 and parameters: {'n_estimators': 328, 'max_depth': 38, 'min_sample

In [7]:
# Define Objective Function to be Maximized
def xgbObjective(trial):
    # Define XGBoost Parameters
    xgbParams = {
        'n_estimators' : trial.suggest_int('n_estimators', 100,500),
        'max_depth' : trial.suggest_int('max_depth', 1, 20),
        'eta' : trial.suggest_uniform('eta', 0.01, 1), # learning_rate
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'gamma': trial.suggest_int('gamma', 0, 10), # min_split_loss
        'min_child_weight' : trial.suggest_uniform('min_child_weight', 0.1, 1.0)
    }

    # Establish XGBoost Regressor Pipeline
    xgbPipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', xgb.XGBRegressor(
        **xgbParams
    ))])
    
    for step in range(100):
        xgbPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = xgbPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
xgbStudy = optuna.create_study(direction='maximize')
xgbStudy.optimize(xgbObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

xgbTrial = xgbStudy.best_trial

print('Number of finished trials: ', len(xgbStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(xgbTrial.value))
print("Best hyperparameters: {}".format(xgbTrial.params))
    

[32m[I 2021-01-26 15:18:21,432][0m A new study created in memory with name: no-name-6766eaa5-9f6e-4951-b3c4-6fd3863747fc[0m
[32m[I 2021-01-26 15:21:38,410][0m Trial 0 finished with value: 0.9356482341939227 and parameters: {'n_estimators': 265, 'max_depth': 8, 'eta': 0.5910523530396874, 'subsample': 0.2726218163862731, 'colsample_bytree': 0.8766756183053821, 'gamma': 8, 'min_child_weight': 0.697183442368207}. Best is trial 0 with value: 0.9356482341939227.[0m


[32m[I 2021-01-26 15:35:35,134][0m Trial 1 finished with value: 0.9009817935884837 and parameters: {'n_estimators': 499, 'max_depth': 16, 'eta': 0.6761289067152603, 'subsample': 0.6540308153190778, 'colsample_bytree': 0.22628308130286462, 'gamma': 6, 'min_child_weight': 0.13187309357168708}. Best is trial 0 with value: 0.9356482341939227.[0m
[32m[I 2021-01-26 15:45:06,224][0m Trial 2 finished with value: 0.9181867461951714 and parameters: {'n_estimators': 420, 'max_depth': 12, 'eta': 0.8626726376957146, 'subsample': 0.984533550581923, 'colsample_bytree': 0.2677361085418226, 'gamma': 9, 'min_child_weight': 0.6746229741541383}. Best is trial 0 with value: 0.9356482341939227.[0m
[32m[I 2021-01-26 15:58:19,365][0m Trial 3 finished with value: 0.9402416553832936 and parameters: {'n_estimators': 308, 'max_depth': 20, 'eta': 0.8398714819966272, 'subsample': 0.9657971310903626, 'colsample_bytree': 0.9495150408677517, 'gamma': 8, 'min_child_weight': 0.9742474854398435}. Best is trial 3 

#### 3.1.3 Regressors
* ##### 3.1.3.1 Linear Regression
* ##### 3.1.3.2 Random Forest
* ##### 3.1.3.3 XGBoost


In [8]:
regressors = [
        LinearRegression(),
        RandomForestRegressor(**rfTrial.params),
        xgb.XGBRegressor(**xgbTrial.params),
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    
    # y_pred = pipe.predict(X_test)

    # d1 = {'True Labels': y_test, 'Predicted Labels': y_pred}
    # SK = pd.DataFrame(data = d1)
    # print(SK)

    print("Model Score: %.3f" % pipe.score(X_test, y_test))

    mae = metrics.mean_absolute_error(y_test, pipe.predict(X_test))
    mse = metrics.mean_squared_error(y_test,pipe.predict(X_test))
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pipe.predict(X_test)))
    print("mae: ", mae)
    print("mse: ", mse)
    print("rmse: ", rmse, "\n")

    
    # lm1 = sns.lmplot(x="True Labels", y="Predicted Labels", data = SK, size = 10)
    # fig1 = lm1.fig 
    # fig1.suptitle("Sklearn ", fontsize=18)
    # sns.set(font_scale = 1.5)


LinearRegression()
Model Score: 0.930
mae:  0.307648610886282
mse:  0.16176731542590703
rmse:  0.40220307734514793 

RandomForestRegressor(max_depth=50, n_estimators=274)
Model Score: 0.989
mae:  0.09461742502033227
mse:  0.026239461729822704
rmse:  0.16198599238768366 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9866952798020608,
             eta=0.07462264455597645, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.074622646, max_delta_step=0, max_depth=7,
             min_child_weight=0.2100895883715349, missing=nan,
             monotone_constraints='()', n_estimators=346, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.1702457663739264,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.996
mae:  0.07228691574125996
mse:  0

LinearRegression()
Model Score: 0.930
mae:  0.30764861304915814
mse:  0.1617673139041401
rmse:  0.40220307545335865 

RandomForestRegressor(max_depth=22, n_estimators=216)
Model Score: 0.989
mae:  0.09238404361706648
mse:  0.024398276369292347
rmse:  0.1561994762132458 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9317901454660239,
             eta=0.04840992742257614, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.0484099276, max_delta_step=0, max_depth=9,
             min_child_weight=0.3038992027029832, missing=nan,
             monotone_constraints='()', n_estimators=430, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.5872017764679212,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.996
mae:  0.06322841764964314
mse:  

#### 3.1.4 Random Forest Regressor

In [9]:
reg = RandomForestRegressor(**{'n_estimators': 456, 'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 3}, oob_score=True)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
              ('regressor', reg)])
pipe.fit(X_train, y_train)   
print(reg)

print("Model Train Score: %.3f" % pipe.score(X_train, y_train))
print("Model OOB Score: %.3f" % reg.oob_score_)
print("Model Test Score: %.3f" % pipe.score(X_test, y_test))

RandomForestRegressor(max_depth=16, min_samples_split=3, n_estimators=456,
                      oob_score=True)
Model Train Score: 0.998
Model OOB Score: 0.986
Model Test Score: 0.988


RandomForestRegressor(max_depth=16, min_samples_split=3, n_estimators=456,
                      oob_score=True)
Model Train Score: 0.998
Model OOB Score: 0.986
Model Test Score: 0.989


##### 3.1.4.1 Feature Importance

In [10]:
pd.DataFrame(zip(X.columns,reg.feature_importances_),columns=['feature','importance']).sort_values(by='importance',ascending=False)


Unnamed: 0,feature,importance
3,dislikes_log,0.5883689
1,likeRatio,0.1559548
0,categoryId,0.1400833
2,views_log,0.09250421
5,days_lapse,0.003694765
6,durationHr,0.002194229
4,comment_log,0.00114896
8,durationSec,0.0004434084
7,durationMin,0.0003343907
9,titleLength,1.275014e-05


Unnamed: 0,feature,importance
3,dislikes_log,0.587355
1,likeRatio,0.156939
0,categoryId,0.139752
2,views_log,0.092276
5,days_lapse,0.003493
6,durationHr,0.002267
4,comment_log,0.0011
8,durationSec,0.000359
7,durationMin,0.000345
9,titleLength,7e-06


#### 3.1.5 Likes Evaluation

In [11]:
eval = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb.XGBRegressor(**xgbTrial.params))])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test) 

mae = metrics.mean_absolute_error(y_test,y_pred)
mse = metrics.mean_squared_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.09448494373046938
mse:  0.02556721076235017
rmse:  0.15989750080082604
r2:  0.9889741601004047


mae:  0.09387688085459024
mse:  0.024949801805981348
rmse:  0.15795506261586348
r2:  0.9892404172361077


In [12]:
df = pd.DataFrame(data=list(zip(list(y_test), list(y_pred))),columns=['actual','predicted'])

#Unlog Values
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.e**x)

df

Unnamed: 0,actual,predicted
0,16246.0,1.810635e+04
1,477871.0,4.191451e+05
2,162807.0,1.762194e+05
3,1267051.0,1.072060e+06
4,199667.0,2.491322e+05
...,...,...
543,20055.0,2.826144e+04
544,51067.0,6.366254e+04
545,6624.0,6.481463e+03
546,23598.0,2.436071e+04


Unnamed: 0,actual,predicted
0,16246.0,1.904417e+04
1,477871.0,4.169234e+05
2,162807.0,1.745432e+05
3,1267051.0,1.067131e+06
4,199667.0,2.450246e+05
...,...,...
543,20055.0,2.871074e+04
544,51067.0,6.187270e+04
545,6624.0,6.368226e+03
546,23598.0,2.424724e+04


### 3.2 Predicting Views
#### 3.2.1 Preprocessing Data
##### 3.2.1.1 Train-Test Split (80:20)
Splitting the data into train and test sets in a 80:20 ratio

In [13]:
df = helpers.load_df("Data/Curated_US_Data.csv")

X = df.drop(columns=['views_log'])
y = df['views_log']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

##### 3.2.1.2 Initializing Preprocessing Pipeline
Scaling numercal data and encoding categorical data

In [15]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['durationHr','durationMin','durationSec', 'categoryId'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['durationHr','durationMin','durationSec', 'categoryId']

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numeric_features),
        ('categorical', OneHotEncoder(handle_unknown = "ignore"), categorical_features)])

y

0       15.708863
1       15.832615
2       14.220534
3       15.487011
4       13.667111
          ...    
2731    14.933087
2732    12.728645
2733    16.079583
2734    13.822260
2735    12.871561
Name: views_log, Length: 2736, dtype: float64

0       15.708863
1       15.832615
2       14.220534
3       15.487011
4       13.667111
          ...    
2731    14.933087
2732    12.728645
2733    16.079583
2734    13.822260
2735    12.871561
Name: views_log, Length: 2736, dtype: float64

In [16]:
print('Numeric Features:', numeric_features)
print('Categorical Features:', categorical_features)

Numeric Features: Index(['likeRatio', 'likes_log', 'dislikes_log', 'comment_log', 'days_lapse',
       'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec', 'categoryId']


Numeric Features: Index(['likeRatio', 'likes_log', 'dislikes_log', 'comment_log', 'days_lapse',
       'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec', 'categoryId']


#### 3.2.2 Hyperparameter Tuning (Optuna)
Using bayesian hyperparameter optimization to find optimal parameters

In [17]:
# Define Objective Function to be Maximized
def rfObjective(trial):
    # Define Random Forest Parameters
    rfParams={
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500),
        'max_depth' : trial.suggest_int('max_depth', 1, 50),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1,15),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2,15)
    }
    
    # Establish Random Forest Regressor Pipeline
    rfPipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(
        **rfParams
    ))])
    
    for step in range(100):
        rfPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = rfPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
rfStudy = optuna.create_study(direction='maximize')
rfStudy.optimize(rfObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

rfTrial = rfStudy.best_trial

print('Number of finished trials: ', len(rfStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(rfTrial.value))
print("Best hyperparameters: {}".format(rfTrial.params))

[32m[I 2021-01-23 15:37:33,475][0m A new study created in memory with name: no-name-da086f27-a7d9-4e9e-8502-4c89492c7471[0m
[32m[I 2021-01-23 15:37:39,767][0m Trial 0 finished with value: 0.8786052152835335 and parameters: {'n_estimators': 401, 'max_depth': 12, 'min_samples_leaf': 13, 'min_samples_split': 14}. Best is trial 0 with value: 0.8786052152835335.[0m
[32m[I 2021-01-23 15:37:47,660][0m Trial 1 finished with value: 0.8895085868053441 and parameters: {'n_estimators': 405, 'max_depth': 11, 'min_samples_leaf': 9, 'min_samples_split': 4}. Best is trial 1 with value: 0.8895085868053441.[0m
[32m[I 2021-01-23 15:37:54,028][0m Trial 2 finished with value: 0.9006515621039455 and parameters: {'n_estimators': 261, 'max_depth': 39, 'min_samples_leaf': 5, 'min_samples_split': 14}. Best is trial 2 with value: 0.9006515621039455.[0m
[32m[I 2021-01-23 15:37:56,707][0m Trial 3 finished with value: 0.8837690175553792 and parameters: {'n_estimators': 152, 'max_depth': 22, 'min_sampl

[32m[I 2021-01-26 19:48:50,859][0m A new study created in memory with name: no-name-cea2909c-b5cc-4bcf-bde8-f764b68a95a2[0m
[32m[I 2021-01-26 19:49:05,771][0m Trial 0 finished with value: 0.9248883387404547 and parameters: {'n_estimators': 402, 'max_depth': 16, 'min_samples_leaf': 2, 'min_samples_split': 4}. Best is trial 0 with value: 0.9248883387404547.[0m
[32m[I 2021-01-26 19:49:13,557][0m Trial 1 finished with value: 0.889750573627805 and parameters: {'n_estimators': 393, 'max_depth': 25, 'min_samples_leaf': 9, 'min_samples_split': 7}. Best is trial 0 with value: 0.9248883387404547.[0m
[32m[I 2021-01-26 19:49:14,808][0m Trial 2 finished with value: 0.8470004574743338 and parameters: {'n_estimators': 206, 'max_depth': 4, 'min_samples_leaf': 10, 'min_samples_split': 7}. Best is trial 0 with value: 0.9248883387404547.[0m
[32m[I 2021-01-26 19:49:18,343][0m Trial 3 finished with value: 0.9108198381110835 and parameters: {'n_estimators': 122, 'max_depth': 35, 'min_samples_l

In [18]:
# Define Objective Function to be Maximized
def xgbObjective(trial):
    # Define XGBoost Parameters
    xgbParams = {
        'n_estimators' : trial.suggest_int('n_estimators', 100,500),
        'max_depth' : trial.suggest_int('max_depth', 1, 20),
        'eta' : trial.suggest_uniform('eta', 0.01, 1), # learning_rate
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'gamma': trial.suggest_int('gamma', 0, 10), # min_split_loss
        'min_child_weight' : trial.suggest_uniform('min_child_weight', 0.1, 1.0)
    }

    # Establish XGBoost Regressor Pipeline
    xgbPipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', xgb.XGBRegressor(
        **xgbParams
    ))])
    
    for step in range(100):
        xgbPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = xgbPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
xgbStudy = optuna.create_study(direction='maximize')
xgbStudy.optimize(xgbObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

xgbTrial = xgbStudy.best_trial

print('Number of finished trials: ', len(xgbStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(xgbTrial.value))
print("Best hyperparameters: {}".format(xgbTrial.params))
    

[32m[I 2021-01-23 16:00:17,388][0m A new study created in memory with name: no-name-11a1ee32-a567-4f7b-8430-df8ab42a2c85[0m
[32m[I 2021-01-23 16:00:19,047][0m Trial 0 finished with value: 0.7708551723534607 and parameters: {'n_estimators': 293, 'max_depth': 14, 'eta': 0.7210231027900509, 'subsample': 0.3643059197932166, 'colsample_bytree': 0.2368432676573281, 'gamma': 3, 'min_child_weight': 0.6579351182020977}. Best is trial 0 with value: 0.7708551723534607.[0m
[32m[I 2021-01-23 16:00:19,615][0m Trial 1 finished with value: 0.8356468561201909 and parameters: {'n_estimators': 479, 'max_depth': 1, 'eta': 0.8241316885713345, 'subsample': 0.9188848679940332, 'colsample_bytree': 0.44737400606018396, 'gamma': 7, 'min_child_weight': 0.7474909694891879}. Best is trial 1 with value: 0.8356468561201909.[0m
[32m[I 2021-01-23 16:00:22,831][0m Trial 2 finished with value: 0.8915907782958484 and parameters: {'n_estimators': 232, 'max_depth': 18, 'eta': 0.3985880873105458, 'subsample': 0.4

[32m[I 2021-01-26 20:06:02,827][0m A new study created in memory with name: no-name-68e60dc2-5a53-43c0-b1cb-2c430a2acb4f[0m
[32m[I 2021-01-26 20:06:25,229][0m Trial 0 finished with value: 0.8448870355167081 and parameters: {'n_estimators': 124, 'max_depth': 2, 'eta': 0.15216321076914696, 'subsample': 0.8050350157737324, 'colsample_bytree': 0.7133024867213769, 'gamma': 9, 'min_child_weight': 0.5587401403668543}. Best is trial 0 with value: 0.8448870355167081.[0m
[32m[I 2021-01-26 20:11:23,386][0m Trial 1 finished with value: 0.9389832874676141 and parameters: {'n_estimators': 287, 'max_depth': 17, 'eta': 0.1995941791274316, 'subsample': 0.7548424534257852, 'colsample_bytree': 0.5477030505917991, 'gamma': 0, 'min_child_weight': 0.6603112016863432}. Best is trial 1 with value: 0.9389832874676141.[0m
[32m[I 2021-01-26 20:21:37,120][0m Trial 2 finished with value: 0.8716328772102366 and parameters: {'n_estimators': 425, 'max_depth': 18, 'eta': 0.44444557758129644, 'subsample': 0.

#### 3.2.3 Regressors
* ##### 3.2.3.1 Linear Regression
* ##### 3.2.3.2 Random Forest
* ##### 3.2.3.3 XGBoost


In [19]:
regressors = [
        LinearRegression(),
        RandomForestRegressor(**rfTrial.params),
        xgb.XGBRegressor(**xgbTrial.params),
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    
    # y_pred = pipe.predict(X_test)

    # d1 = {'True Labels': y_test, 'Predicted Labels': y_pred}
    # SK = pd.DataFrame(data = d1)
    # print(SK)

    print("Model Score: %.3f" % pipe.score(X_test, y_test))

    mae = metrics.mean_absolute_error(y_test, pipe.predict(X_test))
    mse = metrics.mean_squared_error(y_test,pipe.predict(X_test))
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pipe.predict(X_test)))
    print("mae: ", mae)
    print("mse: ", mse)
    print("rmse: ", rmse, "\n")

    
    # lm1 = sns.lmplot(x="True Labels", y="Predicted Labels", data = SK, size = 10)
    # fig1 = lm1.fig 
    # fig1.suptitle("Sklearn ", fontsize=18)
    # sns.set(font_scale = 1.5)


LinearRegression()
Model Score: 0.879
mae:  0.29447569332894336
mse:  0.15599693097797418
rmse:  0.39496446799424145 

RandomForestRegressor(max_depth=44, n_estimators=404)
Model Score: 0.932
mae:  0.20357572327341278
mse:  0.08797270984160414
rmse:  0.29660193836454296 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.976991139780512,
             eta=0.1613191848337776, gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.161319181,
             max_delta_step=0, max_depth=12,
             min_child_weight=0.9797499715072371, missing=nan,
             monotone_constraints='()', n_estimators=493, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7313280457980748,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.955
mae:  0.15436871229449495
mse:  0

LinearRegression()
Model Score: 0.879
mae:  0.2944756775768164
mse:  0.15599691668124369
rmse:  0.3949644498954858 

RandomForestRegressor(max_depth=36, n_estimators=221)
Model Score: 0.933
mae:  0.20281309518888233
mse:  0.08709366096278136
rmse:  0.2951163515679559 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9948515768867403,
             eta=0.10452481073587663, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.104524814, max_delta_step=0, max_depth=14,
             min_child_weight=0.9927383263026518, missing=nan,
             monotone_constraints='()', n_estimators=346, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.3040423395292238,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.951
mae:  0.16286456727787346
mse:  0.

#### 3.2.4 Random Forest Regressor

In [20]:
reg = RandomForestRegressor(**rfTrial.params, oob_score=True)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
              ('regressor', reg)])
pipe.fit(X_train, y_train)   
print(reg)

print("Model Train Score: %.3f" % pipe.score(X_train, y_train))
print("Model OOB Score: %.3f" % reg.oob_score_)
print("Model Test Score: %.3f" % pipe.score(X_test, y_test))

RandomForestRegressor(max_depth=44, n_estimators=404, oob_score=True)
Model Train Score: 0.990
Model OOB Score: 0.925
Model Test Score: 0.933


RandomForestRegressor(max_depth=36, n_estimators=221, oob_score=True)
Model Train Score: 0.990
Model OOB Score: 0.924
Model Test Score: 0.931


##### 3.2.4.1 Feature Importance

In [21]:
pd.DataFrame(zip(X.columns,reg.feature_importances_),columns=['feature','importance']).sort_values(by='importance',ascending=False)


Unnamed: 0,feature,importance
2,likes_log,0.669329
1,likeRatio,0.192385
3,dislikes_log,0.017139
5,days_lapse,0.015549
0,categoryId,0.014636
6,durationHr,0.013991
4,comment_log,0.010549
9,titleLength,0.000459
7,durationMin,0.00035
8,durationSec,0.000189


Unnamed: 0,feature,importance
2,likes_log,0.667234
1,likeRatio,0.195103
3,dislikes_log,0.016736
5,days_lapse,0.015715
0,categoryId,0.013967
6,durationHr,0.013612
4,comment_log,0.010679
9,titleLength,0.000586
7,durationMin,0.000296
8,durationSec,0.000154


#### 3.2.5 Views Evaluation

In [22]:
eval = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb.XGBRegressor(**xgbTrial.params))])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test) 

mae = metrics.mean_absolute_error(y_test,y_pred)
mse = metrics.mean_squared_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.20313152622248157
mse:  0.0878991672254876
rmse:  0.2964779371647874
r2:  0.9318803516434898


mae:  0.2034548023235258
mse:  0.08823063776938123
rmse:  0.2970364249875446
r2:  0.9316234702917854


In [23]:
df = pd.DataFrame(data=list(zip(list(y_test), list(y_pred))),columns=['actual','predicted'])

#Unlog Values
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.e**x)

df

Unnamed: 0,actual,predicted
0,1138317.0,1.554690e+06
1,6466733.0,6.153997e+06
2,3670365.0,3.772248e+06
3,10925330.0,1.354209e+07
4,10835047.0,6.629584e+06
...,...,...
543,2366855.0,2.135146e+06
544,1020321.0,7.641125e+05
545,902733.0,5.566010e+05
546,696658.0,5.336883e+05


Unnamed: 0,actual,predicted
0,1138317.0,1.616282e+06
1,6466733.0,6.121290e+06
2,3670365.0,3.908597e+06
3,10925330.0,1.360873e+07
4,10835047.0,6.775700e+06
...,...,...
543,2366855.0,2.183233e+06
544,1020321.0,7.748374e+05
545,902733.0,5.340591e+05
546,696658.0,5.218028e+05
