## YouTube Trending Project
* ### Machine Learning Models

### Table of Contents:
* 1.Exploratory Data Analysis
* 2.Data Cleaning
* 3.Modeling
    * 3.1 Predicting Likes
        * 3.1.1 Pre-processing Data
            * 3.1.1.1 Train-Test Split (80:20)
            * 3.1.1.2 Initializing Pre-processing Pipeline
        * 3.1.2 Hyperparameter Tuning (Gridsearch)
        * 3.1.3 Regressors
            * 3.1.3.1 Linear Regression
            * 3.1.3.2 Random Forest
            * 3.1.3.3 XGBoost
        * 3.1.4 Random Forest
            * 3.1.4.1 Feature Importance
        * 3.1.5 Likes Evaluation
    * 3.2 Predicting Views
        * 3.2.1 Pre-processing Data
            * 3.2.1.1 Train-Test Split (80:20)
            * 3.2.1.2 Initializing Pre-processing Pipeline
        * 3.2.2 Hyperparameter Tuning (Gridsearch)
        * 3.2.3 Regressors
            * 3.2.3.1 Linear Regression
            * 3.2.3.2 Random Forest
            * 3.2.3.3 XGBoost
        * 3.2.4 Random Forest
            * 3.2.4.1 Feature Importance
        * 3.2.5 Views Evaluation
    * 3.3 Predicting Comment Count
        * 3.3.1 Pre-processing Data
            * 3.3.1.1 Train-Test Split (80:20)
            * 3.3.1.2 Initializing Pre-processing Pipeline
        * 3.3.2 Hyperparameter Tuning (Gridsearch)
        * 3.3.3 Regressors
            * 3.3.3.1 Linear Regression
            * 3.3.3.2 Decision Trees
            * 3.3.3.3 Random Forest
        * 3.3.4 Random Forest
            * 3.1.4.1 Feature Importance

### 3. Machine Learning Models
##### Loading Data and Libraries

In [None]:
import helpers
import pandas as pd
import numpy as np
import seaborn as sns


# Encoding and Data Split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Modeling
from sklearn import metrics
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Tuning
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Reading the stitched data
df = helpers.load_df("Data/Curated_US_Data.csv")

df.head()

Unnamed: 0,categoryId,likeRatio,likes_log,views_log,dislikes_log,comment_log,days_lapse,durationHr,durationMin,durationSec,titleLength,tagCount
0,25,0.876818,11.457423,15.708863,8.733755,10.990247,0.0,1,59,15,66,12
1,10,0.985548,14.211013,15.832615,9.288227,11.853311,0.0,0,2,58,42,22
2,10,0.974122,11.938376,14.220534,7.603898,9.306832,1440.0,0,3,0,42,26
3,22,0.976673,13.299495,15.487011,8.859931,10.423709,2880.0,0,5,55,35,0
4,10,0.984114,11.315194,13.667111,6.487684,8.40268,1440.0,0,2,59,47,22


### 3.1 Predicting Likes
#### 3.1.1 Preprocessing Data
##### 3.1.1.1 Train-Test Split (80:20)
Splitting the data into train and test sets in a 80:20 ratio

In [None]:
X = df.drop(columns=['likes_log'])
y = df['likes_log']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

##### 3.1.1.2 Initializing Preprocessing Pipeline
Scaling numercal data and encoding categorical data

In [None]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['durationHr','durationMin','durationSec', 'categoryId'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['durationHr','durationMin','durationSec', 'categoryId']

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numeric_features),
        ('categorical', OneHotEncoder(handle_unknown = "ignore"), categorical_features)])

y

0       11.457423
1       14.211013
2       11.938376
3       13.299495
4       11.315194
          ...    
2731    12.388952
2732    10.343966
2733    13.291704
2734    10.807706
2735     8.847791
Name: likes_log, Length: 2736, dtype: float64

In [None]:
print('Numeric Features:', numeric_features)
print('Categorical Features:', categorical_features)

Numeric Features: Index(['likeRatio', 'views_log', 'dislikes_log', 'comment_log', 'days_lapse',
       'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec', 'categoryId']


#### 3.1.2 Hyperparameter Tuning (Optuna)
Using bayesian hyperparameter optimization to find optimal parameters

In [None]:
# Define Objective Function to be Maximized
def rfObjective(trial):
    # Define Random Forest Parameters
    rfParams={
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500),
        'max_depth' : trial.suggest_int('max_depth', 1, 50),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1,15),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2,15)
    }
    
    # Establish Random Forest Regressor Pipeline
    rfPipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(
        **rfParams
    ))])
    
    for step in range(100):
        rfPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = rfPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
rfStudy = optuna.create_study(direction='maximize')
rfStudy.optimize(rfObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

rfTrial = rfStudy.best_trial

print('Number of finished trials: ', len(rfStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(rfTrial.value))
print("Best hyperparameters: {}".format(rfTrial.params))

[32m[I 2021-01-23 15:02:50,477][0m A new study created in memory with name: no-name-cd6a481b-f6cb-44e1-9ca6-9783387e3ef5[0m
[32m[I 2021-01-23 15:03:01,855][0m Trial 0 finished with value: 0.9846358373221034 and parameters: {'n_estimators': 307, 'max_depth': 22, 'min_samples_leaf': 1, 'min_samples_split': 11}. Best is trial 0 with value: 0.9846358373221034.[0m
[32m[I 2021-01-23 15:03:06,833][0m Trial 1 finished with value: 0.9719301062370006 and parameters: {'n_estimators': 213, 'max_depth': 17, 'min_samples_leaf': 10, 'min_samples_split': 14}. Best is trial 0 with value: 0.9846358373221034.[0m
[32m[I 2021-01-23 15:03:11,831][0m Trial 2 finished with value: 0.9626624921743308 and parameters: {'n_estimators': 274, 'max_depth': 37, 'min_samples_leaf': 13, 'min_samples_split': 6}. Best is trial 0 with value: 0.9846358373221034.[0m
[32m[I 2021-01-23 15:03:22,055][0m Trial 3 finished with value: 0.973738916642255 and parameters: {'n_estimators': 388, 'max_depth': 23, 'min_sampl

In [None]:
# Define Objective Function to be Maximized
def xgbObjective(trial):
    # Define XGBoost Parameters
    xgbParams = {
        'n_estimators' : trial.suggest_int('n_estimators', 100,500),
        'max_depth' : trial.suggest_int('max_depth', 1, 20),
        'eta' : trial.suggest_uniform('eta', 0.01, 1), # learning_rate
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'gamma': trial.suggest_int('gamma', 0, 10), # min_split_loss
        'min_child_weight' : trial.suggest_uniform('min_child_weight', 0.1, 1.0)
    }

    # Establish XGBoost Regressor Pipeline
    xgbPipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', xgb.XGBRegressor(
        **xgbParams
    ))])
    
    for step in range(100):
        xgbPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = xgbPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
xgbStudy = optuna.create_study(direction='maximize')
xgbStudy.optimize(xgbObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

xgbTrial = xgbStudy.best_trial

print('Number of finished trials: ', len(xgbStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(xgbTrial.value))
print("Best hyperparameters: {}".format(xgbTrial.params))
    

[32m[I 2021-01-23 15:27:12,155][0m A new study created in memory with name: no-name-789b4dd5-9259-46c4-b008-cb3519268446[0m
[32m[I 2021-01-23 15:29:47,659][0m Trial 0 finished with value: 0.9529020302988274 and parameters: {'n_estimators': 357, 'max_depth': 10, 'eta': 0.13309898440051457, 'subsample': 0.17296256102369129, 'colsample_bytree': 0.625431395630446, 'gamma': 4, 'min_child_weight': 0.859843701854453}. Best is trial 0 with value: 0.9529020302988274.[0m
[32m[I 2021-01-23 15:33:05,171][0m Trial 1 finished with value: 0.9638550243800642 and parameters: {'n_estimators': 326, 'max_depth': 12, 'eta': 0.235928184298863, 'subsample': 0.3603103051848987, 'colsample_bytree': 0.6711809034166738, 'gamma': 3, 'min_child_weight': 0.34191902692138565}. Best is trial 1 with value: 0.9638550243800642.[0m
[32m[I 2021-01-23 15:33:11,697][0m Trial 2 finished with value: 0.9099770885242903 and parameters: {'n_estimators': 197, 'max_depth': 18, 'eta': 0.35213173792673985, 'subsample': 0.

#### 3.1.3 Regressors
* ##### 3.1.3.1 Linear Regression
* ##### 3.1.3.2 Random Forest
* ##### 3.1.3.3 XGBoost


In [None]:
regressors = [
        LinearRegression(),
        RandomForestRegressor(**rfTrial.params),
        xgb.XGBRegressor(**xgbTrial.params),
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    
    # y_pred = pipe.predict(X_test)

    # d1 = {'True Labels': y_test, 'Predicted Labels': y_pred}
    # SK = pd.DataFrame(data = d1)
    # print(SK)

    print("Model Score: %.3f" % pipe.score(X_test, y_test))

    mae = metrics.mean_absolute_error(y_test, pipe.predict(X_test))
    mse = metrics.mean_squared_error(y_test,pipe.predict(X_test))
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pipe.predict(X_test)))
    print("mae: ", mae)
    print("mse: ", mse)
    print("rmse: ", rmse, "\n")

    
    # lm1 = sns.lmplot(x="True Labels", y="Predicted Labels", data = SK, size = 10)
    # fig1 = lm1.fig 
    # fig1.suptitle("Sklearn ", fontsize=18)
    # sns.set(font_scale = 1.5)


LinearRegression()
Model Score: 0.930
mae:  0.307648610886282
mse:  0.16176731542590703
rmse:  0.40220307734514793 

RandomForestRegressor(max_depth=50, n_estimators=274)
Model Score: 0.989
mae:  0.09461742502033227
mse:  0.026239461729822704
rmse:  0.16198599238768366 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9866952798020608,
             eta=0.07462264455597645, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.074622646, max_delta_step=0, max_depth=7,
             min_child_weight=0.2100895883715349, missing=nan,
             monotone_constraints='()', n_estimators=346, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.1702457663739264,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.996
mae:  0.07228691574125996
mse:  0

#### 3.1.4 Random Forest Regressor

In [None]:
reg = RandomForestRegressor(**{'n_estimators': 456, 'max_depth': 16, 'min_samples_leaf': 1, 'min_samples_split': 3}, oob_score=True)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
              ('regressor', reg)])
pipe.fit(X_train, y_train)   
print(reg)

print("Model Train Score: %.3f" % pipe.score(X_train, y_train))
print("Model OOB Score: %.3f" % reg.oob_score_)
print("Model Test Score: %.3f" % pipe.score(X_test, y_test))

RandomForestRegressor(max_depth=16, min_samples_split=3, n_estimators=456,
                      oob_score=True)
Model Train Score: 0.998
Model OOB Score: 0.986
Model Test Score: 0.988


##### 3.1.4.1 Feature Importance

In [None]:
pd.DataFrame(zip(X.columns,reg.feature_importances_),columns=['feature','importance']).sort_values(by='importance',ascending=False)


Unnamed: 0,feature,importance
3,dislikes_log,0.5883689
1,likeRatio,0.1559548
0,categoryId,0.1400833
2,views_log,0.09250421
5,days_lapse,0.003694765
6,durationHr,0.002194229
4,comment_log,0.00114896
8,durationSec,0.0004434084
7,durationMin,0.0003343907
9,titleLength,1.275014e-05


#### 3.1.5 Likes Evaluation

In [None]:
eval = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb.XGBRegressor(**xgbTrial.params))])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test) 

mae = metrics.mean_absolute_error(y_test,y_pred)
mse = metrics.mean_squared_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.09448494373046938
mse:  0.02556721076235017
rmse:  0.15989750080082604
r2:  0.9889741601004047


In [None]:
df = pd.DataFrame(data=list(zip(list(y_test), list(y_pred))),columns=['actual','predicted'])

#Unlog Values
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.e**x)

df

Unnamed: 0,actual,predicted
0,16246.0,1.810635e+04
1,477871.0,4.191451e+05
2,162807.0,1.762194e+05
3,1267051.0,1.072060e+06
4,199667.0,2.491322e+05
...,...,...
543,20055.0,2.826144e+04
544,51067.0,6.366254e+04
545,6624.0,6.481463e+03
546,23598.0,2.436071e+04


### 3.2 Predicting Views
#### 3.2.1 Preprocessing Data
##### 3.2.1.1 Train-Test Split (80:20)
Splitting the data into train and test sets in a 80:20 ratio

In [None]:
df = helpers.load_df("Data/Curated_US_Data.csv")

X = df.drop(columns=['views_log'])
y = df['views_log']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

##### 3.2.1.2 Initializing Preprocessing Pipeline
Scaling numercal data and encoding categorical data

In [None]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).drop(['durationHr','durationMin','durationSec', 'categoryId'],axis=1).columns
categorical_features = list(X.select_dtypes(include=['object']).columns) + ['durationHr','durationMin','durationSec', 'categoryId']

preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numeric_features),
        ('categorical', OneHotEncoder(handle_unknown = "ignore"), categorical_features)])

y

0       15.708863
1       15.832615
2       14.220534
3       15.487011
4       13.667111
          ...    
2731    14.933087
2732    12.728645
2733    16.079583
2734    13.822260
2735    12.871561
Name: views_log, Length: 2736, dtype: float64

In [None]:
print('Numeric Features:', numeric_features)
print('Categorical Features:', categorical_features)

Numeric Features: Index(['likeRatio', 'likes_log', 'dislikes_log', 'comment_log', 'days_lapse',
       'titleLength', 'tagCount'],
      dtype='object')
Categorical Features: ['durationHr', 'durationMin', 'durationSec', 'categoryId']


#### 3.2.2 Hyperparameter Tuning (Optuna)
Using bayesian hyperparameter optimization to find optimal parameters

In [None]:
# Define Objective Function to be Maximized
def rfObjective(trial):
    # Define Random Forest Parameters
    rfParams={
        'n_estimators' : trial.suggest_int('n_estimators', 100, 500),
        'max_depth' : trial.suggest_int('max_depth', 1, 50),
        'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1,15),
        'min_samples_split' : trial.suggest_int('min_samples_split', 2,15)
    }
    
    # Establish Random Forest Regressor Pipeline
    rfPipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(
        **rfParams
    ))])
    
    for step in range(100):
        rfPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = rfPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
rfStudy = optuna.create_study(direction='maximize')
rfStudy.optimize(rfObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in rfStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

rfTrial = rfStudy.best_trial

print('Number of finished trials: ', len(rfStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(rfTrial.value))
print("Best hyperparameters: {}".format(rfTrial.params))

[32m[I 2021-01-23 15:37:33,475][0m A new study created in memory with name: no-name-da086f27-a7d9-4e9e-8502-4c89492c7471[0m
[32m[I 2021-01-23 15:37:39,767][0m Trial 0 finished with value: 0.8786052152835335 and parameters: {'n_estimators': 401, 'max_depth': 12, 'min_samples_leaf': 13, 'min_samples_split': 14}. Best is trial 0 with value: 0.8786052152835335.[0m
[32m[I 2021-01-23 15:37:47,660][0m Trial 1 finished with value: 0.8895085868053441 and parameters: {'n_estimators': 405, 'max_depth': 11, 'min_samples_leaf': 9, 'min_samples_split': 4}. Best is trial 1 with value: 0.8895085868053441.[0m
[32m[I 2021-01-23 15:37:54,028][0m Trial 2 finished with value: 0.9006515621039455 and parameters: {'n_estimators': 261, 'max_depth': 39, 'min_samples_leaf': 5, 'min_samples_split': 14}. Best is trial 2 with value: 0.9006515621039455.[0m
[32m[I 2021-01-23 15:37:56,707][0m Trial 3 finished with value: 0.8837690175553792 and parameters: {'n_estimators': 152, 'max_depth': 22, 'min_sampl

In [None]:
# Define Objective Function to be Maximized
def xgbObjective(trial):
    # Define XGBoost Parameters
    xgbParams = {
        'n_estimators' : trial.suggest_int('n_estimators', 100,500),
        'max_depth' : trial.suggest_int('max_depth', 1, 20),
        'eta' : trial.suggest_uniform('eta', 0.01, 1), # learning_rate
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'gamma': trial.suggest_int('gamma', 0, 10), # min_split_loss
        'min_child_weight' : trial.suggest_uniform('min_child_weight', 0.1, 1.0)
    }

    # Establish XGBoost Regressor Pipeline
    xgbPipe = Pipeline(steps=[('preprocessor', preprocessor),
                              ('regressor', xgb.XGBRegressor(
        **xgbParams
    ))])
    
    for step in range(100):
        xgbPipe.fit(X_train,y_train)
        
        # Report Intermediate Objective Value
        intermediate_value = xgbPipe.score(X_test, y_test)
        trial.report(intermediate_value, step)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        
        return intermediate_value

# Running the Study
xgbStudy = optuna.create_study(direction='maximize')
xgbStudy.optimize(xgbObjective, n_trials=100)

# Calculating the pruned and completed trials
pruned_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in xgbStudy.trials if t.state == optuna.trial.TrialState.COMPLETE]

xgbTrial = xgbStudy.best_trial

print('Number of finished trials: ', len(xgbStudy.trials))
print('Number of pruned trials: ', len(pruned_trials))
print('Number of complete trials: ', len(complete_trials))

print('Accuracy: {}'.format(xgbTrial.value))
print("Best hyperparameters: {}".format(xgbTrial.params))
    

[32m[I 2021-01-23 16:00:17,388][0m A new study created in memory with name: no-name-11a1ee32-a567-4f7b-8430-df8ab42a2c85[0m
[32m[I 2021-01-23 16:00:19,047][0m Trial 0 finished with value: 0.7708551723534607 and parameters: {'n_estimators': 293, 'max_depth': 14, 'eta': 0.7210231027900509, 'subsample': 0.3643059197932166, 'colsample_bytree': 0.2368432676573281, 'gamma': 3, 'min_child_weight': 0.6579351182020977}. Best is trial 0 with value: 0.7708551723534607.[0m
[32m[I 2021-01-23 16:00:19,615][0m Trial 1 finished with value: 0.8356468561201909 and parameters: {'n_estimators': 479, 'max_depth': 1, 'eta': 0.8241316885713345, 'subsample': 0.9188848679940332, 'colsample_bytree': 0.44737400606018396, 'gamma': 7, 'min_child_weight': 0.7474909694891879}. Best is trial 1 with value: 0.8356468561201909.[0m
[32m[I 2021-01-23 16:00:22,831][0m Trial 2 finished with value: 0.8915907782958484 and parameters: {'n_estimators': 232, 'max_depth': 18, 'eta': 0.3985880873105458, 'subsample': 0.4

#### 3.2.3 Regressors
* ##### 3.2.3.1 Linear Regression
* ##### 3.2.3.2 Random Forest
* ##### 3.2.3.3 XGBoost


In [None]:
regressors = [
        LinearRegression(),
        RandomForestRegressor(**rfTrial.params),
        xgb.XGBRegressor(**xgbTrial.params),
    ]

for regressor in regressors:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', regressor)])
    pipe.fit(X_train, y_train)   
    print(regressor)
    
    # y_pred = pipe.predict(X_test)

    # d1 = {'True Labels': y_test, 'Predicted Labels': y_pred}
    # SK = pd.DataFrame(data = d1)
    # print(SK)

    print("Model Score: %.3f" % pipe.score(X_test, y_test))

    mae = metrics.mean_absolute_error(y_test, pipe.predict(X_test))
    mse = metrics.mean_squared_error(y_test,pipe.predict(X_test))
    rmse = np.sqrt(metrics.mean_squared_error(y_test, pipe.predict(X_test)))
    print("mae: ", mae)
    print("mse: ", mse)
    print("rmse: ", rmse, "\n")

    
    # lm1 = sns.lmplot(x="True Labels", y="Predicted Labels", data = SK, size = 10)
    # fig1 = lm1.fig 
    # fig1.suptitle("Sklearn ", fontsize=18)
    # sns.set(font_scale = 1.5)


LinearRegression()
Model Score: 0.879
mae:  0.29447569332894336
mse:  0.15599693097797418
rmse:  0.39496446799424145 

RandomForestRegressor(max_depth=44, n_estimators=404)
Model Score: 0.932
mae:  0.20357572327341278
mse:  0.08797270984160414
rmse:  0.29660193836454296 

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.976991139780512,
             eta=0.1613191848337776, gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.161319181,
             max_delta_step=0, max_depth=12,
             min_child_weight=0.9797499715072371, missing=nan,
             monotone_constraints='()', n_estimators=493, n_jobs=4,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=0.7313280457980748,
             tree_method='exact', validate_parameters=1, verbosity=None)
Model Score: 0.955
mae:  0.15436871229449495
mse:  0

#### 3.2.4 Random Forest Regressor

In [None]:
reg = RandomForestRegressor(**rfTrial.params, oob_score=True)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
              ('regressor', reg)])
pipe.fit(X_train, y_train)   
print(reg)

print("Model Train Score: %.3f" % pipe.score(X_train, y_train))
print("Model OOB Score: %.3f" % reg.oob_score_)
print("Model Test Score: %.3f" % pipe.score(X_test, y_test))

RandomForestRegressor(max_depth=44, n_estimators=404, oob_score=True)
Model Train Score: 0.990
Model OOB Score: 0.925
Model Test Score: 0.933


##### 3.2.4.1 Feature Importance

In [None]:
pd.DataFrame(zip(X.columns,reg.feature_importances_),columns=['feature','importance']).sort_values(by='importance',ascending=False)


Unnamed: 0,feature,importance
2,likes_log,0.669329
1,likeRatio,0.192385
3,dislikes_log,0.017139
5,days_lapse,0.015549
0,categoryId,0.014636
6,durationHr,0.013991
4,comment_log,0.010549
9,titleLength,0.000459
7,durationMin,0.00035
8,durationSec,0.000189


#### 3.2.5 Views Evaluation

In [None]:
eval = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb.XGBRegressor(**xgbTrial.params))])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test) 

mae = metrics.mean_absolute_error(y_test,y_pred)
mse = metrics.mean_squared_error(y_test,y_pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test,y_pred))
r2 = metrics.r2_score(y_test, y_pred)

print("mae: ", mae)
print("mse: ", mse)
print("rmse: ", rmse)
print("r2: ", r2)

mae:  0.20313152622248157
mse:  0.0878991672254876
rmse:  0.2964779371647874
r2:  0.9318803516434898


In [None]:
df = pd.DataFrame(data=list(zip(list(y_test), list(y_pred))),columns=['actual','predicted'])

#Unlog Values
for col in df.columns:
    df[col] = df[col].apply(lambda x: np.e**x)

df

Unnamed: 0,actual,predicted
0,1138317.0,1.554690e+06
1,6466733.0,6.153997e+06
2,3670365.0,3.772248e+06
3,10925330.0,1.354209e+07
4,10835047.0,6.629584e+06
...,...,...
543,2366855.0,2.135146e+06
544,1020321.0,7.641125e+05
545,902733.0,5.566010e+05
546,696658.0,5.336883e+05
