![image info](https://ineuron.ai/images/ineuron-logo.png)

###  Import Data and Required Packages
####  Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
warnings.filterwarnings("ignore")

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
import xgboost

#core import for hyperparamter tuning
from sklearn.model_selection import RandomizedSearchCV


%matplotlib inline

#### Import the CSV Data as Pandas DataFrame

In [5]:
df = pd.read_csv('data/google_outliers.csv')

#### Show Top 5 Records

In [6]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver', 'Day', 'Month', 'Year'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Day,Month,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,7,1,2018.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,15,1,2018.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,1,8,2018.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,136881.75,25.0,12498500.0,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,8,6,2018.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,20,6,2018.0


#### Dropping columns that do not contribute numerically to the Regression Model

In [36]:
df.drop(columns=['Current Ver','Android Ver','App','Last Updated'],inplace=True)

In [None]:
y=mx+c

#### Encoding categorical values

In [9]:
df['Content Rating'].value_counts()

Everyone           8714
Teen               1208
Mature 17+          499
Everyone 10+        414
Adults only 18+       3
Unrated               2
Name: Content Rating, dtype: int64

In [None]:
one-hot encoding

In [10]:
df=pd.get_dummies(df,columns=['Type','Content Rating'],drop_first=True)

In [11]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Price,Genres,Last Updated,Current Ver,Android Ver,Day,Month,Year,Type_Paid,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,19.0,10000.0,0.0,Art & Design,2018-01-07,1.0.0,4.0.3 and up,7,1,2018.0,0,1,0,0,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,14.0,500000.0,0.0,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,15,1,2018.0,0,1,0,0,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,8.7,5000000.0,0.0,Art & Design,2018-08-01,1.2.4,4.0.3 and up,1,8,2018.0,0,1,0,0,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,136881.75,25.0,12498500.0,0.0,Art & Design,2018-06-08,Varies with device,4.2 and up,8,6,2018.0,0,0,0,0,1,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,2.8,100000.0,0.0,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,20,6,2018.0,0,1,0,0,0,0


#### Splitting our mathematical feature columns and assigning it to 'X'

In [22]:
X=df.drop(columns=['App','Category','Rating','Genres',"Last Updated","Current Ver","Android Ver","Day","Month","Year"],axis=1)

In [23]:
X.head()

Unnamed: 0,Reviews,Size,Installs,Price,Type_Paid,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,159.0,19.0,10000.0,0.0,0,1,0,0,0,0
1,967.0,14.0,500000.0,0.0,0,1,0,0,0,0
2,87510.0,8.7,5000000.0,0.0,0,1,0,0,0,0
3,136881.75,25.0,12498500.0,0.0,0,0,0,0,1,0
4,967.0,2.8,100000.0,0.0,0,1,0,0,0,0


#### Splitting our target variable 'Rating' and assigning it to 'y'

In [24]:
y=df['Rating']

#### Splitting up our data set into 'train' and 'test'

In [25]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3)

#### Scaling the training data(fitting the parameters and transforming the values)

In [26]:
scaler= StandardScaler()

X_train=scaler.fit_transform(X_train)

#transforming the test data.We avoid fitting the values to prevent data leakage!
X_test=scaler.transform(X_test)

In [None]:
###liner regression, ridge regresion, lasso regression

In [None]:
one notebook

## Random Forest Regressor

#### Fitting without hyperparamter tuning

In [17]:
rf=RandomForestRegressor(n_estimators = 10, random_state = 42)
rf.fit(X_train,y_train)

RandomForestRegressor(n_estimators=10, random_state=42)

#### Predictions

In [18]:
pred_rf=rf.predict(X_test)

#### Use the random grid to search for best hyperparameters

In [19]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [20]:
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs =-1,scoring='neg_mean_squared_error')
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, scoring='neg_mean_squared_error',
                   verbose=2)

#### Getting the best params

In [22]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

#### Getting the best estimators

In [23]:
rf_random.best_estimator_

RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=1600)

#### Reinitializing the regressor object with the best probable estimators

In [32]:
rf=RandomForestRegressor(max_depth=50, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=10, n_estimators=800,bootstrap=True)

#### Fitting the random forest regresor on our training data

In [33]:
rf.fit(X_train,y_train)

RandomForestRegressor(max_depth=50, max_features='sqrt', min_samples_leaf=4,
                      min_samples_split=10, n_estimators=800)

####  Use the Random forest's predict method on the test data

In [34]:
predictions_rf = rf.predict(X_test)

## XG Boost Regressor 

#### Creates a xgbRegressor object

In [10]:
regressor_xgb=xgboost.XGBRegressor()

#fitting without hyperparamter tuning
regressor_xgb.fit(X_train,y_train)

#predictions
pred_xgb=regressor_xgb.predict(X_test)

#### Hyperparamter Tuning for xgboost

In [11]:
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]
base_score=[0.25,0.5,0.75,1]

# Define the grid of hyperparameters to search
parameter_grid_xgb = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

#### Set up the random search with 5-fold cross validation

In [12]:

regressor=xgboost.XGBRegressor()

random_cv_xgb = RandomizedSearchCV(estimator=regressor,
            param_distributions=parameter_grid_xgb,
            cv=5, n_iter=50,
            scoring = 'neg_mean_absolute_error',n_jobs = 3,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

#train on the RandomSearchCv object to get best estimators
random_cv_xgb.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, gamma=None,
                                          gpu_id=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None, max_bin=None,
                                          m...
                                          reg_alpha=None, reg_lambda=None, ...),
                   n_iter

#### Get the best params

In [13]:
random_cv_xgb.best_params_

{'n_estimators': 900,
 'min_child_weight': 4,
 'max_depth': 2,
 'learning_rate': 0.05,
 'booster': 'gbtree',
 'base_score': 0.25}

#### Reinitializing the regressor object with the best probable estimators

In [14]:
regressor_xgb=xgboost.XGBRegressor(base_score=1, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_delta_step=0, max_depth=5,
             min_child_weight=2, missing=np.nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

#### Fitting the xgbRegressor on our training data

In [15]:
regressor_xgb.fit(X_train,y_train)

XGBRegressor(base_score=1, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type='gain', interaction_constraints='',
             learning_rate=0.15, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=5, max_leaves=0, min_child_weight=2,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

#### Fetching the predictions on our test data

In [16]:
predictions_xgb=regressor_xgb.predict(X_test)

## Perfomance Evaluation: Random Forest vs XGBOOST  
####  Random Forest (Tuned) and XGB (Tuned)  Error Evaluations

In [28]:

print('Mean Absolute Error\tMAE_XGB:', metrics.mean_absolute_error(y_test, predictions_xgb),'\t\t MAE_RF:',  metrics.mean_absolute_error(y_test, predictions_rf))
print('Mean Squared Error\tMSE_XGB:', metrics.mean_squared_error(y_test, predictions_xgb),'\t\t MSE_RF:',  metrics.mean_squared_error(y_test, predictions_rf))
print('Root Mean Squared Error\tRMSE_XGB:', np.sqrt(metrics.mean_squared_error(y_test, predictions_xgb)),'\t\t RMSE_RF:', np.sqrt(metrics.mean_squared_error(y_test, predictions_rf)))

Mean Absolute Error	MAE_XGB: 0.2993431386032668 		 MAE_RF: 0.29805002079407006
Mean Squared Error	MSE_XGB: 0.15073913971584946 		 MSE_RF: 0.14952480905597093
Root Mean Squared Error	RMSE_XGB: 0.3882513872684159 		 RMSE_RF: 0.3866843791207125


####  Random Forest (Untuned) and XGB (Ununed)  Error Evaluations

In [30]:
print('Mean Absolute Error\tMAE_XGB:', metrics.mean_absolute_error(y_test, pred_xgb),'\t\t MAE_RF:',  metrics.mean_absolute_error(y_test, pred_rf))
print('Mean Squared Error\tMSE_XGB:', metrics.mean_squared_error(y_test, pred_xgb),'\t\t MSE_RF:',  metrics.mean_squared_error(y_test, pred_rf))
print('Root Mean Squared Error\tRMSE_XGB:', np.sqrt(metrics.mean_squared_error(y_test, pred_xgb)),'\t\t RMSE_RF:', np.sqrt(metrics.mean_squared_error(y_test, pred_rf)))

Mean Absolute Error	MAE_XGB: 0.30675862042226476 		 MAE_RF: 0.30557152887941635
Mean Squared Error	MSE_XGB: 0.15981031873303844 		 MSE_RF: 0.1656810612513979
Root Mean Squared Error	RMSE_XGB: 0.3997628281031622 		 RMSE_RF: 0.4070393853810684


#### Conclusions
1. XGB Model performs more or less the same even after tuning the hyperparameters
2. We see a slight improvement in the Random Forest Regressor with hyperparameters tuning