In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
import os

In [2]:
scaler = StandardScaler()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv('drive/MyDrive/Colab Notebooks/data/telecom_churn.csv')

In [5]:
X = data.loc[:, 'AccountWeeks':].values
y = data['Churn'].values

In [None]:
# here is our pipeline for grid search
rfr_pipe = Pipeline([('zscores', StandardScaler()),
                     ('rfr', RandomForestRegressor())])

In [None]:
# now we specify which parameters we want to consider
params = [{'rfr__n_estimators': [65,75,85],
         'rfr__max_depth': [5,6,7],
         'rfr__min_samples_leaf': [2],
         'rfr__min_samples_split': [2]}]

In [None]:
# finally we perform the grid search
gs_rfr = GridSearchCV(rfr_pipe,
                      param_grid=params,
                      scoring='neg_mean_squared_error',
                      cv=5)
gs_rfr.fit(X, y)
gs_rfr.best_params_

{'rfr__max_depth': 6,
 'rfr__min_samples_leaf': 2,
 'rfr__min_samples_split': 2,
 'rfr__n_estimators': 65}

In [None]:
# and we look at our score
gs_rfr.score(X,y)

-0.04114885296727684

In [18]:
# here is our pipeline for grid search
gbr_pipe = Pipeline([('zscores', StandardScaler()),
                     ('rfr', GradientBoostingRegressor())])

In [27]:
# now we specify which parameters we want to consider
params = [{'rfr__n_estimators': [55],
         'rfr__max_depth': [4],
         'rfr__min_samples_leaf': [3],
         'rfr__min_samples_split': [3]}]

In [28]:
# finally we perform the grid search
gs_gbr = GridSearchCV(gbr_pipe,
                      param_grid=params,
                      scoring='neg_mean_squared_error',
                      cv=5)
gs_gbr.fit(X, y)
gs_gbr.best_params_

{'rfr__max_depth': 4,
 'rfr__min_samples_leaf': 3,
 'rfr__min_samples_split': 3,
 'rfr__n_estimators': 55}

In [8]:
# here is our pipeline for grid search
rfc_pipe = Pipeline([('zscores', StandardScaler()),
                     ('rfr', RandomForestClassifier())])

In [13]:
# now we specify which parameters we want to consider
params = [{'rfr__n_estimators': [90,95,100],
         'rfr__max_depth': [7],
         'rfr__min_samples_leaf': [2],
         'rfr__min_samples_split': [2]}]

In [14]:
# finally we perform the grid search
gs_rfc = GridSearchCV(rfc_pipe,
                      param_grid=params,
                      scoring='neg_mean_squared_error',
                      cv=5)
gs_rfc.fit(X, y)
gs_rfc.best_params_

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/loc

{'rfr__max_depth': 7,
 'rfr__min_samples_leaf': 2,
 'rfr__min_samples_split': 2,
 'rfr__n_estimators': 95}

In [18]:
# here is our pipeline for grid search
gbc_pipe = Pipeline([('zscores', StandardScaler()),
                     ('rfr', GradientBoostingClassifier())])

In [27]:
# now we specify which parameters we want to consider
params = [{'rfr__n_estimators': [70],
         'rfr__max_depth':[3],
         'rfr__min_samples_leaf': [5,6],
         'rfr__min_samples_split': [2]}]

In [28]:
# finally we perform the grid search
gs_gbc = GridSearchCV(gbc_pipe,
                      param_grid=params,
                      scoring='neg_mean_squared_error',
                      cv=5)
gs_gbc.fit(X, y)
gs_gbc.best_params_

{'rfr__max_depth': 3,
 'rfr__min_samples_leaf': 5,
 'rfr__min_samples_split': 2,
 'rfr__n_estimators': 70}

In [13]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

model = RandomForestRegressor(max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  model.fit(xtrain,ytrain)
  yhat = model.predict(xtest)

  mses.append(mse(ytest,yhat))
  acc.append(100*(1 - (np.sum(np.abs(np.around(yhat)-ytest))/len(yhat))))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : 0.07052201902501304
Average accuracy:92.94779809749869%
95.80838323353294%
93.41317365269461%
94.61077844311377%
92.1921921921922%
93.69369369369369%
90.69069069069069%
93.3933933933934%
93.69369369369369%
92.1921921921922%
89.7897897897898%


In [29]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

model = GradientBoostingRegressor(max_depth=4, min_samples_leaf=3, min_samples_split=3, n_estimators=55)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  model.fit(xtrain,ytrain)
  yhat = model.predict(xtest)

  mses.append(mse(ytest,yhat))
  acc.append(100*(1 - (np.sum(np.abs(np.around(yhat)-ytest))/len(yhat))))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : 0.05377227528040629
Average accuracy:93.36758914603226%
96.10778443113772%
94.311377245509%
95.50898203592814%
92.7927927927928%
94.29429429429429%
91.89189189189189%
92.49249249249249%
93.09309309309309%
93.09309309309309%
90.09009009009009%


In [30]:
def boosted(x, y, xnew):
  model1 = RandomForestClassifier(max_depth=7, min_samples_leaf=2, min_samples_split=2, n_estimators=95)
  model1.fit(x,y)
  residuals1 = y - model1.predict(x)
  model2 = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5, min_samples_split=2, n_estimators=70)
  model2.fit(x,residuals1)
  output = model1.predict(xnew) + model2.predict(xnew)
  return output 

In [31]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  yhat = boosted(xtrain, ytrain, xtest)

  mses.append(mse(ytest,yhat))
  acc.append(100*(1 - (np.sum(np.abs(np.around(yhat)-ytest))/len(yhat))))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : 0.06151031270791749
Average accuracy:93.84896872920825%
92.21556886227545%
94.91017964071857%
95.80838323353294%
94.5945945945946%
93.3933933933934%
92.1921921921922%
93.09309309309309%
93.09309309309309%
96.09609609609609%
93.09309309309309%


In [15]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

model = RandomForestClassifier(max_depth=7, min_samples_leaf=2, min_samples_split=2, n_estimators=95)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  model.fit(xtrain,ytrain)
  yhat = model.predict(xtest)

  mses.append(mse(ytest,yhat))
  acc.append(100*(1 - (np.sum(np.abs(np.around(yhat)-ytest))/len(yhat))))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : 0.062120803438168705
Average accuracy:93.78791965618313%
95.80838323353294%
94.91017964071857%
95.50898203592814%
93.3933933933934%
94.5945945945946%
91.89189189189189%
93.09309309309309%
94.8948948948949%
93.3933933933934%
90.39039039039038%


In [29]:
mses = []
acc = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

model = GradientBoostingClassifier(max_depth=3, min_samples_leaf=5, min_samples_split=2, n_estimators=70)

for idxtrain, idxtest in kf.split(X):
  xtrain = X[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = X[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  model.fit(xtrain,ytrain)
  yhat = model.predict(xtest)

  mses.append(mse(ytest,yhat))
  acc.append(100*(1 - (np.sum(np.abs(np.around(yhat)-ytest))/len(yhat))))

print("The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : "+str(np.mean(mses)))
print("Average accuracy:" + str(np.mean(acc)) + '%')
for i in acc:
  print(str(i) + '%')

The new Cross-validated Mean Squared Error for Gramfort's Locally Weighted Regression is : 0.06152020283756811
Average accuracy:93.84797971624317%
96.7065868263473%
94.311377245509%
95.20958083832335%
93.993993993994%
94.29429429429429%
92.7927927927928%
93.09309309309309%
93.993993993994%
93.3933933933934%
90.69069069069069%
