# Classification

## Data Preprocessing

In [2]:
import pandas as pd 

df = pd.read_csv('/content/churn-bigml-80.csv')
x_cols = list(df.columns[0:-1])
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
X = df[x_cols]
X = pd.get_dummies(X)
y = df['Churn'].values

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
from sklearn.preprocessing import StandardScaler

# simple preprocessing of the data 
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modeling 

In [None]:
from sklearn.linear_model import LogisticRegression

# implement basic ML baseline (one per asset)
lr = LogisticRegression()
lr.fit(X_train_scaled,y_train)
y_pred_lr_btc = lr.predict(X_test_scaled)

lr.fit(X_train_scaled,y_train)
lr.score(X_test_scaled,y_test)

0.8408239700374532

In [None]:
# implement more complex baseline (multiple output regression model)
from sklearn.multioutput import MultiOutputClassifier
import numpy as np

# define the direct multioutput model and fit it
mlr = MultiOutputClassifier(LogisticRegression())
lr.fit(X_train_scaled,y_train)
lr.score(X_test,y_test)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


0.14794007490636704

In [None]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.900749063670412

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
clf.fit(X_train_scaled,y_train)
clf.score(X_test_scaled,y_test)

0.951310861423221

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.947565543071161

In [None]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-7, hidden_layer_sizes=(5, 2), random_state=1,max_iter=10000)

clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.8520599250936329

In [None]:
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.8520599250936329

In [None]:
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(svm.SVC(),max_samples=0.5, max_features=0.5)

bagging.fit(X_train,y_train)
bagging.score(X_test,y_test)

0.8520599250936329

In [None]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train_scaled, y_train )

clf.score(X_test_scaled,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.848314606741573

In [17]:
# Gradient Boosting is the best model, let's evaluate its performance
from sklearn import metrics

y_pred = clf.predict(X_test_scaled)

recall = metrics.recall_score(y_test, y_pred)
pre = metrics.precision_score(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)

In [19]:
print('The accuracy from model is {}, precision is {}, recall is {}, and fscore is {}.'.format(acc,pre,recall,f1))

The accuracy from model is 0.951310861423221, precision is 0.9206349206349206, recall is 0.7341772151898734, and fscore is 0.8169014084507041.


The recall is calculated as the ratio between the numbers of Positive samples correctly classified as Positive to the total number of Positive samples, though a relatively low recall can cause us to doubt the credibility of the model of finding positive class in data. I personally would recommend weighted average solution for this case


# Regression

## Data Preprocessing

In [20]:
import pandas as pd 

df = pd.read_csv('/content/Employee-Attrition.csv')
x_cols = list(df.columns[0:-1])
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [21]:
X = df[x_cols]
X = pd.get_dummies(X)
y = df['YearsWithCurrManager'].values

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
from sklearn.preprocessing import StandardScaler

# simple preprocessing of the data 
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Modeling 

In [None]:
from sklearn.linear_model import LinearRegression

# implement basic ML baseline (one per asset)
lr = LinearRegression()
lr.fit(X_train_scaled,y_train)
y_pred_lr_btc = lr.predict(X_test_scaled)

lr.fit(X_train_scaled,y_train)
lr.score(X_test_scaled,y_test)

0.6830498821424538

In [None]:
from sklearn import tree

clf = tree.DecisionTreeRegressor()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.3510076000920406

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GradientBoostingRegressor()
clf.fit(X_train_scaled,y_train)
clf.score(X_test_scaled,y_test)

0.7112962674037933

In [25]:
from sklearn.ensemble import HistGradientBoostingRegressor

clf = HistGradientBoostingRegressor()
clf.fit(X_train_scaled,y_train)
clf.score(X_test_scaled,y_test)

0.7279539710775773

In [None]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='lbfgs', alpha=1e-7, hidden_layer_sizes=(5, 1), random_state=1,max_iter=10000)

clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.1082095585066375

In [None]:
from sklearn import svm

clf = svm.SVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.21768707482993196

In [None]:
from sklearn.ensemble import BaggingRegressor
bagging = BaggingRegressor(svm.SVC(),max_samples=0.5, max_features=0.5)

bagging.fit(X_train,y_train)
bagging.score(X_test,y_test)

-0.26766379559570597

In [None]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train_scaled, y_train )

clf.score(X_test_scaled,y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.5913638931771049

In [29]:
# Hence hist gradient boosting scores the best
from sklearn import metrics

y_pred = clf.predict(X_test_scaled)

In [36]:
import numpy as np
import math 

MSE = np.square(np.subtract(y_test,y_pred)).mean() 
MAE = np.abs(np.subtract(y_test,y_pred)).mean() 

RMSE = math.sqrt(MSE)

print("Root Mean Square Error is {}, Mean Square Error is {}, and Mean Absolute error is {}.".format(RMSE,MSE,MAE))

Root Mean Square Error is 1.9323930815966122, Mean Square Error is 3.7341430218024514, and Mean Absolute error is 1.2454090970971952.
