In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.datasets import load_digits
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

The data is loaded below.  Note that the handwritten digit data is already split into features and target (`digits`, `labels`).

In [None]:
# Read the telecom churn table from a CSV file addressed relative to the
# notebook's working directory.
churn = pd.read_csv('../content/telecom_churn.csv')
# return_X_y=True makes load_digits hand back a (features, target) pair
# instead of a single dataset object.
digits, labels = load_digits(return_X_y=True)

In [None]:
# Peek at the first five rows of the churn table.
churn.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [None]:
# Count the missing values in each column (isna is the same check as isnull).
churn.isna().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [None]:
# Dimensions of the churn table as a (rows, columns) tuple.
churn.shape

(3333, 20)

In [None]:
# Number of customers recorded for each state, most frequent first.
churn['State'].value_counts()

WV    106
MN     84
NY     83
AL     80
WI     78
OH     78
OR     78
WY     77
VA     77
CT     74
MI     73
ID     73
VT     73
TX     72
UT     72
IN     71
MD     70
KS     70
NC     68
NJ     68
MT     68
CO     66
NV     66
WA     66
RI     65
MA     65
MS     65
AZ     64
FL     63
MO     63
NM     62
ME     62
ND     62
NE     61
OK     61
DE     61
SC     60
SD     60
KY     59
IL     58
NH     56
AR     55
GA     54
DC     54
HI     53
TN     53
AK     52
LA     51
PA     45
IA     44
CA     34
Name: State, dtype: int64

In [None]:
# Frequency of each area code among the customers.
churn.loc[:, 'Area code'].value_counts()

415    1655
510     840
408     838
Name: Area code, dtype: int64

In [None]:
# How many customers do / do not have a voice mail plan.
churn.loc[:, 'Voice mail plan'].value_counts()

No     2411
Yes     922
Name: Voice mail plan, dtype: int64

In [None]:
# How many customers do / do not have an international plan.
churn.loc[:, 'International plan'].value_counts()

No     3010
Yes     323
Name: International plan, dtype: int64

In [None]:
# Distribution of the Churn column, which is used as the prediction target below.
churn.Churn.value_counts()

False    2850
True      483
Name: Churn, dtype: int64

In [None]:
# Expand the text-valued columns (State and the two plan flags) into indicator
# columns; numeric and boolean columns pass through unchanged.
coded = pd.get_dummies(data=churn)

In [None]:
# Inspect the first five rows of the encoded table.
coded.head(n=5)

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,...,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,International plan_No,International plan_Yes,Voice mail plan_No,Voice mail plan_Yes
0,128,415,25,265.1,110,45.07,197.4,99,16.78,244.7,...,0,0,0,0,0,0,1,0,0,1
1,107,415,26,161.6,123,27.47,195.5,103,16.62,254.4,...,0,0,0,0,0,0,1,0,0,1
2,137,415,0,243.4,114,41.38,121.2,110,10.3,162.6,...,0,0,0,0,0,0,1,0,1,0
3,84,408,0,299.4,71,50.9,61.9,88,5.26,196.9,...,0,0,0,0,0,0,0,1,1,0
4,75,415,0,166.7,113,28.34,148.3,122,12.61,186.9,...,0,0,0,0,0,0,0,1,1,0


In [None]:
# Pull the Churn column out as the target; everything else becomes the
# predictor matrix, which is previewed as the cell output.
y = coded['Churn']
X = coded.drop('Churn', axis=1)
X.head(n=5)

Unnamed: 0,Account length,Area code,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,...,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,International plan_No,International plan_Yes,Voice mail plan_No,Voice mail plan_Yes
0,128,415,25,265.1,110,45.07,197.4,99,16.78,244.7,...,0,0,0,0,0,0,1,0,0,1
1,107,415,26,161.6,123,27.47,195.5,103,16.62,254.4,...,0,0,0,0,0,0,1,0,0,1
2,137,415,0,243.4,114,41.38,121.2,110,10.3,162.6,...,0,0,0,0,0,0,1,0,1,0
3,84,408,0,299.4,71,50.9,61.9,88,5.26,196.9,...,0,0,0,0,0,0,0,1,1,0
4,75,415,0,166.7,113,28.34,148.3,122,12.61,186.9,...,0,0,0,0,0,0,0,1,1,0


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified even though Churn is imbalanced
# (2850 False vs 483 True) -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Grid-search a logistic-regression classifier on the churn training set.
#
# The grid is a list of sub-grids so that only solver/penalty pairs supported
# by scikit-learn are fitted: 'newton-cg', 'lbfgs' and 'sag' accept only the
# l2 penalty, whereas 'liblinear' and 'saga' accept both l1 and l2.  A single
# dict crossing every solver with 'l1' produces fits that fail and are scored
# as NaN, which goes unnoticed because warnings are suppressed in this notebook.
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'sag']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'penalty': ['l1', 'l2'],
     'solver': ['liblinear', 'saga']},
]

grid_pipeline = GridSearchCV(LogisticRegression(), param_grid)
grid_pipeline.fit(X_train, y_train)

print('Parameters')
print(grid_pipeline.best_params_)
print('')
# best_score_ is the mean cross-validated score of the winning candidate on the
# training data; it is not a score on X_test.
print('Best Test Score')
print(grid_pipeline.best_score_)
print('')
# Score of the refit best estimator on the full training set.
print('Best Training Score')
print(grid_pipeline.score(X_train,y_train))
print('')
print('Best Estimator')
print(grid_pipeline.best_estimator_)
print('')
# Mean fit time averaged over every candidate in the grid.
print('Avg Fit Time')
print(grid_pipeline.cv_results_['mean_fit_time'].mean())

Parameters
{'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}

Best Test Score
0.8623999411824173

Best Training Score
0.8705529361337334

Best Estimator
LogisticRegression(C=1, solver='newton-cg')

Avg Fit Time
0.1653359249659947


In [None]:
# Grid-search the number of neighbours (1..30) for a k-NN classifier using
# 10-fold cross-validation scored by accuracy.
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', verbose=1)
grid_search = grid.fit(X_train, y_train)

print('Parameters')
print(grid_search.best_params_)
print('')
# best_score_ is the mean cross-validated accuracy of the winning candidate,
# not a score on X_test.
print('Best Testing Score')
print(grid_search.best_score_)
print('')
print('Best Training Score')
print(grid_search.score(X_train,y_train))
print('')
print('Best Estimator')
# Report this search's own estimator; printing grid_pipeline.best_estimator_
# here would show the logistic-regression model from the previous search.
print(grid_search.best_estimator_)
print('')
print('Avg Fit Time')
print(grid_search.cv_results_['mean_fit_time'].mean())

Fitting 10 folds for each of 30 candidates, totalling 300 fits
Parameters
{'n_neighbors': 11}

Best Testing Score
0.8757088881552402

Best Training Score
0.8812687526789541

Best Estimator
LogisticRegression(C=1, solver='newton-cg')

Avg Fit Time
0.009410418669382732


In [None]:
# Search tree depth, split/leaf size limits and the impurity criterion for a
# decision tree, using 10-fold cross-validation.
params = dict(
    max_depth=list(range(2, 11)),
    min_samples_split=[2, 3, 4],
    criterion=['gini', 'entropy'],
    min_samples_leaf=[2, 3, 4],
)
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), params, verbose=1, cv=10)
grid_search = grid.fit(X_train, y_train)

# (heading, value) pairs, printed in order with a blank line between sections.
report = [
    ('Parameters', grid_search.best_params_),
    ('Best Testing Score', grid_search.best_score_),
    ('Best Training Score', grid_search.score(X_train, y_train)),
    ('Best Estimator', grid_search.best_estimator_),
    ('Avg Fit Time', grid_search.cv_results_['mean_fit_time'].mean()),
]
for position, (heading, value) in enumerate(report):
    if position:
        print('')
    print(heading)
    print(value)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits
Parameters
{'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 3, 'min_samples_split': 2}

Best Testing Score
0.9417171050218259

Best Training Score
0.9661380197171024

Best Estimator
DecisionTreeClassifier(max_depth=7, min_samples_leaf=3, random_state=42)

Avg Fit Time
0.026365057333016105


**TASK 2**: Recognizing Handwritten Digits

Suppose you are tasked with training a model to recognize handwritten digits.  Which of your classifiers would you use here, and why?  Again, be sure to consider the balance of classes, speed of training, and importance of interpretability.



In [None]:
# Load the digits dataset as a single object so that both `digits.images` and
# `digits.target` are available to the cells below.  load_digits is imported in
# the notebook's top import cell.
# NOTE: this rebinds `digits`, which the data-loading cell had set to the
# feature array returned by load_digits(return_X_y=True).
digits = load_digits()
# Optional preview of the first sample (left disabled):
# print(digits.data.shape)
# plt.imshow(digits.images[0].reshape(8, 8))
# plt.title('This is a handwritten 0.')

In [None]:
# Create the SVM classifier and flatten every digit image into one feature row.
classifier = SVC(gamma=0.4, C=100)
x, y = digits.images, digits.target
n_samples = len(digits.images)
# Keep one row per image.  reshape((1, -1)) would collapse the whole dataset
# into a single row, leaving x[:-2] empty and making the number of rows in x
# disagree with the number of labels in y.
x = x.reshape((n_samples, -1))

print("before reshape:" + str(digits.images[0]))
print("After reshape" + str(x[0]))

before reshape:[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
After reshape[ 0.  0.  5. ... 12.  1.  0.]


In [None]:
# Train on every sample except the last two, then predict each held-out digit
# and display its image.  Requires `x` to hold one flattened row per image.
classifier.fit(x[:-2], y[:-2])

for held_out in (-2, -1):
    # predict() needs a 2-D (n_samples, n_features) array; indexing with a
    # one-element list keeps the leading sample axis, whereas x[held_out]
    # alone would pass a 1-D row and raise ValueError.
    print('Prediction:', classifier.predict(x[[held_out]]))
    plt.imshow(digits.images[held_out], cmap=plt.cm.gray_r, interpolation='nearest')
    plt.show()

ValueError: ignored

In [None]:
# Reserve 20% of the digit samples for testing, keeping the label proportions
# equal in both parts (stratify) and fixing the seed for repeatability.
# Note: y_train / y_test are rebound here to the digit labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=77,
    stratify=y,
)

In [None]:
# Candidate SVC settings: C, gamma and the kernel type.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.0001, 0.001, 0.1, 1],
    kernel=['rbf', 'poly'],
)

# Support vector classifier with probability estimates switched on.
svc = SVC(probability=True)

# Exhaustive search over param_grid using GridSearchCV's default settings.
model = GridSearchCV(estimator=svc, param_grid=param_grid)

In [None]:
# Run the grid search on the digits training split.
model.fit(X=x_train, y=y_train)

ValueError: ignored