In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
%matplotlib inline

In [2]:
df_col= pd.read_csv('College.csv')

In [3]:
df_col.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [4]:
#Label Encoding refers to converting the labels into a numeric form so as to convert them into the machine-readable form.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
label=LabelEncoder()
feature=df_col.drop('Private', axis=1)
target=df_col['Private']
target=label.fit_transform(target)
x_train,x_test,y_train,y_test= train_test_split(feature, target, random_state=1, test_size=0.2)

In [15]:
from sklearn.svm import LinearSVC, SVC
linear=LinearSVC(max_iter=1000)
linear.fit(x_train,y_train)
y_pred=linear.predict(x_test)
metrics.accuracy_score(y_test,y_pred)



0.8910256410256411

In [16]:
metrics.confusion_matrix(y_test,y_pred)

array([[ 23,  16],
       [  1, 116]], dtype=int64)

In [18]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73        39
           1       0.88      0.99      0.93       117

    accuracy                           0.89       156
   macro avg       0.92      0.79      0.83       156
weighted avg       0.90      0.89      0.88       156



### Recall is from total no of actual positive how many were predicted right (23/23+16)=0.5897~0.59
### precision is from toal no of predicted positive how many are actual true (23/23+1)= 0.9583~0.96

In [24]:
svc=SVC()
svc.fit(x_train,y_train)
y_predict=svc.predict(x_test)
print(metrics.accuracy_score(y_test,y_predict))
print(metrics.confusion_matrix(y_test,y_predict))

0.9230769230769231
[[ 33   6]
 [  6 111]]


In [23]:
print(metrics.classification_report(y_test,y_predict))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85        39
           1       0.95      0.95      0.95       117

    accuracy                           0.92       156
   macro avg       0.90      0.90      0.90       156
weighted avg       0.92      0.92      0.92       156



StratifiedShuffleSplit is a combination of both ShuffleSplit and StratifiedKFold. 
Using StratifiedShuffleSplit the proportion of distribution of class labels is almost even between train and test dataset. 
The major difference between StratifiedShuffleSplit and StratifiedKFold (shuffle=True) is that in StratifiedKFold, the dataset is shuffled only once in the beginning and then split into the specified number of folds. This discards any chances of overlapping of the train-test sets. 

However, in StratifiedShuffleSplit the data is shuffled each time before the split is done and this is why there’s a greater chance that overlapping might be possible between train-test sets. 

ShuffleSplit will randomly sample your entire dataset during each iteration to generate a training set and a test set.

KFold will divide your data set into prespecified number of folds, and every sample must be in one and only one fold. A fold is a subset of your dataset.

In KFold, during each round you will use one fold as the test set and all the remaining folds as your training set. However, in ShuffleSplit, during each round n you should only use the training and test set from iteration n. As your data set grows, cross validation time increases, making shufflesplits a more attractive alternate. If you can train your algorithm, with a certain percentage of your data as opposed to using all k-1 folds, ShuffleSplit is an attractive option.

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
c_range=np.logspace(-2,10,13)
gamma_range=np.logspace(-9,3,13)
param_grid=dict(gamma=gamma_range,C=c_range)
grid=GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(x_train,y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                         'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
       1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})

In [27]:
print("The best parameters are %s with a score of %0.2f"% (grid.best_params_, grid.best_score_))

The best parameters are {'C': 1000.0, 'gamma': 1e-09} with a score of 0.93
