DESCRIPTION

Load the data from “college.csv” that has attributes collected about private and public colleges for a particular year. Predict the private/public status of the colleges from other attributes. Use LabelEncoder to encode the target variable to numerical form. Split the data such that 20% of the data is set aside for testing. Fit a linear SVM from scikit-learn and observe the accuracy. [Hint: Use Linear SVC] Preprocess the data using StandardScalar and fit the same model again. Observe the change in accuracy.  Use scikit-learn’s gridsearch to select the best hyper-parameter for a non-linear SVM. Identify the model with the best score and its parameters. [Hint: Refer to model_selection module of Scikit learn]
 

**Objective**: Employ SVM from scikit-learn for binary classification and measure the impact of preprocessing data and hyper-parameter search using grid search.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('College.csv')
df.head()

Unnamed: 0,Private,Apps,Accept,Enroll,Top10perc,Top25perc,F.Undergrad,P.Undergrad,Outstate,Room.Board,Books,Personal,PhD,Terminal,S.F.Ratio,perc.alumni,Expend,Grad.Rate
0,Yes,1660,1232,721,23,52,2885,537,7440,3300,450,2200,70,78,18.1,12,7041,60
1,Yes,2186,1924,512,16,29,2683,1227,12280,6450,750,1500,29,30,12.2,16,10527,56
2,Yes,1428,1097,336,22,50,1036,99,11250,3750,400,1165,53,66,12.9,30,8735,54
3,Yes,417,349,137,60,89,510,63,12960,5450,450,875,92,97,7.7,37,19016,59
4,Yes,193,146,55,16,44,249,869,7560,4120,800,1500,76,72,11.9,2,10922,15


In [3]:
# check for null variables
df.isna().sum()

Private        0
Apps           0
Accept         0
Enroll         0
Top10perc      0
Top25perc      0
F.Undergrad    0
P.Undergrad    0
Outstate       0
Room.Board     0
Books          0
Personal       0
PhD            0
Terminal       0
S.F.Ratio      0
perc.alumni    0
Expend         0
Grad.Rate      0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Private      777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(16), object(1)
memory usage: 

In [7]:
# create X and y variables
X = df.drop(columns=['Private'])
y = df['Private']
print(X.shape)
print(y.shape)

(777, 17)
(777,)


In [8]:
# enconde variable y to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [9]:
# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=123)

In [10]:
# fit linear SVM and observe accuracy
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# predictions on test data
preds = svm_model.predict(X_test)

# confusion matrix
print(confusion_matrix(y_test, preds))

# classification report
print(classification_report(y_test, preds))

[[ 19  18]
 [  0 119]]
              precision    recall  f1-score   support

           0       1.00      0.51      0.68        37
           1       0.87      1.00      0.93       119

    accuracy                           0.88       156
   macro avg       0.93      0.76      0.80       156
weighted avg       0.90      0.88      0.87       156





In [11]:
# standardize X data set and retrain the model (data preprocessing)
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[-3.46881819e-01, -3.21205453e-01, -6.35089011e-02, ...,
        -8.67574189e-01, -5.01910084e-01, -3.18251941e-01],
       [-2.10884040e-01, -3.87029908e-02, -2.88584214e-01, ...,
        -5.44572203e-01,  1.66109850e-01, -5.51261842e-01],
       [-4.06865631e-01, -3.76317928e-01, -4.78121319e-01, ...,
         5.85934748e-01, -1.77289956e-01, -6.67766793e-01],
       ...,
       [-2.33895071e-01, -4.23771558e-02, -9.15087008e-02, ...,
        -2.21570217e-01, -2.56241250e-01, -9.59029170e-01],
       [ 1.99171118e+00,  1.77256262e-01,  5.78332661e-01, ...,
         2.12019418e+00,  5.88797079e+00,  1.95359460e+00],
       [-3.26765760e-03, -6.68715889e-02, -9.58163623e-02, ...,
         4.24433755e-01, -9.87115613e-01,  1.95359460e+00]])

In [12]:
# split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=123)

In [13]:
# fit model on training set
svm_model.fit(X_train, y_train)

# predictions on test set
svm_model.predict(X_test)

# confusion matrix
print(confusion_matrix(y_test, preds))

# classification report
print(classification_report(y_test, preds))

[[ 19  18]
 [  0 119]]
              precision    recall  f1-score   support

           0       1.00      0.51      0.68        37
           1       0.87      1.00      0.93       119

    accuracy                           0.88       156
   macro avg       0.93      0.76      0.80       156
weighted avg       0.90      0.88      0.87       156





In [29]:
# use grid search to find best hyper-parameter for model
#grid_params = {'kernel':('linear'), 'C':[1, 10]}
grid_params = {'C': [0.1,1,10,100]}

In [30]:
grid = GridSearchCV(svm_model, grid_params)
grid.fit(X_train, y_train)



GridSearchCV(estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10, 100]})

In [31]:
# predictions from grid model
grids_preds = grid.predict(X_test)

# confusion_matrix
print(confusion_matrix(y_test, grids_preds))

# classiclassification_report
print(classification_report(y_test, grids_preds))

[[ 31   6]
 [  3 116]]
              precision    recall  f1-score   support

           0       0.91      0.84      0.87        37
           1       0.95      0.97      0.96       119

    accuracy                           0.94       156
   macro avg       0.93      0.91      0.92       156
weighted avg       0.94      0.94      0.94       156

