In [1]:
import pandas as pd
df = pd.read_csv("College.csv")
df.columns

Index(['Private', 'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc',
       'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books',
       'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend',
       'Grad.Rate'],
      dtype='object')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
X, y = df.iloc[:, 1:].values, df.iloc[:, 0].values
# male -> 1
# female -> 0
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(X_train.shape)

(543, 17)


In [3]:
from sklearn.svm import LinearSVC,SVC
classifier = LinearSVC(penalty='l1', dual=False)
classifier.fit(X_train,y_train)

LinearSVC(dual=False, penalty='l1')

In [4]:
classifier.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l1',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [5]:
y_predict = classifier.predict(X_test)
classifier.score(X_test,y_test)

0.9230769230769231

In [6]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_predict,y_test))

[[ 56  10]
 [  8 160]]


In [7]:
svc_classifier = SVC()
svc_classifier.fit(X_train,y_train)
svc_classifier.score(X_test,y_test)

0.9230769230769231

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X, y = df.iloc[:, 1:].values, df.iloc[:, 0].values
X = scaler.fit_transform(X)
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.shape)

(621, 17)


In [9]:
classifier = SVC()
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.9423076923076923

In [10]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
# StratifiedShuffleSplit is a combination of both ShuffleSplit and StratifiedKFold. 
# Using StratifiedShuffleSplit the proportion of distribution of class labels is almost even between train and test dataset.

C_range = np.logspace(-2, 10, 13) # The numpy.logspace() function returns number spaces evenly w.r.t interval on a log scale.
gamma_range = np.logspace(-9, 3, 13) # The numpy.logspace() function returns number spaces evenly w.r.t interval on a log scale.
# Regularization parameter. The strength of the regularization is inversely proportional to C. 
# Must be strictly positive. The penalty is a squared l2 penalty.
# Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.

param_grid = dict( gamma=gamma_range,C=C_range)
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
       1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10]),
                         'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
       1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])})

In [14]:
# help(np.logspace)

In [11]:
grid.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': None,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(),
 'iid': 'deprecated',
 'n_jobs': None,
 'param_grid': {'gamma': array([1.e-09, 1.e-08, 1.e-07, 1.e-06, 1.e-05, 1.e-04, 1.e-03, 1.e-02,
         1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03, 1.e+04, 1.e+05,
         1.e+06, 1.e+07, 1.e+08, 1.e+09, 1.e+10])},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': False,
 'scoring': None,
 'verbose': 0}

In [12]:
print("The best parameters are %s with a score of %0.2f"
% (grid.best_params_, grid.best_score_))

The best parameters are {'C': 1000000.0, 'gamma': 1e-07} with a score of 0.94
