# Линейные модели, SVM и деревья решений

## Задание 

1. Выберите набор данных (датасет) для решения задачи классификации или регресии.
2. В случае необходимости проведите удаление или заполнение пропусков и кодирование категориальных признаков.
3. С использованием метода train_test_split разделите выборку на обучающую и тестовую.
4. Обучите 1) одну из линейных моделей, 2) SVM и 3) дерево решений. Оцените качество моделей с помощью трех подходящих для задачи метрик. Сравните качество полученных моделей.
5. Произведите для каждой модели подбор одного гиперпараметра с использованием GridSearchCV и кросс-валидации.
6. Повторите пункт 4 для найденных оптимальных значений гиперпараметров. Сравните качество полученных моделей с качеством моделей, полученных в пункте 4. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [2]:
data=pd.read_csv("Admission_Predict.csv")
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


### Проверим наличие и количество <br>пропущенных значений 

In [3]:
data.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

### Проверим наличие категориальных данных

In [4]:
cats = [col for col in data.columns if 
        data[col].dtype=="object"]
print(len(cats))

0


### Разделим данные на тренировочную и тестовую выборку 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    data, data['Research'], test_size=0.2, random_state=42)

### Обучим модели и проверим метриками 

#### Метод стохастического градиентного спуска

In [6]:
sgd = SGDClassifier().fit(X_train, y_train)



In [7]:
res_SGD = sgd.predict(X_test)
print(accuracy_score(y_test, res_SGD))
print(precision_score(y_test, res_SGD))
print(recall_score(y_test, res_SGD))

0.4125
0.0
0.0


  'precision', 'predicted', average, warn_for)


#### Метод опорных векторов 

In [8]:
sv = SVC(gamma='auto').fit(X_train, y_train)

In [9]:
res_SVC = sv.predict(X_test)
print(accuracy_score(y_test, res_SVC))
print(precision_score(y_test, res_SVC))
print(recall_score(y_test, res_SVC))

0.6125
0.6212121212121212
0.8723404255319149


#### Деревья решений

In [10]:
DT = DecisionTreeClassifier(
    random_state=1, 
    max_depth=0.75).fit(X_train, y_train)

In [11]:
res_DT = DT.predict(X_test)
print(accuracy_score(y_test, res_DT))
print(precision_score(y_test, res_DT))
print(recall_score(y_test, res_DT))

0.5875
0.5875
1.0


### Обучим  модели на кросс валидации

In [12]:
scores_sgd = cross_val_score(SGDClassifier(),
                             X_train, y_train, 
                             cv=2)
np.mean(scores_sgd)



0.49062500000000003

In [13]:
scores_svc = cross_val_score(SVC(gamma='auto'), 
                         X_train, y_train, cv=2)
np.mean(scores_svc)

0.6156250000000001

In [14]:
scores_dt = cross_val_score(DecisionTreeClassifier(), 
                         X_train, y_train, cv=2)
np.mean(scores_dt)

1.0

### Подберем гиперпараметры и обучим модели с использованием гиперпараметров

#### Стохастический градиентный спуск 

In [15]:
parameters = {'alpha':[0.5,0.4,0.3,0.2,0.1]}
clf_gs_sgd = GridSearchCV(SGDClassifier(), 
                          parameters, cv=2, 
                          scoring='accuracy')
clf_gs_sgd.fit(X_train, y_train)



GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [16]:
clf_gs_sgd.best_params_

{'alpha': 0.4}

In [17]:
sgd_new = SGDClassifier(
    alpha=0.5).fit(X_train, 
                   y_train)



In [18]:
res_sgd_new = sgd_new.predict(X_test)
print(accuracy_score(y_test, res_sgd_new))
print(precision_score(y_test, res_sgd_new))
print(recall_score(y_test, res_sgd_new))

0.525
0.5633802816901409
0.851063829787234


#### Метод опорных векторов 

In [19]:
parameters = {'gamma':[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]}
clf_gs_svm_svc = GridSearchCV(SVC(), parameters, cv=2, scoring='accuracy')
clf_gs_svm_svc.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'gamma': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [20]:
clf_gs_svm_svc.best_params_

{'gamma': 0.1}

In [21]:
svm_svc_new = SVC(gamma=0.1).fit(X_train, 
                                 y_train)

In [22]:
res_svc_new = svm_svc_new.predict(X_test)
print(accuracy_score(y_test, res_svc_new))
print(precision_score(y_test, res_svc_new))
print(recall_score(y_test, res_svc_new))

0.6125
0.625
0.851063829787234


#### Деревья решений

In [23]:
parameters = {'min_impurity_decrease':[0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1]}
clf_gs_decision_tree = GridSearchCV(DecisionTreeClassifier(), parameters, cv=2, scoring='accuracy')
clf_gs_decision_tree.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_impurity_decrease': [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [24]:
clf_gs_decision_tree.best_params_

{'min_impurity_decrease': 0.4}

In [25]:
decision_tree_new = DecisionTreeClassifier(
    random_state=1, 
    min_impurity_decrease=0.4, 
    max_depth=0.75).fit(X_train, y_train)

In [26]:
res_dt_new = decision_tree_new.predict(X_test)
print(accuracy_score(y_test, res_dt_new))
print(precision_score(y_test, res_dt_new))
print(recall_score(y_test, res_dt_new))

0.5875
0.5875
1.0
