# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [1]:
import pandas as pd

# Read the dataset; the first line of the CSV is used as the header row.
bioresponce = pd.read_csv(
    'bioresponse.csv',
    sep=',',
    header=0,
)

In [2]:
# Preview the first five rows of the loaded table.
bioresponce.head(n=5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Target vector: the values of the "Activity" column.
y = bioresponce['Activity'].values

In [4]:
# Feature matrix: every column except the target.  "Activity" is dropped by
# name rather than by slicing off the first column by position, so the target
# cannot leak into the features if the column order of the CSV ever changes.
X = bioresponce.drop('Activity', axis=1)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the objects for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Строим модель и оцениваем качество

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Fit a logistic regression with default settings on the training part
# (fit() returns the estimator itself), then predict labels for the test part.
model = LogisticRegression().fit(X_train, y_train)
preds = model.predict(X_test)

In [8]:
# Inspect the container type returned by predict().
type(preds)

numpy.ndarray

In [9]:
# Floor division: `//` discards the fractional part, so this evaluates to 1.
10 // 9

1

In [10]:
# Accuracy by hand: the number of predictions equal to the true label,
# divided by the number of test objects.
print(sum(preds == y_test) / len(preds))

0.75605815832


In [11]:
# Same ratio with the denominator cast to float explicitly; the printed value
# is the same as without the cast.
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [12]:
from sklearn.metrics import accuracy_score

# accuracy_score expects (y_true, y_pred).  Accuracy itself is symmetric, but
# passing the true labels first keeps the call correct if it is later swapped
# for an asymmetric metric such as precision or recall.
print(accuracy_score(y_test, preds))

0.75605815832


### Качество на кросс-валидации

In [13]:
from sklearn.model_selection import cross_val_score

# Score of the model on each of the 5 cross-validation folds of the training
# set (the estimator's default score is used).
print(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5))

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [14]:
# Mean of the 5 per-fold cross-validation scores.
print(cross_val_score(estimator=model, X=X_train, y=y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [16]:
%%time

# Baseline comparison: fit several classifiers and report the accuracy of
# each one on the held-out test set, followed by the estimator's repr.
models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100),
    GradientBoostingClassifier(n_estimators=100),
]

# Dedicated loop names are used so that the notebook-level `model` and
# `preds` produced by the logistic-regression cells are not overwritten.
for candidate in models:
    candidate.fit(X_train, y_train)
    candidate_preds = candidate.predict(X_test)
    # accuracy_score takes the true labels first: (y_true, y_pred).
    print(accuracy_score(y_test, candidate_preds), candidate)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.697092084006 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.741518578352 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.778675282714 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
 

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке

In [17]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Задание модели

In [18]:
# Default-configured estimators to tune, keyed by their class name.
classifiers = {
    estimator_class.__name__: estimator_class()
    for estimator_class in (
        KNeighborsClassifier,
        DecisionTreeClassifier,
        LinearSVC,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
}

### Генерация сетки

In [19]:
# Hyper-parameter grids, keyed by classifier class name.
#
# `warm_start` is deliberately not part of the random-forest grid: the flag
# only matters when fit() is called repeatedly on the same estimator object,
# whereas a parameter search fits a fresh clone for every candidate, so
# searching over it would only double the number of forest fits.
grid_parameters_of_classifiers = {
    'KNeighborsClassifier': {
        'n_neighbors': np.arange(1, 10, 1),
        'metric': ['euclidean', 'cityblock'],
    },
    'DecisionTreeClassifier': {
        'max_depth': [5, None],
        'max_features': np.arange(1, 10),
        'criterion': ['gini', 'entropy'],
    },
    'LinearSVC': {
        'C': [0.001, 0.01, 0.1, 1, 10],
        'intercept_scaling': [0.001, 0.01, 0.1, 1],
        'max_iter': [100, 1000],
    },
    'RandomForestClassifier': {
        'max_depth': [1, 5, 10, 15, 20, 25, 30],
        'min_samples_leaf': [1, 3, 4, 7, 8, 10],
    },
    'GradientBoostingClassifier': {
        'learning_rate': [0.1, 0.5],
        'max_depth': [3, 5],
        'min_samples_leaf': [5, 10],
        'max_features': [1.0, 0.5, 0.1],
    },
}

### Подбор параметров и оценка качества

#### Grid search

In [20]:
%%time

# Exhaustive search: for every classifier try each combination from its grid
# and print the best cross-validated score with the winning configuration.
for name_of_classifier in classifiers:
    model_of_classifier = classifiers[name_of_classifier]
    gs = GridSearchCV(
        estimator=model_of_classifier,
        param_grid=grid_parameters_of_classifiers[name_of_classifier],
        n_jobs=-1,
    )
    gs.fit(X_train, y_train)

    print(gs.best_score_, " - ", gs.best_estimator_)

0.730202944688  -  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cityblock',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')
0.678074015121  -  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.74850775965  -  LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=0.001, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.760445682451  -  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,

#### Randomized grid search


In [21]:
%%time

# Randomized search: for every classifier evaluate only n_iter=5 parameter
# combinations sampled from its grid and print the best cross-validated score
# with the winning configuration.
for name_of_classifier, model_of_classifier in classifiers.items():
    rs = RandomizedSearchCV(
        estimator=model_of_classifier,
        param_distributions=grid_parameters_of_classifiers[name_of_classifier],
        n_iter=5,
        n_jobs=-1,
        # Seed the sampling of candidates so that reruns try the same
        # combinations (the train/test split is seeded with 42 as well).
        random_state=42,
    )
    rs.fit(X_train, y_train)

    print(rs.best_score_, " - ", rs.best_estimator_)

0.724233983287  -  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='cityblock',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
0.66812574612  -  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=6, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.74850775965  -  LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.755670513331  -  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
    

### Вывод:

time(GridSearchCV) > time(RandomizedSearchCV), 
accuracy(GridSearchCV) ≥ accuracy(RandomizedSearchCV).
При увеличении n_iter в RandomizedSearchCV: accuracy(RandomizedSearchCV) --> accuracy(GridSearchCV).

GridSearchCV делает полный перебор по сетке параметров, поэтому хорошо использовать, только если объем выборки и размер сетки не слишком велики.

RandomizedSearchCV перебирает только n_iter случайно выбранных комбинаций из сетки параметров, поэтому его можно использовать и тогда, когда сетка или выборка слишком велики для полного перебора.