# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [3]:
import pandas as pd
import numpy as np

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [12]:
bioresponce.shape

(3751, 1777)

In [10]:
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [33]:
biocut = bioresponce[:300]

In [14]:
biocut.shape

(100, 1777)

In [34]:
y = biocut.Activity.values

Обрежем до ста строчек

In [35]:
X = biocut.iloc[:, 1:]

In [18]:
X.shape

(100, 1776)

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Строим модель и оцениваем качество

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [17]:
type(preds)

numpy.ndarray

In [18]:
10 // 9

1

In [15]:
print(sum(preds == y_test) / len(preds))

0.7560581583198708


In [13]:
print(sum(preds == y_test) / float(len(preds)))

0.7560581583198708


In [16]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.7560581583198708


### Качество на кросс-валидации

In [19]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(model, X_train, y_train, cv=5))

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [20]:
print(cross_val_score(model, X_train, y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [19]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

0.7189014539579968 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.7027463651050081 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.7447495961227787 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.7802907915993538 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impuri

## Нахождение оптимальных параметров

### KNeighborsClassifier

In [22]:
KNclassifier = KNeighborsClassifier()

In [23]:
KNclassifier.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [24]:
params_grid = {
     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
     'leaf_size': np.linspace(20,40,3),
     'n_neighbors': range(4,7),
     'p': range(1,3),
     'weights': ['uniform','distance']}

#### Grid Search

In [25]:
grid_cv=GridSearchCV(KNclassifier,params_grid)

In [37]:
%%time
grid_cv.fit(X_train,y_train)

Wall time: 1min 32s


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': array([ 20.,  30.,  40.]), 'n_neighbors': range(4, 7), 'p': range(1, 3), 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [38]:
print(grid_cv.best_score_)
print(grid_cv.best_params_)

0.568888888889
{'algorithm': 'auto', 'leaf_size': 20.0, 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}


#### Randomized Grid Search

In [39]:
randomized_grid_cv=RandomizedSearchCV(KNclassifier, params_grid, n_iter=50)

In [40]:
%%time
randomized_grid_cv.fit(X_train,y_train)

Wall time: 31.7 s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params=None, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size': array([ 20.,  30.,  40.]), 'n_neighbors': range(4, 7), 'p': range(1, 3), 'weights': ['uniform', 'distance']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [41]:
print(randomized_grid_cv.best_score_)
print(randomized_grid_cv.best_params_)

0.568888888889
{'weights': 'distance', 'p': 1, 'n_neighbors': 6, 'leaf_size': 20.0, 'algorithm': 'brute'}


### DecisionTreeClassifier

In [42]:
DTclassifier = DecisionTreeClassifier()

In [43]:
DTclassifier.get_params

<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')>

In [46]:
DTparams_grid = {
     'splitter': ['best', 'random'],
     'max_depth': range(5,10),
     'criterion': ['gini','entropy']}

#### Grid Search

In [47]:
DTgrid_cv=GridSearchCV(DTclassifier,DTparams_grid)

In [48]:
%%time
DTgrid_cv.fit(X_train,y_train)

Wall time: 4.18 s


GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'splitter': ['best', 'random'], 'max_depth': range(5, 10), 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
print(DTgrid_cv.best_score_)
print(DTgrid_cv.best_params_)

0.662222222222
{'criterion': 'gini', 'max_depth': 8, 'splitter': 'random'}


#### Randomized Grid Search

In [54]:
DTrandomized_grid_cv=RandomizedSearchCV(DTclassifier, DTparams_grid, n_iter=10)

In [55]:
%%time
DTrandomized_grid_cv.fit(X_train,y_train)

Wall time: 2.44 s


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'splitter': ['best', 'random'], 'max_depth': range(5, 10), 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [56]:
print(DTrandomized_grid_cv.best_score_)
print(DTrandomized_grid_cv.best_params_)

0.697777777778
{'splitter': 'random', 'max_depth': 6, 'criterion': 'entropy'}
