In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, norm, gamma
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [2]:
!pip install graphviz



In [53]:
!pip install sklearn



In [2]:
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

In [3]:
seed = 42
np.random.seed(seed)

In [4]:
data = pd.read_csv('train.csv')

In [5]:
y_raw = data['Survived']
X_raw = data.drop(['Survived'], axis=1)

In [6]:
X_raw.shape, y_raw.shape

((891, 11), (891,))

In [8]:
drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']

In [9]:
X_raw.drop(columns=drop, inplace=True)

In [10]:
def binarize(x):
    if x == 'female':
        return 1
    else:
        return 0

In [11]:
X_raw['Sex'] = X_raw['Sex'].apply(lambda x: binarize(x))

In [12]:
X_raw['Age'].fillna(X_raw['Age'].median(), inplace=True)

In [13]:
X, X_test, y, y_test = train_test_split(X_raw, y_raw, test_size=0.15, 
                                                    random_state=seed)

In [14]:
X.shape, X_test.shape, y.shape, y_test.shape

((757, 6), (134, 6), (757,), (134,))

In [15]:
y.isna().sum()

0

In [16]:
kf = KFold(n_splits=5, shuffle=True)

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
#комментарий после разбора
#split делает то о чем мы говорили на cross-validation - разбиваем выборку не просто на 80 и 20, а делим ее на n_splits = 5
#и после перемешиваем эти части поочередно применяя их в train и test, таким образом если мы возьмем n_splits = 5 то получим
#5 комбинаций/проходок для которых (train, test)={[1234;5], [2345;1], [1345;2], [1245;3], [1235;4]}
#
#
ex = {'t1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6], 't2':[11, 1, 2, 14, 4, 5, 7, 16, 9, 57, 3, 8]}
ex = pd.DataFrame(ex)
kfex = KFold(n_splits=3)
for train_index, test_index in kfex.split(ex):
     print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [ 4  5  6  7  8  9 10 11] TEST: [0 1 2 3]
TRAIN: [ 0  1  2  3  8  9 10 11] TEST: [4 5 6 7]
TRAIN: [0 1 2 3 4 5 6 7] TEST: [ 8  9 10 11]


In [21]:
#enumerate действует следующим образом - для любого списка\массива\прочее он перед каждым элементом проставляет 
#индекс\порядковый номер начиная с 0
#в данном цикле он train_index передает номера а test_index передает индексы двух выборок: train и test 
#(в виде двух списков внутри списка)
#если бы в цикле была только одна переменная index то в нее он бы передавал список с тремя элементами: 
#(номер, array[индексы train], array[индексы test])
#
ex = {'t1': [1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 7, 6], 't2':[11, 1, 2, 14, 4, 5, 7, 16, 9, 57, 3, 8]}
ex = pd.DataFrame(ex)
kfex = KFold(n_splits=3)
for train_index, test_index in enumerate(kfex.split(ex)):
     print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: 0 TEST: (array([ 4,  5,  6,  7,  8,  9, 10, 11]), array([0, 1, 2, 3]))
TRAIN: 1 TEST: (array([ 0,  1,  2,  3,  8,  9, 10, 11]), array([4, 5, 6, 7]))
TRAIN: 2 TEST: (array([0, 1, 2, 3, 4, 5, 6, 7]), array([ 8,  9, 10, 11]))


In [22]:
result = pd.DataFrame()
for num, idxs in enumerate(kf.split(X)):
    print(f'Fold № {num}')
    train_index = idxs[0]
    val_index = idxs[1]
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    dummy = {'Fold №': int(num), 'Accuracy': model.score(X_val, y_val)}
    result = result.append(dummy, ignore_index=True) 
    

Fold № 0
Fold № 1
Fold № 2
Fold № 3
Fold № 4


In [23]:
#чтобы стоблец Fold № выглядел не как 0.0, 1.0 а просто 0 и 1
result['Fold №'] = result['Fold №'].astype(int)

In [24]:
result

Unnamed: 0,Accuracy,Fold №
0,0.763158,0
1,0.782895,1
2,0.827815,2
3,0.788079,3
4,0.781457,4


In [25]:
print(f'mean {result.Accuracy.mean()}, std {result.Accuracy.std()}')
print(f'{result.Accuracy.mean()} \u00B1 {result.Accuracy.std()}')

mean 0.7886807249912862, std 0.02381645802680093
0.7886807249912862 ± 0.02381645802680093


In [26]:
X.shape, X_test.shape

((757, 6), (134, 6))

In [27]:
model = DecisionTreeClassifier(criterion='entropy', 
                               max_depth=3, min_samples_leaf=10,
                              min_samples_split=10)
model.fit(X, y)


plt.figure(figsize=(13, 5))
tree.plot_tree(model, fontsize=10, feature_names=list(X), filled=True);
#need upgrade skleran from 0.0 to 0.21

AttributeError: module 'sklearn.tree' has no attribute 'plot_tree'

<Figure size 936x360 with 0 Axes>

In [28]:
X.shape, X_train.shape, X_val.shape, X_test.shape

((757, 6), (606, 6), (151, 6), (134, 6))

In [None]:
#ex1 test models

In [29]:
#тест модели RandomForest for GridSearchCV
#%%time
parameters = {'n_estimators': range(1, 100, 10), 'max_features': range(1, 7), 'max_depth': range(1, 10, 2)} #будет 20 запусков
#model = RandomForestClassifier(n_jobs=-1) 
model = RandomForestClassifier()
clf = GridSearchCV(model, parameters)
clf.fit(X, y)
#n_jobs - numbers of processor to run in parallel

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(1, 100, 10), 'max_features': range(1, 7), 'max_depth': range(1, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
clf.cv_results_['mean_test_score'].max() 
#parameters = {'n_estimators': range(1, 100, 10), 'max_features': range(1, 7), 'max_depth': range(1, 10, 2)}

0.8256274768824307

In [31]:
#тест модели LogisticRegression for GridSearchCV
parameters = {'C': np.arange(.1, 100, .1), 'penalty':['l2', 'l1']}
model = LogisticRegression()
clf = GridSearchCV(model, parameters)
clf.fit(X, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([ 0.1,  0.2, ..., 99.8, 99.9]), 'penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [32]:
clf.cv_results_['mean_test_score'].max() 
#parameters = {'C': np.arange(.1, 10,.1}

0.7992073976221928

In [33]:
#тест модели RandomForest for RandomizedSearchCV
distributions = dict(n_estimators = range(1, 100, 10), max_features = range(1, 7), max_depth = range(1, 10, 2))
model = RandomForestClassifier()
clf = RandomizedSearchCV(model, distributions, random_state=seed, cv=5)
clf.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': range(1, 100, 10), 'max_features': range(1, 7), 'max_depth': range(1, 10, 2)},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [34]:
clf.cv_results_['mean_test_score'].max() 
#distributions = dict(n_estimators = range(1, 100, 10), max_features = range(1, 7), max_depth = range(1, 10, 2))

0.8229854689564069

In [35]:
#тест модели LogisticRegression for RandomizedSearchCV
distributions = dict(C=uniform(loc=0, scale=4))
model = LogisticRegression()
clf = RandomizedSearchCV(model, distributions, random_state=seed, cv=5)
clf.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021EF76186C8>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [36]:
clf.cv_results_['mean_test_score'].max() 
#distributions = dict(C=uniform(loc=0, scale=4)) #работает лучше без пенальти

0.8018494055482166

In [None]:
#ex2 encoding 

In [37]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [47]:
drop = ['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
X_emb = data

In [48]:
X_emb = data.drop(columns=drop, inplace=True)

KeyError: "['PassengerId' 'Survived' 'Name' 'Ticket' 'Cabin'] not found in axis"

In [51]:
X_emb.shape

(891, 7)

In [55]:
X_emb.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [58]:
X_emb['Sex'] = X_emb['Sex'].apply(lambda x: binarize(x))
X_emb['Age'].fillna(X_emb['Age'].median(), inplace=True)
X_emb['Embarked'].fillna('S', inplace=True) #заменили на S так как NaN всего две а S более 600

In [59]:
X_emb.isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [65]:
label_en = LabelEncoder()
X_emb['Embarked'] = label_en.fit_transform(X_emb['Embarked'])

In [66]:
X_emb.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,22.0,1,0,7.25,2
1,1,1,38.0,1,0,71.2833,0
2,3,1,26.0,0,0,7.925,2
3,1,1,35.0,1,0,53.1,2
4,3,0,35.0,0,0,8.05,2


In [77]:
#np.reshape(X_emb['Embarked'], (891,-1)).astype(int)

Exception: Data must be 1-dimensional

In [87]:
one_hot_en = OneHotEncoder(sparse=False)
X_emb_mat = pd.DataFrame(one_hot_en.fit_transform(X_emb['Embarked'][:,np.newaxis]))

In [89]:
X_emb_mat.head()

Unnamed: 0,0,1,2
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0


In [103]:
X_emb['EmbS'] = X_emb_mat[0]
X_emb['EmbC'] = X_emb_mat[1]
X_emb['EmbG'] = X_emb_mat[2]

In [109]:
X_emb = X_emb.drop(['Embarked'], axis=1)
X_emb.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,EmbS,EmbC,EmbG
0,3,0,22.0,1,0,7.25,0.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,1.0,0.0,0.0
2,3,1,26.0,0,0,7.925,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,0.0,0.0,1.0
4,3,0,35.0,0,0,8.05,0.0,0.0,1.0


In [110]:
X_emb_train, X_emb_test, y, y_test = train_test_split(X_emb, y_raw, test_size=0.15, 
                                                    random_state=seed)

In [115]:
#тест модели RandomForest for GridSearchCV
parameters = {'n_estimators': range(1, 100, 10), 'max_features': range(1, 10), 'max_depth': range(1, 10, 2)} #будет 20 запусков
model = RandomForestClassifier()
clf = GridSearchCV(model, parameters)
clf.fit(X_emb_train, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': range(1, 100, 10), 'max_features': range(1, 10), 'max_depth': range(1, 10, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [116]:
clf.cv_results_['mean_test_score'].max() 
#parameters = {'n_estimators': range(1, 100, 10), 'max_features': range(1, 7), 'max_depth': range(1, 10, 2)}

0.8256274768824307

In [120]:
#тест модели LogisticRegression for GridSearchCV
parameters = {'C': np.arange(.1, 100, .1), 'penalty':['l2', 'l1']}
model = LogisticRegression()
clf = GridSearchCV(model, parameters)
clf.fit(X_emb_train, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([ 0.1,  0.2, ..., 99.8, 99.9]), 'penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [121]:
clf.cv_results_['mean_test_score'].max() 
#parameters = {'C': np.arange(.1, 10,.1}

0.8044914134742405

In [139]:
#тест модели RandomForest for RandomizedSearchCV
distributions = dict(n_estimators = range(1, 100, 10), max_features = range(1, 10), max_depth = range(1, 10, 2))
model = RandomForestClassifier()
clf = RandomizedSearchCV(model, distributions, random_state=seed, cv=5)
clf.fit(X_emb_train, y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': range(1, 100, 10), 'max_features': range(1, 10), 'max_depth': range(1, 10, 2)},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [140]:
clf.cv_results_['mean_test_score'].max() 
#distributions = dict(n_estimators = range(1, 100, 10), max_features = range(1, 7), max_depth = range(1, 10, 2))

0.8229854689564069

In [124]:
#тест модели LogisticRegression for RandomizedSearchCV
distributions = dict(C=uniform(loc=0, scale=4))
model = LogisticRegression()
clf = RandomizedSearchCV(model, distributions, random_state=seed, cv=5)
clf.fit(X_emb_train, y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000021EF7CD5C48>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [125]:
clf.cv_results_['mean_test_score'].max() 
#distributions = dict(C=uniform(loc=0, scale=4)) #работает лучше без пенальти

0.8005284015852048

In [None]:
### mean_test_score улучшилось при X_enb_train в тестах 
# LogisticRegression for GridSearchCV 

In [None]:
# для заданияна метрики использую модель LogisticRegression for GridSearchCV

In [None]:
#ex3 metrics

In [141]:
#тест модели LogisticRegression for GridSearchCV
parameters = {'C': np.arange(.1, 100, .1), 'penalty':['l2', 'l1']}
model = LogisticRegression()
clf = GridSearchCV(model, parameters)
clf.fit(X_emb_train, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([ 0.1,  0.2, ..., 99.8, 99.9]), 'penalty': ['l2', 'l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [142]:
preds = clf.predict(X_emb_test)

In [148]:
print(f1_score(y_test, preds, average='macro'),
      f1_score(y_test, preds, average='micro'),
      f1_score(y_test, preds, average='weighted'))

0.7971115769857909 0.8059701492537313 0.8040718837677442


In [149]:
print(recall_score(y_test, preds, average='macro'),
      recall_score(y_test, preds, average='micro'),
      recall_score(y_test, preds, average='weighted'))

0.7930402930402931 0.8059701492537313 0.8059701492537313


In [150]:
print(precision_score(y_test, preds, average='macro'),
      precision_score(y_test, preds, average='micro'),
      precision_score(y_test, preds, average='weighted'))

0.8047619047619048 0.8059701492537313 0.805543710021322


In [152]:
accuracy_score(y_test, preds)

0.8059701492537313

In [137]:
data_test = pd.read_csv('test.csv')

preds_data = pd.DataFrame(data_test['PassengerId'][:418])

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

X_test_kaggle = data_test[features]

X_test_kaggle['Fare'].fillna(X_test_kaggle['Fare'].median(), inplace=True)
X_test_kaggle['Sex'] = X_test_kaggle['Sex'].apply(lambda x: binarize(x)) 
X_test_kaggle['Age'].fillna(X_test_kaggle['Age'].median(), inplace=True)
#X_test_kaggle['Embarked'].fillna('S', inplace=True)
X_test_kaggle['Embarked'] = label_en.fit_transform(X_test_kaggle['Embarked'])
X_temp = pd.DataFrame(one_hot_en.fit_transform(X_test_kaggle['Embarked'][:,np.newaxis]))
X_test_kaggle['EmbS'] = X_temp[0]
X_test_kaggle['EmbC'] = X_temp[1]
X_test_kaggle['EmbG'] = X_temp[2]

X_test_kaggle = X_test_kaggle.drop(['Embarked'], axis=1)

preds = clf.predict(X_test_kaggle)

preds_data['Survived'] = preds
preds_data.to_csv('Preds.csv', index=False)