### Wczytanie danych treningowych

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

df = pd.read_csv('https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv')
print(df.shape)
df.head()

(891, 11)


Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Przygotowanie danych

In [2]:
df.dtypes

survived      int64
pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket       object
fare        float64
cabin        object
embarked     object
dtype: object

In [3]:
y = df['survived'].copy()

X = df[['pclass', 'age', 'sibsp', 'parch', 'fare']].copy()
X['sex'] = pd.get_dummies(df['sex']).iloc[:,1] # sex=1 oznacza sex='male'

print(X.shape, y.shape)
X.head()

(891, 6) (891,)


Unnamed: 0,pclass,age,sibsp,parch,fare,sex
0,3,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,0
2,3,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,0
4,3,35.0,0,0,8.05,1


In [4]:
sum(y.isna())

0

In [5]:
np.sum(X.isna(), axis=0)

pclass      0
age       177
sibsp       0
parch       0
fare        0
sex         0
dtype: int64

In [6]:
y = y.loc[~X['age'].isna()]
X = X.dropna()

y.shape, X.shape

((714,), (714, 6))

### Trenowanie modelu RandomForest i optymalizacja jego parametrów

In [7]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 650, 1100, 1550, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 35, 60, 85, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [8]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X, y)
rf_random.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.9s finished


{'n_estimators': 1550,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 85,
 'bootstrap': True}

### Obliczenie szansy przeżycia dla:
##### chłopca 2 lata, 3 rodzeństwa, 1 rodzic, zapłacił 10 funtów i podróżuje 3 klasą

In [9]:
test = pd.DataFrame([[3, 2, 3, 2, 10, 1]], columns=X.columns)
rf_random.predict(test)

array([0.44198117])

##### Prawdopodobieństwo przeżycia chłopca = 0.44