Moje pierwsze rozwiązanie problemu https://www.kaggle.com/c/titanic.

## 1. Importowanie potrzebnych modułów i bibliotek.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

## 2. Pobranie i pierwsze spojrzenie na dane treningowe.

In [2]:
# Let pandas render up to 999 columns so wide frames are not truncated horizontally.
pd.options.display.max_columns = 999

In [3]:
# Read the training set; the first CSV column is used as the row index.
titanic_data_train = pd.read_csv(filepath_or_buffer="train.csv", index_col=0)

In [4]:
# Display the training frame exactly as loaded, before any cleaning.
titanic_data_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [5]:
# Histogram of passenger ages in the training set, split into 20 bins.
titanic_data_train["Age"].plot(kind="hist", bins=20)

<matplotlib.axes._subplots.AxesSubplot at 0x275a6aad888>

In [6]:
# List the column labels of the training frame.
titanic_data_train.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## 3. Znalezienie kolumn z brakami w danych.

In [7]:
# Names of the training columns that contain at least one missing value.
titanic_data_train.columns[titanic_data_train.isnull().any()].tolist()

['Age', 'Cabin', 'Embarked']

In [8]:
# Number of training rows with a missing Age.
titanic_data_train['Age'].isna().sum()

177

In [9]:
# Number of training rows with a missing Cabin.
titanic_data_train['Cabin'].isna().sum()

687

In [10]:
# Number of training rows with a missing Embarked value.
titanic_data_train['Embarked'].isna().sum()

2

## 4. Rozwiązanie problemu braku danych.

In [11]:
# Cabin has 687 missing values, so the whole column is dropped; Embarked is missing
# in only 2 rows, so those rows are removed instead.
reduced_tdt = (
    titanic_data_train
    .drop(columns='Cabin')
    .dropna(subset=['Embarked'])
)
# Impute missing ages with the mean age. The result is assigned back to the column:
# calling fillna(..., inplace=True) on the selection `reduced_tdt['Age']` is chained
# assignment, which pandas 2.x warns about and which leaves the frame unchanged
# once Copy-on-Write is enabled.
reduced_tdt['Age'] = reduced_tdt['Age'].fillna(reduced_tdt['Age'].mean())

## 5. Usunięcie niepotrzebnych danych.

In [12]:
# Name and Ticket are removed from the frame and are not used as features.
reduced_tdt = reduced_tdt.drop(columns=['Name', 'Ticket'])

## 6. Transformacja nazw kategorii na liczby.

In [13]:
# Collect the names of the columns stored with the generic `object` dtype;
# these are the categorical columns that get label-encoded next.
is_object_col = reduced_tdt.dtypes == 'object'
object_cols = is_object_col[is_object_col].index.tolist()
object_cols

['Sex', 'Embarked']

In [14]:
# Replace each categorical column with integer codes.
# One encoder is fitted per column and kept in `label_encoders`, keyed by column name.
# Re-fitting a single shared encoder would discard every mapping except the last one,
# making it impossible to apply the same mapping to other data or to invert it.
label_encoders = {}
for col in object_cols:
    label_encoder = LabelEncoder()
    reduced_tdt[col] = label_encoder.fit_transform(reduced_tdt[col])
    label_encoders[col] = label_encoder

## 7. Podział danych treningowych.

In [15]:
# Features are every column except Survived; Survived itself is the prediction target.
X = reduced_tdt.drop(columns='Survived')
y = reduced_tdt['Survived']

In [16]:
# Display the feature matrix after cleaning and label encoding.
X

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,22.000000,1,0,7.2500,2
2,1,0,38.000000,1,0,71.2833,0
3,3,0,26.000000,0,0,7.9250,2
4,1,0,35.000000,1,0,53.1000,2
5,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...
887,2,1,27.000000,0,0,13.0000,2
888,1,0,19.000000,0,0,30.0000,2
889,3,0,29.642093,1,2,23.4500,2
890,1,1,26.000000,0,0,30.0000,0


In [17]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

## 8. Zastosowanie algorytmu lasów losowych do przewidywania, czy dany pasażer przeżył.

In [18]:
# Random forest with 100 trees and a fixed seed, trained on the training split.
model = RandomForestClassifier(random_state=0, n_estimators=100)
model.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [19]:
# Predicted class labels for the validation split.
predics = model.predict(X=X_valid)

In [20]:
# For 0/1 labels the mean absolute error equals the fraction of misclassified
# validation rows, i.e. 1 - accuracy.
val_mae = mean_absolute_error(y_true=y_valid, y_pred=predics)

In [21]:
# Show the predicted labels for the validation split.
predics

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1], dtype=int64)

In [22]:
# Validation accuracy: the share of validation rows whose prediction matches the label.
# A vectorized comparison is used instead of a `for index, y in enumerate(y_valid)` loop,
# because that loop variable rebound the global target Series `y` (used for the
# train/validation split) to a single integer label.
perc = int((y_valid.to_numpy() == predics).sum())

print(perc/len(y_valid))

0.7752808988764045


## 9. Sprawdzenie skuteczności zastosowanego modelu na danych testowych.

In [23]:
# Load the test set (first CSV column as index) and drop the same columns
# that were removed from the training data.
titanic_data_test = (
    pd.read_csv('test.csv', index_col=0)
    .drop(columns=['Cabin', 'Name', 'Ticket'])
)

In [24]:
# Encode the categorical test columns with the mapping learned from the training data.
s = (titanic_data_test.dtypes == 'object')
object_cols = list(s[s].index)
for col in object_cols:
    # Fit on the raw training column (missing values excluded) and only *transform*
    # the test column. Re-fitting on the test set would derive the integer codes from
    # whatever categories happen to occur there, which is not guaranteed to match the
    # codes the model was trained on. `transform` raises on a category unseen in training.
    label_encoder = LabelEncoder().fit(titanic_data_train[col].dropna())
    titanic_data_test[col] = label_encoder.transform(titanic_data_test[col])

In [25]:
# Fill missing test ages with the mean age of the training data.
# The result is assigned back to the column: fillna(..., inplace=True) on the
# selection `titanic_data_test['Age']` is chained assignment, which pandas 2.x warns
# about and which leaves the frame unchanged once Copy-on-Write is enabled.
titanic_data_test['Age'] = titanic_data_test['Age'].fillna(reduced_tdt['Age'].mean())

In [26]:
# Names of the test columns that still contain at least one missing value.
titanic_data_test.columns[titanic_data_test.isnull().any()].tolist()

['Fare']

In [27]:
# Fill the missing test fare with the mean fare of the training data, matching how
# Age is imputed (statistics come from the training set, not from the test set).
# The result is assigned back because fillna(..., inplace=True) on a column selection
# is chained assignment and leaves the frame unchanged once Copy-on-Write is enabled.
titanic_data_test['Fare'] = titanic_data_test['Fare'].fillna(reduced_tdt['Fare'].mean())

In [28]:
# Predict on the test set. The columns are selected in the training feature order so
# the model never receives features in a different order than it was fitted on;
# a missing feature column raises a KeyError here instead of producing wrong predictions.
answer = model.predict(titanic_data_test[X_train.columns])

In [29]:
# Show the predicted labels for the test set.
answer

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [30]:
# Build the submission table (PassengerId taken from the test index, Survived from
# the predictions) and write it to CSV without the positional row index.
output = pd.DataFrame({'PassengerId': titanic_data_test.index}).assign(Survived=answer)
output.to_csv('submission.csv', index=False)