In [481]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [482]:
train = pd.read_csv("datasets/train.csv")
test = pd.read_csv("datasets/test.csv")

train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [483]:
train.columns= train.columns.str.lower()
test.columns= test.columns.str.lower()

| Column     | Description                                                                                                              |
|------------|--------------------------------------------------------------------------------------------------------------------------|
| `survival` | Survival: <br><ul><li>`0`: no</li><li>`1`: yes</li></ul>                                                                 |
| `pclass`   | Ticket class: <br> <ul><li>`1`: 1st</li><li>`2`: 2nd</li><li>`3`: 3rd</li></ul>                              |
| `sex`      | sex                                                              |
| `age`      | Age in years       |
| `sibsp`    | # of siblings / spouses aboard the Titanic                                       |
| `parch`    | # of parents / children aboard the Titanic     |
| `ticket`   | Ticket number                                                                             |
| `fare`     | Passenger fare |
| `cabin`    | Cabin number                                      |
| `embarked` | Port of Embarkation: <br><ul><li>`C`: Cherbourg</li><li>`Q`: Queenstown</li><li>`S`: Southampton</li></ul>                                     |



**Variable Notes:** <br>
`pclass`: A proxy for socio-economic status (SES)
- 1st = Upper
- 2nd = Middle
- 3rd = Lower <br>

`age`: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5 <br>

`sibsp`: The dataset defines family relations in this way...
- Sibling = brother, sister, stepbrother, stepsister
- Spouse = husband, wife (mistresses and fiancés were ignored) <br>

`parch`: The dataset defines family relations in this way...
- Parent = mother, father
- Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [484]:
train.describe()

Unnamed: 0,passengerid,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [485]:
train.isnull().sum()

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

добавити фічу alone. подорожував з кимось чи один

In [486]:
train.duplicated().sum()

0

In [487]:
train['age'] = (
    train.groupby('sex')['age'].apply(lambda group: group.fillna(group.median())).reset_index(level=0)['age']
)
test['age'] = (
    test.groupby('sex')['age'].apply(lambda group: group.fillna(group.median())).reset_index(level=0)['age']
)

In [488]:
train.drop(columns='cabin', inplace=True)
test.drop(columns='cabin', inplace=True)

In [489]:
test['sex'].replace({'female':0,'male':1},inplace=True)
train['sex'].replace({'female':0,'male':1},inplace=True)

In [490]:
train = pd.get_dummies(train,columns=['embarked'],prefix='embarked')
test = pd.get_dummies(test,columns=['embarked'],prefix='embarked')

In [491]:
train = train.dropna(axis=0)

In [492]:
test.isnull().sum()

passengerid    0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           1
embarked_C     0
embarked_Q     0
embarked_S     0
dtype: int64

def ticket_number(x):
    return x.split(" ")[-1]

def ticket_item(x):
    items = x.split(" ")
    if len(items) == 1:
        return "NONE"
    return "_".join(items[0:-1])

train["ticket_number"] = train["ticket"].apply(ticket_number)

test["ticket_number"] = test["ticket"].apply(ticket_number)

In [493]:
train = train.drop(columns=['passengerid', 'name'])
test = test.drop(columns=['name'])

In [494]:
train['sibsp_bool'] = train['sibsp'].map(
    lambda s: False if s == 0 else True
)
test['sibsp_bool'] = test['sibsp'].map(
    lambda s: False if s == 0 else True
)

In [495]:
train['parch_bool'] = train['parch'].map(
    lambda p: False if p == 0 else True
)
test['parch_bool'] = test['parch'].map(
    lambda p: False if p == 0 else True
)

In [496]:
train = train.drop(columns=['ticket'])
test = test.drop(columns=['ticket'])

In [497]:
test.isnull().sum()

passengerid    0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           1
embarked_C     0
embarked_Q     0
embarked_S     0
sibsp_bool     0
parch_bool     0
dtype: int64

In [498]:
test['fare'].fillna(test['fare'].median(), inplace = True)

In [499]:
# Machine Learning Libraries
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score ,precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import  classification_report, confusion_matrix

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

In [500]:
x = train.drop('survived',axis =1)
y = train['survived']
x

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,sibsp_bool,parch_bool
0,3,1,22.0,1,0,7.2500,False,False,True,True,False
1,1,0,38.0,1,0,71.2833,True,False,False,True,False
2,3,0,26.0,0,0,7.9250,False,False,True,False,False
3,1,0,35.0,1,0,53.1000,False,False,True,True,False
4,3,1,35.0,0,0,8.0500,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,False,False,True,False,False
887,1,0,19.0,0,0,30.0000,False,False,True,False,False
888,3,0,27.0,1,2,23.4500,False,False,True,True,True
889,1,1,26.0,0,0,30.0000,True,False,False,False,False


In [501]:
# Select numerical columns
num_cols = ['age', 'fare', 'parch', 'sibsp']

# Create scaler object
scaler = StandardScaler()

# Fit scaler on selected columns
scaler.fit(x[num_cols])

# Transform selected columns with scaler
x[num_cols] = scaler.transform(x[num_cols])

# Fit scaler on numeric columns
scaler.fit(test[num_cols])

# Transform numeric columns with scaler
test[num_cols] = scaler.transform(test[num_cols])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [502]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

In [503]:
gbc= GradientBoostingClassifier()
scores = cross_val_score(gbc, x_train, y_train, cv=5, scoring='accuracy')
print(f"{gbc} accuracy: {scores}")

# Fit the model to the full training set and make predictions on the test set
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_test)
# Evaluate the model on the test set
acc = accuracy_score(y_test, y_pred)
print (acc)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

GradientBoostingClassifier() accuracy: [0.82517483 0.81818182 0.73239437 0.85211268 0.84507042]
0.8379888268156425


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [504]:
#Store the PassengerId column in a separate variable

PassengerId = test['passengerid']

# drop PassengerId column from the test set
test.drop('passengerid',axis=1,inplace=True)

test_pred = gbc.predict(test)
submission = pd.DataFrame({'PassengerId': PassengerId, 'Survived': test_pred})
submission.to_csv('submission.csv', index=False)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [505]:
test_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,