In [73]:
# data wrangling
import os
import pandas as pd
import numpy as np

# data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve
from sklearn.tree import plot_tree

sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import scipy
from scipy.cluster import hierarchy as hc

from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype

In [82]:
df = pd.read_csv(os.path.join("titanic_data","train.csv"))
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [83]:
#add new columns
df["Name lenght"] = None
df["Name word count"] = None
print(("Some Name".count(" ")))

for i in range(len(df["Name"])):
    name = df["Name"][i]
    df["Name lenght"][i] = len(name)
    df["Name word count"][i] = int(name.count(" ")) + 1

df.head()

1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Name lenght"][i] = len(name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Name word count"][i] = int(name.count(" ")) + 1


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name lenght,Name word count
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,23,4
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,51,7
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,22,3
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,44,7
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,24,4


In [84]:
df.isna().sum() / len(df)

PassengerId        0.000000
Survived           0.000000
Pclass             0.000000
Name               0.000000
Sex                0.000000
Age                0.198653
SibSp              0.000000
Parch              0.000000
Ticket             0.000000
Fare               0.000000
Cabin              0.771044
Embarked           0.002245
Name lenght        0.000000
Name word count    0.000000
dtype: float64

In [60]:
df.drop(columns = ["PassengerId", "Name"],inplace = True)
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Ticket       object
Fare        float64
Embarked     object
dtype: object

In [61]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [62]:
for column in df.columns:

    if str(df[column].dtypes) == "int64":

        print(column, df[column].unique())
        df[column] = df[column].astype("category")

df.dtypes

Survived [0 1]
Pclass [3 1 2]
SibSp [1 0 3 4 2 5 8]
Parch [0 1 2 5 3 4 6]


Survived    category
Pclass      category
Sex           object
Age          float64
SibSp       category
Parch       category
Ticket        object
Fare         float64
Embarked      object
dtype: object

In [63]:
#fill and drop na
df.isna().sum() / len(df)

Survived    0.000000
Pclass      0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Embarked    0.002245
dtype: float64

In [64]:
def fill_numeric_na(df, method : str = "mean"):
    """method: mean or median"""

    for column in df.keys():

        if is_numeric_dtype(df[column]):

            if method == "mean":
                df[column].fillna(df[column].mean(), inplace = True)
            elif method == "median":
                df[column].fillna(df[column].median(), inplace = True)

    return df

In [67]:
#cleared DAta Frame
df = fill_numeric_na(df, method = "mean")
df.isna().sum() / len(df)

Survived    0.000000
Pclass      0.000000
Sex         0.000000
Age         0.000000
SibSp       0.000000
Parch       0.000000
Ticket      0.000000
Fare        0.000000
Embarked    0.002245
dtype: float64

In [68]:
#check, how much data was lost
df["Embarked"].isna() == False

0      True
1      True
2      True
3      True
4      True
       ... 
886    True
887    True
888    True
889    True
890    True
Name: Embarked, Length: 891, dtype: bool