# Kaggle Titanic

## dataset description

Variable | Definition | Key
---------|------------|-----
Survival | survival | 0 = No, 1 = Yes
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd
sex	| Sex	
Age | Age in years
sibsp |	# of siblings / spouses aboard the Titanic	
parch | # of parents / children aboard the Titanic	
ticket | Ticket number	
fare	| Passenger fare	
cabin	| Cabin number	
embarked	| Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

**Variable Notes**

**pclass:** A proxy for socio-economic status (SES)

1st = Upper
2nd = Middle
3rd = Lower

**age:** Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

**sibsp:** The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

**parch:** The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

# imports

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
from statistics import mode

from sklearn.model_selection import train_test_split
from sklearn import preprocessing


%matplotlib inline

# init data

In [21]:
import memory_usage

Reducing the memory usage of dataframe.
https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

In [22]:
link = 'D:\STUDY\practice\\titanic'
train = 'train.csv'
test = 'test.csv'

In [23]:
df = pd.read_csv(link+'\\'+train)
#df = memory_usage.import_data(link+'\\'+train)
test_df = memory_usage.import_data(link+'\\'+test)

df = df.rename(columns={c:str.lower(c) for c in df.columns})
test_df = test_df.rename(columns={c:str.lower(c) for c in test_df.columns})

del c

Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.04 MB
Decreased by -27.7%


NameError: name 'c' is not defined

# exploring data

## basic info

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
df.describe()

In [None]:
df.median()

## basic contributions

In [None]:
df.columns.to_list()

In [None]:
df['survived'].value_counts().plot(kind = 'pie')
plt.show()

In [None]:
df['pclass'].value_counts().plot(kind = 'pie')
plt.show()

In [None]:
df['sex'].value_counts().plot(kind='pie')
plt.show()

In [None]:
plt.figure(figsize=(20, 2))
sns.boxplot(x='age', data = df[['age']])
plt.show()
df[['age']].describe()

In [None]:
df['sibsp'].value_counts().plot(kind='bar')
plt.show()

In [None]:
df['parch'].value_counts().plot(kind='bar')
plt.show()

In [None]:
df['embarked'].value_counts().plot(kind='pie')
plt.show()

## survived

In [None]:
df['survived'].value_counts().plot(kind = 'pie')
plt.show()

In [None]:
df.groupby(['sex'])['survived'].value_counts().unstack(0).plot(kind='bar', stacked=True)
plt.show()

In [None]:
df.groupby(['pclass'])['survived'].value_counts().unstack().plot(kind='bar', stacked=True)
plt.show()

In [None]:
df[df['sex']=='female'].groupby(['pclass'])['survived'].value_counts().unstack(0).plot(kind='bar', stacked=True)
plt.show()

In [None]:
df[df['sex']=='male'].groupby(['pclass'])['survived'].value_counts().unstack(0).plot(kind='bar', stacked=True)
plt.show()

In [None]:
df.groupby(['embarked'])['survived'].value_counts().unstack().plot(kind='bar', stacked=True)
plt.show()

In [None]:
df.groupby(['embarked', 'pclass'])['survived'].value_counts().unstack().plot(kind='bar', stacked=True)
plt.show()

## missing values

In [None]:
df['embarked'] = df['embarked'].fillna(mode(df['embarked']))

### age

In [None]:
plt.figure(figsize=(20, 2))
sns.boxplot(x='age', data = df[['age']])
plt.show()
df[['age']].describe()

In [None]:
plt.figure(figsize=(20, 4))
sns.boxplot(y='sex', x='age', hue='sex', data = df[['age', 'sex']])
plt.show()

In [None]:
df.groupby(['pclass', 'sex'])[['age']].median()

In [None]:
df.groupby(['embarked', 'sex'])[['age']].median()

In [None]:
df.groupby(['pclass', 'embarked', 'sex'])[['age']].median()

### cabin

In [None]:
df[ (df['cabin'].isna()==0)]['cabin']

In [None]:
df[ (df['cabin'].isna()==0) & (df['cabin'].str.contains('F'))][['cabin']]

## women and children first

In [None]:
df[ (df['age'].isna()==1)].head()

In [None]:
df[ (df['ticket']=='347077')]

In [None]:
plt.figure(figsize=(20, 5))
sns.boxplot(y='sex', x='age', hue='survived', data = df[['survived', 'age', 'sex']])
plt.show()

## fare

In [None]:
df.groupby('pclass')['fare'].median()

In [None]:
df[ (df['fare']==0)]

In [None]:
df[ (df['ticket']=='LINE')]

# features

## 0

In [None]:
df[df['name'].str.contains('Master')]

In [None]:
q25 = df[ (df['sex']=='male') & (df['name'].str.contains('Master')) & (df['pclass']==3)][['Age']].quantile(.25)
q75 = df[ (df['sex']=='male') & (df['name'].str.contains('Master')) & (df['pclass']==3)][['Age']].quantile(.75)

df.loc[ (df['age'].isna()==1) & (df['Sex']=='male') & (df['name'].str.contains('Master')), 'age'] = np.array(range(int(q25[0]),int(q75[0]+1),round(int(q75[0]-q25[0])/4)))
df.loc[df['age'].isnull(), 'age'] = df.groupby(['Pclass', 'Sex', 'Embarked'])['Age'].transform('median')

In [None]:
df['is_female'] = df['sex'].apply(lambda x: (x=='female')*1)
df.drop(['sex'], axis=1, inplace=True)

c_variables = ['pclass', 'embarked']
for c in c_variables:
    dummies = pd.get_dummies(df[c], prefix=c.lower())
    df = pd.concat([df, dummies], axis=1)
    df.drop([c], axis=1, inplace=True)
    
del dummies, c_variables

## columns drop

In [None]:
', '.join(df.columns.tolist())

In [19]:
df.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [None]:
columns_to_fit = 'Survived, Age, SibSp, Parch, is_female, pclass_1, pclass_2, pclass_3, embarked_C, embarked_Q, embarked_S'.replace(' ','').split(',')
columns_to_fit

In [None]:
df = df[df.columns.intersection(columns_to_fit)]

## scaling

In [None]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
columns_to_scale = ['age', 'sibsp', 'parch']
for c in columns_to_scale:
    df[[c]] = min_max_scaler.fit_transform(df[[c]])

In [None]:
df.head(3)

# modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC

k_fold= KFold(n_splits=10, shuffle=True, random_state=1)

In [None]:
df.head()

In [None]:
target = 'survived'
X = df.loc[:, df.columns != target]
y = df[target]