# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from ipywidgets import interact, interact_manual

In [3]:
from data_processing import fill_demean_scale

# Read in `train` and `test` data

In [4]:
train_df = pd.read_csv("./train.csv", index_col=0)

In [5]:
test_df = pd.read_csv("./test.csv", index_col=0)

# Explore dataset

## dtypes

In [None]:
train_df.isnull().sum()

In [None]:
train_df.dtypes

We want to predict whether a passenger survived the Titanic or not. The target or label in this problem is therefore `Survived`.

This is a categorical variable (1 or 0) and therefore we are dealing with a (binary) classification problem.

On what types of information are our predictions to be based?

## schema

`Pclass`: 1st, 2nd, or 3rd class ticket (int, categorical variable)

`Name`: full name, string

`Sex`: takes values `male` or `female`

`Age`: age in years, float

`SibSp`: the number of siblings or spouses that passenger had onboard, int

`Parch`: the number of parents and children that passenger had onboard, int

`Ticket`: the ticket number (non-standardised, a few different formats), str

`Fare`: the cost of the tickets (**TODO**: confirm whether this is in a standardised currency), float

`Cabin`: the cabin code of each passenger, string

`Embarked`: where they got on the boat (Cherbourg, Queenstown, Southampton), str (categorical)

In [None]:
@interact
def show_me_unique_values(column=sorted(train_df.columns)):
    """Return the distinct values found in one column of `train_df`.

    The default value is the alphabetically sorted list of `train_df` column
    names; `interact` uses it to offer a choice of column, and the selected
    name is passed in as `column`.
    """
    selected_series = train_df[column]
    return selected_series.unique()

## `Age`

The mean passenger age was 29.7, and 44 passengers were aged 5 or under.

In [None]:
train_df.Age.mean()

In [None]:
train_df[train_df.Age <= 5.0]['Name'].count()

In [None]:
train_df.Age.plot(kind='hist', bins=20)

## `Pclass`

There were actually fewer second class tickets than first class tickets. Who woulda thought...

In [None]:
# Share of passengers per ticket class. value_counts() ignores missing values,
# so the shares only add up to 1 when Pclass has no NaN entries.
pclass_shares = train_df.Pclass.value_counts() / train_df.Pclass.size

# Compare with a tolerance: a sum of floating-point fractions is not
# guaranteed to be exactly 1.0, so `== 1.0` can fail spuriously.
assert np.isclose(pclass_shares.sum(), 1.0)

pclass_shares

In [None]:
train_df.Pclass.plot(kind='hist')

## `SibSp`

Most of the passengers travelled with neither siblings nor spouses.

This may mean that most travelled alone, or that they travelled with a parent or child.

In [None]:
train_df.SibSp.value_counts()

In [None]:
train_df.SibSp.plot(kind='hist')

## `Parch`

Most passengers also had no parents or children on board, which suggests that most did indeed travel alone.

In [None]:
train_df.Parch.value_counts()

In [None]:
train_df.Parch.plot(kind='hist')

## `Fare`

In [None]:
train_df.plot(
    kind='scatter', y='Fare', x='Age', c='Pclass', cmap='Set1',legend=True
)

At first glance it seems that `Fare` doesn't tell us much about `Age`.

## `Cabin`

In [None]:
# Cabin prefix = first character of the cabin code; passengers without a
# recorded cabin are labelled 'nan'.
# The prefix is kept in a local Series rather than added to `train_df` as a
# `Cabin_prefix` column: this cell then runs on a fresh kernel, and the column
# set of `train_df` stays as the later processing and checks expect it.
cabin_prefix = train_df.Cabin.str[0].fillna('nan')
cabin_prefix.unique()

There are seven different `Cabin_prefix`es from `A` through `F` plus `T`. What did they mean?

There are also some dirty data entries in `Cabin` such as "F G63".

## `Embarked`

In [None]:
embarked_mapping = {'Q':1, 'S':2, 'C':3}

# Encode the port on a temporary frame used only for colouring the plot.
# `train_df['Embarked']` must keep its raw letters here: the processing
# section further down applies this same mapping itself, and mapping an
# already-encoded column would turn every value into NaN.
embarked_plot_df = train_df.assign(
    Embarked=train_df['Embarked'].map(embarked_mapping)
)
embarked_plot_df.plot(kind='scatter', x='Age', y='Fare', c='Embarked', cmap='Set1')

## `test_df`

In [None]:
test_df.describe()

# Process `train_df` data

## `Sex`

In [6]:
# Binary-encode `Sex` in the training frame: male -> 1, female -> 0.
sex_mapping = {'male': 1, 'female': 0}
train_df['Sex'] = train_df['Sex'].map(sex_mapping)

## `Age`, `Fare`, `Parch`, `SibSp`

In [7]:
# Replace each numeric feature of the training frame with the output of the
# project helper `fill_demean_scale` (imported from `data_processing`).
# NOTE(review): judging by its name the helper fills missing values, subtracts
# the mean and rescales — confirm against `data_processing`.
for column in ['Age', 'Fare', 'Parch', 'SibSp']:
    train_df[column] = fill_demean_scale(df=train_df, column=column)

## Drop `Embarked` nulls

In [8]:
# Keep only passengers with a recorded port of embarkation. `.copy()` makes
# the filtered result an independent frame, so the column assignment below
# does not raise SettingWithCopyWarning.
train_df = train_df[train_df.Embarked.notna()].copy()
embarked_mapping = {'Q':1, 'S':2, 'C':3}

# Integer-encode the port letters.
train_df['Embarked'] = train_df['Embarked'].map(embarked_mapping)

## Drop `Cabin`

In [9]:
# `Cabin` is not used as a feature; `columns=` already selects the axis.
train_df = train_df.drop(columns=['Cabin'])

## Checks

In [10]:
train_df.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked'],
      dtype='object')

In [11]:
train_df.isnull().sum().sum()

0

# Process `test_df` data

## `Sex`

In [12]:
# Binary-encode `Sex` in the test frame: male -> 1, female -> 0.
sex_mapping = {'male': 1, 'female': 0}
test_df['Sex'] = test_df['Sex'].map(sex_mapping)

## `Age`, `Fare`, `Parch`, `SibSp`

In [13]:
# Replace each numeric feature of the test frame with the output of the
# project helper `fill_demean_scale` (imported from `data_processing`).
# NOTE(review): the helper is only given `test_df`, so whatever statistics it
# uses presumably come from the test set rather than from `train_df` — confirm
# this is intended, since the models are fitted on the `train_df` features.
for column in ['Age', 'Fare', 'Parch', 'SibSp']:
    test_df[column] = fill_demean_scale(df=test_df, column=column)

## Drop `Embarked` nulls

In [14]:
# Keep only passengers with a recorded port of embarkation. `.copy()` makes
# the filtered result an independent frame, so the column assignment below
# does not raise SettingWithCopyWarning.
test_df = test_df[test_df.Embarked.notna()].copy()

# Integer-encode the port letters.
embarked_mapping = {'Q':1, 'S':2, 'C':3}
test_df['Embarked'] = test_df['Embarked'].map(embarked_mapping)

## Drop `Cabin`

In [15]:
# `Cabin` is not used as a feature; `columns=` already selects the axis.
test_df = test_df.drop(columns=['Cabin'])

In [16]:
test_df.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Embarked'],
      dtype='object')

## Checks

In [17]:
test_df.dtypes

Pclass        int64
Name         object
Sex           int64
Age         float64
SibSp       float64
Parch       float64
Ticket       object
Fare        float64
Embarked      int64
dtype: object

In [18]:
test_df.isnull().sum()

Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Embarked    0
dtype: int64

In [19]:
assert len(train_df.columns) - 1 == len(test_df.columns)

In [20]:
train_df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,889.0,889.0,889.0,889.0,889.0,889.0,889.0,889.0
mean,0.382452,2.311586,0.649044,-0.000574,0.000147,0.000143,-0.00021,2.102362
std,0.48626,0.8347,0.477538,0.16296,0.137963,0.13446,0.097003,0.515181
min,0.0,1.0,0.0,-0.367921,-0.065376,-0.063599,-0.062858,1.0
25%,0.0,2.0,0.0,-0.096747,-0.065376,-0.063599,-0.047447,2.0
50%,0.0,3.0,1.0,0.0,-0.065376,-0.063599,-0.034646,2.0
75%,1.0,3.0,1.0,0.066611,0.059624,-0.063599,-0.00235,2.0
max,1.0,3.0,1.0,0.632079,0.934624,0.936401,0.937142,3.0


In [21]:
test_df.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.636364,3.187243e-18,-3.187243e-18,-2.92164e-18,-1.062414e-18,2.133971
std,0.841838,0.481622,0.1666166,0.1120949,0.1090477,0.1089934,0.580452
min,1.0,0.0,-0.3969747,-0.05592105,-0.04359383,-0.06953964,1.0
25%,1.0,0.0,-0.09590651,-0.05592105,-0.04359383,-0.05412807,2.0
50%,3.0,1.0,0.0,-0.05592105,-0.04359383,-0.04132692,2.0
75%,3.0,1.0,0.07223275,0.06907895,-0.04359383,-0.008055735,2.0
max,3.0,1.0,0.6030253,0.9440789,0.9564062,0.9304604,3.0


In [22]:
test_df.to_pickle("./store/Run1_test_df.pkl")

In [23]:
train_df.to_pickle("./store/Run1_train_df.pkl")

# `X_train, y_train, X_test`

In [44]:
train_df = pd.read_pickle("./store/Run1_train_df.pkl")
test_df = pd.read_pickle("./store/Run1_test_df.pkl")

In [45]:
# Columns of the training frame that are not model inputs: two free-text
# columns and the label itself.
cols_to_drop = ["Name", "Ticket", "Survived"]

# Every remaining column is a feature, in its original order.
features = [name for name in train_df.columns if name not in cols_to_drop]

y_train = train_df.Survived
X_train = train_df[features]

In [46]:
X_test = test_df[features]

In [47]:
cols_to_drop = ["Name", "Ticket", "Survived"]

def prepare_train_and_test_data(
    train_path="./store/Run1_train_df.pkl",
    test_path="./store/Run1_test_df.pkl",
    drop_columns=None,
):
    """Load the pickled, processed frames and split them into model inputs.

    Parameters
    ----------
    train_path, test_path : str
        Pickle files to read. The defaults are the files this notebook writes
        once processing is finished.
    drop_columns : iterable of str, optional
        Columns of the training frame that are not features. Defaults to the
        module-level `cols_to_drop`.

    Returns
    -------
    X_train : pandas.DataFrame
        Feature columns of the training frame.
    y_train : pandas.Series
        The `Survived` column of the training frame.
    X_test : pandas.DataFrame
        The test frame restricted to the same feature columns as `X_train`.

    All three are independent copies, so callers can add or change columns
    without raising SettingWithCopyWarning.
    """
    if drop_columns is None:
        drop_columns = cols_to_drop

    # NOTE: unpickling can execute arbitrary code; only point these paths at
    # pickles produced by this notebook.
    train_df = pd.read_pickle(train_path)
    test_df = pd.read_pickle(test_path)

    features = [col for col in train_df.columns if col not in drop_columns]
    X_train = train_df[features].copy()
    y_train = train_df.Survived.copy()
    X_test = test_df[features].copy()
    return X_train, y_train, X_test

# Run #1 : `LogisticRegression`

In [26]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [27]:
clf.fit(X_train, y_train)

LogisticRegression()

In [28]:
predictions = clf.predict(X_test)

In [29]:
test_df['Survived'] = predictions

In [30]:
test_df.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",1,0.055749,-0.055921,-0.043594,330911,-0.054258,1,0
893,3,"Wilkes, Mrs. James (Ellen Needs)",0,0.220591,0.069079,-0.043594,363272,-0.055877,2,0
894,2,"Myles, Mr. Thomas Francis",1,0.418402,-0.055921,-0.043594,240276,-0.050631,1,0
895,3,"Wirz, Mr. Albert",1,-0.043157,-0.055921,-0.043594,315154,-0.052632,2,0
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,-0.109094,0.069079,0.067517,3101298,-0.045556,2,1


In [31]:
test_df['Survived'].value_counts()

0    263
1    155
Name: Survived, dtype: int64

In [32]:
test_df["Survived"].to_csv("./predictions/Run1_predicted_values.csv")

# Run #2: `svm.SVC`

In [37]:
from sklearn import svm

In [38]:
clf = svm.SVC(kernel='linear', C=100)

In [39]:
clf.fit(X_train, y_train)

SVC(C=100, kernel='linear')

In [41]:
predictions = clf.predict(X_test)

In [42]:
test_df['Survived'] = predictions

In [43]:
test_df.Survived.value_counts()

0    266
1    152
Name: Survived, dtype: int64

In [56]:
C = [0.01, 0.1, 1, 10, 100, 1000]

# The data is the same for every value of C, so load it once up front.
X_train, y_train, X_test = prepare_train_and_test_data()

for C_value in C:
    clf = svm.SVC(kernel='rbf', C=C_value, random_state=1337)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Hold the predictions in their own Series (indexed like X_test) instead of
    # writing a `Survived` column into the feature frame: X_test stays
    # features-only and no slice of the test data is mutated.
    survived = pd.Series(predictions, index=X_test.index, name='Survived')

    # Share of predicted non-survivors / survivors for this C.
    print(C_value,":\n", survived.value_counts() / survived.size)
    survived.to_csv(f"./predictions/Run2_C{C_value}_predictions.csv")

0.01 :
 0    1.0
Name: Survived, dtype: float64
0.1 :
 0    0.636364
1    0.363636
Name: Survived, dtype: float64
1 :
 0    0.641148
1    0.358852
Name: Survived, dtype: float64
10 :
 0    0.722488
1    0.277512
Name: Survived, dtype: float64
100 :
 0    0.727273
1    0.272727
Name: Survived, dtype: float64
1000 :
 0    0.691388
1    0.308612
Name: Survived, dtype: float64
