<a href="https://colab.research.google.com/github/matthewarthur/kaggle_various/blob/master/titanic_base_unimproved.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import files
files.upload()

In [0]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [0]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c titanic

Downloading train.csv to /content
  0% 0.00/59.8k [00:00<?, ?B/s]
100% 59.8k/59.8k [00:00<00:00, 25.2MB/s]
Downloading test.csv to /content
  0% 0.00/28.0k [00:00<?, ?B/s]
100% 28.0k/28.0k [00:00<00:00, 22.7MB/s]
Downloading gender_submission.csv to /content
  0% 0.00/3.18k [00:00<?, ?B/s]
100% 3.18k/3.18k [00:00<00:00, 2.56MB/s]


In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [0]:
def write_submission_file(prediction, filename,
    path_to_sample=os.path.join(INPUT_DATA_DIR, 'gender_submission.csv')):
    submission = pd.read_csv(path_to_sample, index_col='PassengerId')
    submission['Survived'] = prediction
    submission.to_csv(filename)

In [0]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [8]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [0]:
#train_data.drop(['Name', 'Ticket'], axis=1, inplace=True)

In [12]:
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null objec

In [0]:
#using mode as replace method. should also test randoim
train_test = [train_data, test_data]
for dataset in train_test:
    # excluding some features
    dataset.drop(['PassengerId', 'Name','Ticket'], axis=1, inplace=True)
    dataset['Sex'] = dataset['Sex'].map({'male': 1, 'female': 0})

    #fill missing age with median
    dataset['Age'].fillna(train_data['Age'].mode(), inplace = True)

    #fill missing embarked with mode
    dataset['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace = True)

    #fill missing fare with median
    dataset['Fare'].fillna(train_data['Fare'].mode(), inplace = True)

In [20]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null int64
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    891 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


In [0]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [59]:
train_target = train_data['Survived'] # saving target feature
base_feats = ['Sex', 'Pclass']
X_train_base = train_data[base_feats]
X_test_base = test_data[base_feats]
scores = cross_val_score(estimator=DecisionTreeClassifier(random_state=17),
                         X=X_train_base, y=train_target,
                         # i know that it's a big cv value,
                         # but the data is too small
                         # so we can afford it
                         cv=10, scoring='accuracy', n_jobs=-1)
print(f'The mean accuracy of our baseline is {np.mean(scores)*100} %')
base_est = DecisionTreeClassifier(random_state=17)
base_est.fit(X_train_base, train_target)
preds = base_est.predict(X_test_base)


The mean accuracy of our baseline is 77.33398592668256 %


In [62]:
preds.shape

(418,)

In [65]:
len(X_train_base)

891

In [0]:
np.savetxt("base_preds.csv", preds, delimiter=",")


In [0]:
from google.colab import files
files.download('base_preds.csv') 

In [0]:
for dataset in train_test:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [0]:
for dataset in train_test:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int')

In [0]:
get_deck = lambda x: (ord(x[0]) - ord('A') +1) if x[0] != 'T' else 1
train_data['Deck'] = train_data['Cabin'].map(get_deck, na_action='ignore')

In [0]:
train_data['Deck'].fillna(0, inplace=True) 


In [0]:
test_data['Deck'] = test_data['Cabin'].map(get_deck, na_action='ignore')
# test_data['Deck'].fillna(test_data['Pclass'].map(pclass_deck_modes, na_action=None),
#                           inplace=True)
test_data['Deck'].fillna(0, inplace=True)

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [0]:
from sklearn.model_selection import GridSearchCV


In [0]:
svc = SVC(random_state=17)


In [47]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
Pclass        418 non-null int64
Sex           418 non-null int64
Age           332 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          417 non-null float64
Cabin         91 non-null object
Embarked      418 non-null object
FamilySize    418 non-null int64
IsAlone       418 non-null int64
Deck          418 non-null float64
dtypes: float64(3), int64(6), object(2)
memory usage: 36.0+ KB


In [0]:
feats_to_exclude = ['Age', 'Fare', 'Embarked', 'Title', 'Age_Class',
                    'AgeBin', 'FareBin', 'Cabin', 'Name', 'Age_ClassBin']
#X_train = train_data.drop(['Cabin','Embarked'], axis=1)
X_test = test_data.drop(['Cabin','Embarked'], axis=1)

In [0]:
X_test.fillna(method='pad', inplace=True)
X_train.fillna(method='pad', inplace=True)


In [53]:
Cs = np.linspace(0.001, 10, 20)
gammas = ['scale', 0.001, 0.005, 0.01, 0.5, 0.1, 1]
kernels = ['linear', 'rbf', 'sigmoid']
params = {'C': Cs, 'gamma' : gammas, 'kernel': kernels}
svc_grid = GridSearchCV(estimator=svc, param_grid=params, cv=5,
                        n_jobs=-1, scoring='accuracy', verbose=True)
svc_grid.fit(X_train, train_target)

Fitting 5 folds for each of 420 candidates, totalling 2100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 148 tasks      | elapsed:   27.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 1406 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2100 out of 2100 | elapsed:  1.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=17,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-03, 5.27263e-01, 1.05353e+00, 1.57979e+00, 2.10605e+00,
       2.63232e+00, 3.15858e+00, 3.68484e+00, 4.21111e+00, 4.73737e+00,
       5.26363e+00, 5.78989e+00, 6.31616e+00, 6.84242e+00, 7.36868e+00,
       7.89495e+00, 8.42121e+00, 8.94747e+00, 9.47374e+00, 1.00000e+01]), 'gamma': ['scale', 0.001, 0.005, 0.01, 0.5, 0.1, 1], 'kernel': ['linear', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=True)

In [56]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
Pclass        418 non-null int64
Sex           418 non-null int64
Age           418 non-null float64
SibSp         418 non-null int64
Parch         418 non-null int64
Fare          418 non-null float64
FamilySize    418 non-null int64
IsAlone       418 non-null int64
Deck          418 non-null float64
dtypes: float64(3), int64(6)
memory usage: 29.5 KB


In [54]:
svc_grid.best_score_, svc_grid.best_params_


(1.0, {'C': 0.5272631578947369, 'gamma': 'scale', 'kernel': 'linear'})

In [57]:
best_svc = SVC(**svc_grid.best_params_, random_state=17)
best_svc.fit(X_train, train_target)
best_svc_preds = best_svc.predict(test_data)
write_submission_file(best_svc_preds, 'untuned_svc_submission.csv')

ValueError: ignored