In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [31]:
# you can either predict age based on other features or simply put
# avg age by class, convert age into ranges, calculate family size based on sibling, parent child values
def impute_age(cols):
    Age = cols[0]
    Pclass =  cols[1]
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 28
        else:
            return 24
    else:
        return Age

def standardize_age(data):
    data['Age'] = data[['Age', 'Pclass']].apply(impute_age, axis =1)
    data['Age'] = data['Age'].astype(int)
    data.loc[data['Age'] <= 11, 'Age'] = 0
    data.loc[(data['Age'] > 11) & (data['Age'] <= 18), 'Age'] = 1
    data.loc[(data['Age'] > 18) & (data['Age'] <= 22), 'Age'] = 2
    data.loc[(data['Age'] > 22) & (data['Age'] <= 27), 'Age'] = 3
    data.loc[(data['Age'] > 27) & (data['Age'] <= 33), 'Age'] = 4
    data.loc[(data['Age'] > 33) & (data['Age'] <= 46), 'Age'] = 5
    data.loc[(data['Age'] > 46) & (data['Age'] <= 66), 'Age'] = 6
    data.loc[data['Age'] >= 66, 'Age'] = 7

def standardize_fare(data):
    data.loc[data['Fare'].isnull(), 'Fare'] = 0
    data.loc[data['Fare'] <= 7, 'Fare'] = 0
    data.loc[(data['Fare'] > 7) & (data['Fare'] <= 14), 'Fare'] = 1
    data.loc[(data['Fare'] > 14) & (data['Fare'] <= 31), 'Fare'] = 2
    data.loc[(data['Fare'] > 31) & (data['Fare'] <= 99), 'Fare'] = 3
    data.loc[(data['Fare'] > 99) & (data['Fare'] <= 250), 'Fare'] = 4
    data.loc[data['Fare'] >= 250, 'Fare'] = 5
    data['Fare'] = data['Fare'].astype(int)

def standardize_title(data):
    dataset_title = [i.split(",")[1].split(".")[0].strip() for i in data["Name"]]
    data["Title"] = pd.Series(dataset_title)
    data["Title"] = data["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data["Title"] = data["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
    data["Title"] = data["Title"].astype(int)
    
def calculate_family_size(data):
    data['FamilySize'] = data['Parch'] + data['SibSp'] + 1
    data['Singleton'] = data['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    data['SmallFamily'] = data['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    data['LargeFamily'] = data['FamilySize'].map(lambda s: 1 if 5 <= s <= 8 else 0)
    

In [123]:
#Read training and testing data
train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')
combine = [train, test]
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [124]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [109]:
PassengerId = train['PassengerId']
test_pid = test['PassengerId']

In [110]:
for each in combine:
    standardize_age(each)
    standardize_fare(each)
    standardize_title(each)
    calculate_family_size(each)

In [111]:
sex = pd.get_dummies(train['Sex'], prefix='gender')
embark = pd.get_dummies(train['Embarked'] , prefix='em')
pclass = pd.get_dummies(train['Pclass'],prefix='pl')
title = pd.get_dummies(train['Title'],prefix='title')
sex_t = pd.get_dummies(test['Sex'])
embark_t = pd.get_dummies(test['Embarked'] , prefix ='em')
pclass_t = pd.get_dummies(test['Pclass'], prefix='pl')
title_t = pd.get_dummies(test['Title'], prefix='title')

In [112]:
train = pd.concat([train, sex, embark, pclass, title], axis = 1)
train.drop(['Sex','Embarked','Pclass','Title'], axis=1, inplace=True)
test = pd.concat([test, sex_t, embark_t, pclass_t, title_t], axis = 1)
test.drop(['Sex','Embarked','Pclass','Title'], axis=1, inplace=True)

In [113]:
#Drop data with modified and null value cols
train = train.drop(['Ticket', 'Cabin'], axis=1)
test = test.drop(['Ticket', 'Cabin'], axis=1)
train = train.drop(['Name', 'PassengerId','SibSp','Parch'], axis=1)
test = test.drop(['Name', 'PassengerId','SibSp','Parch'], axis=1)

In [118]:
#all entries are standardized and not null
# test.info()
# train.info()
test.head()

Unnamed: 0,Age,Fare,FamilySize,Singleton,SmallFamily,LargeFamily,female,male,em_C,em_Q,em_S,pl_1,pl_2,pl_3,title_0,title_1,title_2,title_3
0,5,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0
1,6,0,2,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0
2,6,1,1,1,0,0,0,1,0,1,0,0,1,0,0,0,1,0
3,3,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0
4,2,1,3,0,1,0,1,0,0,0,1,0,0,1,0,1,0,0


In [115]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [116]:
y = train['Survived']
x = train.drop('Survived', axis =1)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.3, random_state=101)

In [119]:
# Logistic Regression output
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(x_train, y_train)
t_predictions = logmodel.predict(x_test)
print(classification_report(y_test, t_predictions))
logmodel_acc = round(logmodel.score(x_train, y_train)*100, 2)
log_test_pred = logmodel.predict(test)
final_data = pd.DataFrame({'PassengerId' : test_pid, 'Survived' : log_test_pred})
final_data.info()
final_data.to_csv('D:\\git_work\\git_repos\\data_analysis_and_visualization\\titanic_log_reg_pred.csv', index=False, columns = ['PassengerId','Survived'])

              precision    recall  f1-score   support

           0       0.81      0.89      0.85       154
           1       0.83      0.71      0.76       114

    accuracy                           0.81       268
   macro avg       0.82      0.80      0.80       268
weighted avg       0.81      0.81      0.81       268

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.7 KB




In [120]:
# Decision Trees Output
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
ttree = DecisionTreeClassifier()
ttree.fit(x_train, y_train)
tree_predictions = ttree.predict(x_test)
print(classification_report(y_test, tree_predictions))
tree_accuracy = round(ttree.score(x_train, y_train)*100, 2)
tree_test_pred = ttree.predict(test)
final_data = pd.DataFrame({'PassengerId' : test_pid, 'Survived' : tree_test_pred})
final_data.info()
final_data.to_csv('D:\\git_work\\git_repos\\data_analysis_and_visualization\\titanic_tree_pred.csv', index=False, columns = ['PassengerId','Survived'])

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       154
           1       0.80      0.64      0.71       114

    accuracy                           0.78       268
   macro avg       0.79      0.76      0.77       268
weighted avg       0.78      0.78      0.78       268

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.7 KB


In [121]:
# Decision Trees Output with Random forests

from sklearn.ensemble import RandomForestClassifier
rf_tree = RandomForestClassifier(n_estimators=10)
rf_tree.fit(x_train, y_train)
rf_predictions = rf_tree.predict(x_test)
print(classification_report(y_test, rf_predictions))
rf_accuracy = round(rf_tree.score(x_train, y_train)*100, 2)
rf_test_pred = ttree.predict(test)
final_data = pd.DataFrame({'PassengerId' : test_pid, 'Survived' : rf_test_pred})
final_data.info()
final_data.to_csv('D:\\git_work\\git_repos\\data_analysis_and_visualization\\titanic_tree_rf.csv', index=False, columns = ['PassengerId','Survived'])

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       154
           1       0.84      0.72      0.77       114

    accuracy                           0.82       268
   macro avg       0.82      0.81      0.81       268
weighted avg       0.82      0.82      0.82       268

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.7 KB


In [122]:
# Knn method
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)
knn_predictions = knn.predict(x_test)
print(classification_report(y_test, knn_predictions))
knn_accuracy = round(knn.score(x_train, y_train)*100, 2)
knn_test_pred = ttree.predict(test)
final_data = pd.DataFrame({'PassengerId' : test_pid, 'Survived' : knn_test_pred})
final_data.info()
final_data.to_csv('D:\\git_work\\git_repos\\data_analysis_and_visualization\\titanic_knn_pred.csv', index=False, columns = ['PassengerId','Survived'])

              precision    recall  f1-score   support

           0       0.77      0.90      0.83       154
           1       0.83      0.64      0.72       114

    accuracy                           0.79       268
   macro avg       0.80      0.77      0.78       268
weighted avg       0.80      0.79      0.79       268

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.7 KB
