In [269]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import math

from sklearn.linear_model import LogisticRegression

%matplotlib inline
pd.options.display.max_columns = None

In [377]:
test_raw = pd.read_csv("test.csv")
train_raw = pd.read_csv("train.csv")

In [378]:
train_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [379]:
def get_age(row):
    
    if row['Sex'] == 'female':
        if row['Age'] != row['Age']:
            if "Miss." in row['Name']:
                return 20
            elif "Mrs." in row["Name"]:
                return 35
    elif row['Sex'] == 'male':
        if row['Age'] != row['Age']:
            if "Master." in row['Name']:
                return 10

    return row['Age']

def clean_df(data):
    print("Raw Rows: ", len(data))

    data["Age"] = data.apply(get_age , axis=1)

    data['Age'].fillna(data['Age'].mean(), inplace=True)
    data['Fare'].fillna(0, inplace=True)
    print("Clean Rows: ", len(data))
    return data

def expand_columns(df):
        
    df["is_female"] = df.apply(lambda row: row['Sex'] == 'female', axis=1)
    
    df["is_young"] = df.apply(lambda row: row['Age'] <= 18, axis=1)
    df["is_elder"] = df.apply(lambda row: row['Age'] >= 50, axis=1)
    
    df["is_cherbourg"] = df.apply(lambda row: row['Embarked'] == 'C', axis=1)
    df["is_queenstown"] = df.apply(lambda row: row['Embarked'] == 'Q', axis=1)
    df["is_southampton"] = df.apply(lambda row: row['Embarked'] == 'S', axis=1)
    
    df["is_first"] = df.apply(lambda row: row['Pclass'] == 1, axis=1)
    df["is_second"] = df.apply(lambda row: row['Pclass'] == 2, axis=1)
    df["is_third"] = df.apply(lambda row: row['Pclass'] == 3, axis=1)

    df["had_cabin"] = df.apply(lambda row: row['Cabin'] == row['Cabin'], axis=1)
    
    df["fare2"] = df["Fare"] * df["Fare"]
    
    return df

train = expand_columns(clean_df(train_raw))
test  = expand_columns(clean_df(test_raw))

train.head()

Raw Rows:  891
Clean Rows:  891
Raw Rows:  418
Clean Rows:  418


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_female,is_young,is_elder,is_cherbourg,is_queenstown,is_southampton,is_first,is_second,is_third,had_cabin,fare2
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,False,False,False,False,False,True,False,False,True,False,52.5625
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,True,False,False,True,False,False,True,False,False,True,5081.308859
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,True,False,False,False,False,True,False,False,True,False,62.805625
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,True,False,False,False,False,True,True,False,False,True,2819.61
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,False,False,False,False,False,True,False,False,True,False,64.8025


In [380]:
formula = "Survived ~ is_female + is_young + is_elder + is_cherbourg + is_queenstown + is_southampton" \
            " + is_first + is_second + is_third + had_cabin + fare2 -1"
lm = smf.ols(formula = formula, data = train).fit()
lm.summary()

0,1,2,3
Dep. Variable:,Survived,R-squared:,0.393
Model:,OLS,Adj. R-squared:,0.386
Method:,Least Squares,F-statistic:,56.97
Date:,"Wed, 22 Jul 2020",Prob (F-statistic):,1.58e-88
Time:,21:37:50,Log-Likelihood:,-399.58
No. Observations:,891,AIC:,821.2
Df Residuals:,880,BIC:,873.9
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
is_female[False],0.1070,0.165,0.650,0.516,-0.216,0.430
is_female[True],0.5878,0.164,3.595,0.000,0.267,0.909
is_young[T.True],0.1085,0.036,3.026,0.003,0.038,0.179
is_elder[T.True],-0.1130,0.049,-2.324,0.020,-0.208,-0.018
is_cherbourg[T.True],-0.0654,0.273,-0.240,0.810,-0.601,0.470
is_queenstown[T.True],-0.0699,0.276,-0.253,0.800,-0.611,0.471
is_southampton[T.True],-0.1521,0.272,-0.559,0.577,-0.687,0.382
is_first[T.True],0.3051,0.115,2.649,0.008,0.079,0.531
is_second[T.True],0.2903,0.112,2.590,0.010,0.070,0.510

0,1,2,3
Omnibus:,38.061,Durbin-Watson:,1.908
Prob(Omnibus):,0.0,Jarque-Bera (JB):,41.927
Skew:,0.527,Prob(JB):,7.86e-10
Kurtosis:,3.143,Cond. No.,7.36e+18


In [386]:
logit = LogisticRegression(max_iter=300)

X_train = np.c_[train["is_young"], train["is_elder"], train['Age'],
              train["is_female"], 
              train["is_first"], train["is_second"], train["is_third"],
              train["is_cherbourg"], train["is_queenstown"], train["is_southampton"],
              train["had_cabin"]
             ]
X_test = np.c_[test["is_young"], test["is_elder"],
              test["is_female"], 
              test["is_first"], test["is_second"], test["is_third"],
              test["is_cherbourg"], test["is_queenstown"], test["is_southampton"],
              test["had_cabin"]
             ]
Y_train= train["Survived"]
logit.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [387]:
pred_train = logit.predict(X_train)

# 0.8159371492704826
accuracy = sum(a == b for a,b in zip(pred_train, Y_train))/len(Y_train)
print("Accuracy on Training Set: ", accuracy)

Accuracy on Training Set:  0.8024691358024691


In [388]:
train[train['Survived'] != pred_train].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_female,is_young,is_elder,is_cherbourg,is_queenstown,is_southampton,is_first,is_second,is_third,had_cabin,fare2
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,True,False,False,False,False,True,False,False,True,False,123.950369
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,,S,True,True,False,False,False,True,False,False,True,False,61.688458
17,18,1,2,"Williams, Mr. Charles Eugene",male,29.260921,0,0,244373,13.0,,S,False,False,False,False,False,True,False,True,False,False,169.0
21,22,1,2,"Beesley, Mr. Lawrence",male,34.0,0,0,248698,13.0,D56,S,False,False,False,False,False,True,False,True,False,True,169.0
23,24,1,1,"Sloper, Mr. William Thompson",male,28.0,0,0,113788,35.5,A6,S,False,False,False,False,False,True,True,False,False,True,1260.25


In [257]:
pred_test = logit.predict(X_test)
pred_test

#PassengerId,Survived
test_res = test[["PassengerId"]].copy()
test_res["Survived"] = pred_test
test_res.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [258]:
test_res.to_csv("my_predictions.csv", index=False)

In [369]:
train_raw[(train_raw['Sex']=='female')&(train_raw['Name'].str.contains("Mrs."))]['Age'].mean()
train_raw[(train_raw['Sex']=='female')&(train_raw['Name'].str.contains("Miss"))]['Age'].mean()

20.434065934065934

In [383]:
train[train['Name'] == 'Moubarek, Master. Gerios']

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_female,is_young,is_elder,is_cherbourg,is_queenstown,is_southampton,is_first,is_second,is_third,had_cabin,fare2
65,66,1,3,"Moubarek, Master. Gerios",male,10.0,1,1,2661,15.2458,,C,False,True,False,True,False,False,False,False,True,False,232.434418
