In [3]:
import numpy as np 
import pandas as pd 
import statsmodels.api as sm
import sweetviz as sv

from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB 
from sklearn.neural_network import MLPClassifier

## Exploring Data

| Variable | Definition |
|----------|----------|
|Survived|Survival/ 0 = No, 1 = Yes|
|Pclass	|Ticket class/ 1 = 1st, 2 = 2nd, 3 = 3rd|
|Sex	|Sex|
|Age	|Age in years|
|Sibsp	|# of siblings / spouses aboard the Titanic|
|Parch	|# of parents / children aboard the Titanic|
|Ticket	|Ticket number|
|Embarked|Port of Embarkation/ C = Cherbourg, Q = Queenstown, S = Southampton|


In [28]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_y = pd.read_csv('gender_submission.csv')
test_y

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [29]:
test_y_new = test_y.loc[test_y['PassengerId'] > 891]
test_y = test_y_new['Survived']
test_y.head(3)

0    0
1    1
2    0
Name: Survived, dtype: int64

## Analyzing the Data

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
print(train.nunique())
train.dtypes

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [48]:
# my_report = sv.analyze(df, target_feat = "granted")
# my_report.show_html()

## Interactions

In [49]:
y_var = 'Survived'
interac =train
interac.drop(['Name', 'Cabin', 'Ticket'], inplace=True, axis=1)
interac = pd.get_dummies(interac, drop_first=True)

In [50]:
import itertools
for x in itertools.combinations(interac.columns.drop(y_var), 2):
    interac[f'{x[0]}_{x[1]}'] = interac[x[0]]*interac[x[1]]

### Clean useless interactions

In [51]:
interac = interac.drop(interac.columns[interac.nunique() == 1].tolist(),axis=1)
interac = interac.dropna()

## Apply Regression

In [14]:
import statsmodels.api as sm
Xtrain = interac.drop(y_var,axis=1).astype(float)
Xtrain['const'] = 1
ytrain = interac[[y_var]]
   
model = sm.OLS(ytrain, Xtrain).fit(max_iter=1000)
model.summary()

0,1,2,3
Dep. Variable:,Survived,R-squared:,0.479
Model:,OLS,Adj. R-squared:,0.445
Method:,Least Squares,F-statistic:,14.0
Date:,"Sat, 29 Oct 2022",Prob (F-statistic):,1.24e-68
Time:,18:11:27,Log-Likelihood:,-272.37
No. Observations:,714,AIC:,634.7
Df Residuals:,669,BIC:,840.4
Df Model:,44,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PassengerId,0.0004,0.000,1.407,0.160,-0.000,0.001
Pclass,-0.2174,0.091,-2.392,0.017,-0.396,-0.039
Age,-0.0084,0.006,-1.502,0.134,-0.019,0.003
SibSp,0.2631,0.151,1.745,0.082,-0.033,0.559
Parch,0.2774,0.111,2.498,0.013,0.059,0.495
Fare,-0.0051,0.002,-2.474,0.014,-0.009,-0.001
Sex_male,-0.6536,0.183,-3.566,0.000,-1.013,-0.294
Embarked_Q,0.6401,1.106,0.579,0.563,-1.532,2.813
Embarked_S,-0.1086,0.213,-0.511,0.610,-0.526,0.309

0,1,2,3
Omnibus:,53.474,Durbin-Watson:,1.886
Prob(Omnibus):,0.0,Jarque-Bera (JB):,63.688
Skew:,0.699,Prob(JB):,1.48e-14
Kurtosis:,3.434,Cond. No.,2950000.0


In [15]:
top_features = Xtrain.columns[model.pvalues < 0.06].drop('const',errors='ignore').to_list()
top_features

['Pclass',
 'Parch',
 'Fare',
 'Sex_male',
 'PassengerId_Embarked_Q',
 'Pclass_SibSp',
 'Pclass_Sex_male',
 'Age_Parch',
 'Age_Fare',
 'Age_Sex_male',
 'Age_Embarked_S']

In [17]:
Xtrain = interac[top_features].astype(float)
Xtrain['const'] = 1
ytrain = interac[[y_var]]
   
model = sm.OLS(ytrain, Xtrain).fit()
model.summary()

0,1,2,3
Dep. Variable:,Survived,R-squared:,0.44
Model:,OLS,Adj. R-squared:,0.431
Method:,Least Squares,F-statistic:,50.06
Date:,"Sat, 29 Oct 2022",Prob (F-statistic):,7.61e-81
Time:,18:15:25,Log-Likelihood:,-298.69
No. Observations:,714,AIC:,621.4
Df Residuals:,702,BIC:,676.2
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Pclass,-0.2261,0.033,-6.937,0.000,-0.290,-0.162
Parch,0.1966,0.044,4.447,0.000,0.110,0.283
Fare,-0.0021,0.001,-2.650,0.008,-0.004,-0.001
Sex_male,-0.4949,0.109,-4.530,0.000,-0.709,-0.280
PassengerId_Embarked_Q,-0.0003,0.000,-1.683,0.093,-0.001,4.26e-05
Pclass_SibSp,-0.0327,0.007,-4.983,0.000,-0.046,-0.020
Pclass_Sex_male,0.0919,0.037,2.466,0.014,0.019,0.165
Age_Parch,-0.0060,0.001,-4.832,0.000,-0.008,-0.004
Age_Fare,6.831e-05,2.26e-05,3.023,0.003,2.4e-05,0.000

0,1,2,3
Omnibus:,42.224,Durbin-Watson:,1.857
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48.439
Skew:,0.634,Prob(JB):,3.03e-11
Kurtosis:,3.134,Cond. No.,20900.0


## Remove Outliers

In [52]:
# train = train.loc[(train['Age'] <= 70) & (train['Age'] >= 1)]

### Formating Data

In [16]:
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
test['Fare'] = test['Fare'].fillna(0)

In [17]:
train_x = train[['PassengerId','Sex','Pclass', 'Age', 'Fare', 'Embarked', 'Parch', 'SibSp']] 
train_y = train['Survived']

test_x = test[['PassengerId','Sex','Pclass',  'Age', 'Fare', 'Embarked', 'Parch', 'SibSp']]


In [18]:
y_var = 'Survived'
train_x = pd.get_dummies(train_x,drop_first=True)
test_x = pd.get_dummies(test_x,drop_first=True)

## Feature Engineering

In [30]:
train_x['male_fare'] = 0
train_x['male_age'] = 0
test_x['male_fare'] = 0
test_x['male_age'] = 0


train_x.loc[(train_x['Fare'] < 20) & (train_x['Sex_male'] == 1), 'male_fare'] = 1
test_x.loc[(test_x['Fare'] < 20) & (test_x['Sex_male'] == 1), 'male_fare'] = 1

train_x.loc[(train_x['Age'] > 6) & (train_x['Sex_male'] == 1), 'male_age'] = 1
test_x.loc[(test_x['Age'] > 6) & (test_x['Sex_male'] == 1), 'male_age'] = 1

In [40]:
# print(train_x.count())
print(train_y.count())
# print(test_x.count())
print(test_y.count())
test_y.head()

891
418


0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64

# Models

## Decision Tree 

In [34]:
clf_dec = DecisionTreeClassifier(max_depth=2, min_samples_leaf=20, min_samples_split=0.2)
clf_dec.fit(train_x, train_y)
print(clf_dec.score(test_x, test_y))
print(f1_score(test_y, clf_dec.predict(test_x)))

0.8229665071770335
0.6837606837606838


In [35]:
pd.Series(
    clf_dec.feature_importances_,
    clf_dec.feature_names_in_
).sort_values().to_frame("Importance")

Unnamed: 0,Importance
PassengerId,0.0
Age,0.0
Fare,0.0
Parch,0.0
SibSp,0.0
Sex_male,0.0
Embarked_Q,0.0
Embarked_S,0.0
male_fare,0.0
Pclass,0.253817


In [38]:
clf_ln = LogisticRegression()
clf_ln.fit(train_x, train_y)
print(clf_ln.score(test_x, test_y))
print(f1_score(test_y, clf_ln.predict(test_x)))

0.9593301435406698
0.9453376205787781


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
clf_gnb = GaussianNB()
clf_gnb.fit(train_x, train_y)
print(clf_gnb.score(test_x, test_y)) 
print(f1_score(test_y, clf_gnb.predict(test_x)))

0.9330143540669856
0.9135802469135803
