In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training set from train.csv into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [None]:
# Preview the first five rows to see which columns the training data holds.
train_data.head(n=5)

In [None]:
# Visualise which entries are null. From the figure: a lot of data is missing
# in the Cabin column, and about 20% of the Age column is missing.
sns.heatmap(train_data.isnull(), cmap="viridis", cbar=False, yticklabels=False)

In [None]:
# Use seaborn's "whitegrid" style for the plots drawn after this cell.
sns.set_style(style="whitegrid")

In [None]:
# Count survivors and non-survivors, with each bar split by sex.
sns.countplot(data=train_data, x="Survived", hue="Sex")

In [None]:
# Count survivors and non-survivors, with each bar split by passenger class.
sns.countplot(data=train_data, x="Survived", hue="Pclass")

In [None]:
# Histogram of the known ages (nulls removed first), 30 bins, no KDE overlay.
sns.displot(train_data["Age"].dropna(), bins=30, kde=False)

In [None]:
# Summarise column dtypes and non-null counts of the raw training data.
train_data.info()

In [None]:
# Count how many rows fall under each SibSp value.
sns.countplot(data=train_data, x="SibSp")

In [None]:
# Histogram of the Fare column using 40 bins.
sns.displot(train_data["Fare"], bins = 40)

In [None]:
# Box plot of Age for each passenger class, drawn on a 10x7 inch figure.
plt.figure(figsize=(10, 7))
sns.boxplot(data=train_data, x="Pclass", y="Age")

In [None]:
def impute_age(cols):
    """Return a row's age, substituting a per-class default when it is missing.

    Parameters
    ----------
    cols : pandas.Series or sequence
        One row holding an age and a passenger class. A Series carrying
        "Age" and "Pclass" labels is read by label; any other Series is read
        by position via ``.iloc``; a plain sequence is read as
        ``(age, pclass)``.

    Returns
    -------
    The age unchanged when it is not null; otherwise 37 for class 1,
    29 for class 2 and 24 for every other class value.
    """
    # Integer keys on a label-indexed Series (``cols[0]``) are deprecated in
    # pandas, so the row is read by label, with an explicit positional
    # fallback for unlabeled input.
    if isinstance(cols, pd.Series):
        if "Age" in cols.index and "Pclass" in cols.index:
            age, pclass = cols["Age"], cols["Pclass"]
        else:
            age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age

    # Fallback ages per class -- presumably read off the Age-by-Pclass box
    # plot drawn earlier in the notebook; TODO confirm against the data.
    fallback_by_class = {1: 37, 2: 29}
    return fallback_by_class.get(pclass, 24)

In [None]:
# Fill missing ages row by row: each (Age, Pclass) pair is passed to impute_age.
train_data["Age"] = train_data[["Age", "Pclass"]].apply(impute_age, axis="columns")

In [None]:
# Re-check non-null counts now that Age has been filled in.
train_data.info()

In [None]:
# Redraw the null map to see which columns still contain missing values.
sns.heatmap(train_data.isnull(), cbar=False, yticklabels=False)

In [None]:
# Remove the Cabin column entirely instead of imputing it.
train_data.drop(columns="Cabin", inplace=True)

In [None]:
# Preview the frame after dropping Cabin.
train_data.head(n=5)

In [None]:
# Discard every row that still has at least one null value.
train_data.dropna(axis=0, how="any", inplace=True)

In [None]:
# One-hot encode Sex; drop_first leaves out the first category's indicator column.
sex = pd.get_dummies(data=train_data["Sex"], drop_first=True)

In [None]:
# Preview the encoded Sex indicator column(s).
sex.head(n=5)

In [None]:
# One-hot encode Embarked; drop_first leaves out the first category's indicator column.
embark = pd.get_dummies(data=train_data["Embarked"], drop_first=True)

In [None]:
# Append the Sex and Embarked indicator columns to the right of the frame.
train_data = pd.concat(objs=[train_data, sex, embark], axis="columns")

In [None]:
# Preview the frame with the indicator columns attached.
train_data.head(n=5)

In [None]:
# Drop the original Sex/Embarked columns (now encoded) together with Name and Ticket.
train_data.drop(columns=["Sex", "Embarked", "Name", "Ticket"], inplace=True)

In [None]:
# Preview the frame after removing the text columns.
train_data.head(n=5)

In [None]:
# Drop the PassengerId column so it is not part of the feature matrix built below.
# The axis is given as the integer 1 (columns); a boolean such as True is only
# accepted by pandas because True == 1 and hides the intent.
train_data.drop("PassengerId", axis = 1, inplace = True)

In [None]:
# Preview the frame that feeds the model.
train_data.head(n=5)

In [None]:
# Separate the target (Survived) from the features (every other column).
y = train_data["Survived"]
x = train_data.drop(columns="Survived")

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=101, test_size=0.3)

In [None]:
#create model
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline logistic regression using scikit-learn's default settings.
logModel = LogisticRegression()

In [None]:
# Fit the baseline model on the training split.
logModel.fit(X_train, y_train)

In [None]:
# Predict a class label for every held-out row and print the labels.
prediction = logModel.predict(X=X_test)
print(prediction)

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# Per-class probabilities for the held-out rows (one column per class).
predict_prob = logModel.predict_proba(X=X_test)

In [None]:
# ROC AUC on the test split, scored with the second probability column.
auc = roc_auc_score(y_true=y_test, y_score=predict_prob[:, 1])
print(auc)

In [None]:
# ROC curve of the baseline model on the test split.
fpr, tpr, threshold = roc_curve(y_test, predict_prob[:, 1])
plt.plot(fpr, tpr, label="ROC", color="orange")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle="--", color="green")
plt.ylabel("TPR")
plt.xlabel("FPR")
plt.legend()

In [None]:
# L1 regularization: logistic regression with an l1 penalty, C = 4,
# fitted with the liblinear solver.
logModel_l1 = LogisticRegression(C=4, penalty="l1", solver="liblinear")


In [None]:
# 5-fold cross-validation of the L1 model on the training split.
# This scores the single fixed setting C = 4; it does not search over the
# regularization strength.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(logModel_l1, X_train, y_train, cv=5)
print("the accuracy is ", scores.mean())

In [None]:
# Fit the L1 model on the training split. X_train is the same feature matrix
# used when cross-validating this model; no separate L1-specific matrix is
# defined anywhere in the notebook.
logModel_l1.fit(X_train, y_train)

In [None]:
# Per-class probabilities from the L1 model for the held-out rows. X_test is
# the test split created by train_test_split; no separate L1-specific test
# matrix is defined anywhere in the notebook.
prediction_proba_l1 = logModel_l1.predict_proba(X_test)

In [None]:
# ROC AUC of the L1 model on the test split, scored with the second probability column.
auc_l1 = roc_auc_score(y_true=y_test, y_score=prediction_proba_l1[:, 1])

In [None]:
# Report the test-set ROC AUC of the L1-regularized model.
print("auc after l1 regilazation is ", auc_l1)

In [None]:
# L2 regularization: logistic regression with an l2 penalty, C = 4,
# fitted with the liblinear solver.
logModel_l2 = LogisticRegression(C=4, penalty="l2", solver="liblinear")

In [None]:
# 5-fold cross-validation of the L2 model on the training split.
# This scores the single fixed setting C = 4; it does not search over the
# regularization strength.
scores_l2 = cross_val_score(logModel_l2, X_train, y_train, cv=5)
print("the accuracy for logistic regression for ridge penalty is ", scores_l2.mean())

In [None]:
# Fit the L2 model on the training split.
logModel_l2.fit(X_train, y_train)

In [None]:
# Per-class probabilities from the L2 model for the held-out rows.
prediction_proba_l2 = logModel_l2.predict_proba(X=X_test)

In [None]:
# ROC AUC of the L2 model on the test split, scored with the second probability column.
auc_l2 = roc_auc_score(y_true=y_test, y_score=prediction_proba_l2[:, 1])

In [None]:
# Report the test-set ROC AUC of the L2-regularized model.
print("auc after l2 regilazation is ", auc_l2)