### Download and unzip dataset

In [123]:
# Download the Titanic competition files into the current directory (`-p .`).
# NOTE(review): presumably requires the Kaggle CLI to be installed and API credentials configured — confirm.
!kaggle competitions download -c titanic -p .

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [124]:
import numpy as np
import pandas as pd
import zipfile


In [125]:
# Unpack every member of the downloaded archive into the working directory.
archive = zipfile.ZipFile("titanic.zip", "r")
with archive:
    archive.extractall(path=".")


In [126]:
# Load both competition splits from the extracted CSV files.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [127]:
# Preview the first rows of the raw training data (includes the `Survived` label).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [128]:
# Preview the first rows of the raw test data (same columns, without `Survived`).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Preprocessing:
- Extract Title from Name

In [129]:
# The title sits between the comma after the surname and the first period,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
def _title_from_name(full_name):
    """Return the honorific embedded in a 'Surname, Title. Given names' string."""
    before_period = full_name.split('.')[0]
    return before_period.split(',')[1].strip()

for frame in (train, test):
    frame['Title'] = frame['Name'].apply(_title_from_name)

train['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [130]:
allowed_titles = ['Mr', 'Miss', 'Master', 'Mrs']

def _group_rare_title(title):
    """Keep the four common titles and collapse every other one into 'Misc'."""
    if title in allowed_titles:
        return title
    return 'Misc'

# Apply the same grouping to both splits so they share one set of categories.
for frame in (train, test):
    frame['Title'] = frame['Title'].apply(_group_rare_title)

for frame in (train, test):
    print(frame['Title'].value_counts())

Title
Mr        517
Miss      182
Mrs       125
Master     40
Misc       27
Name: count, dtype: int64
Title
Mr        240
Miss       78
Mrs        72
Master     21
Misc        7
Name: count, dtype: int64


- Drop all irrelevant columns
- Map `Sex` to (0,1)
- Fillna `Embarked` with the most frequent value
- Fillna `Fare` with median 
- Fillna `Age` with median based on `Title` group

In [131]:
# Columns that are not used as model features.
unused_columns = ['PassengerId', 'Ticket', 'Name', 'Cabin', 'SibSp', 'Parch']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)

# Encode Sex as a binary feature.
train["Sex"] = train["Sex"].map({"male": 0, "female": 1})
test["Sex"] = test["Sex"].map({"male": 0, "female": 1})

# All imputation statistics are computed on the training set only and then
# reused for the test set, so no preprocessing is fitted on the test data and
# both splits are filled consistently.
embarked_mode = train["Embarked"].mode()[0]
fare_median = train["Fare"].median()
title_age_median = train.groupby('Title')['Age'].median()
# Fallback for a Title group that has no known age in the training set.
overall_age_median = train['Age'].median()

for frame in (train, test):
    frame["Embarked"] = frame["Embarked"].fillna(embarked_mode)
    frame["Fare"] = frame["Fare"].fillna(fare_median)
    # Vectorised per-title imputation: look up each row's title median and use
    # it only where Age is missing.
    age_by_title = frame['Title'].map(title_age_median)
    frame['Age'] = frame['Age'].fillna(age_by_title).fillna(overall_age_median)


- Apply a `log1p` transform to the numeric columns (`Age`, `Fare`) to reduce the skew of their distributions

In [132]:
numeric_cols = ['Age','Fare']

# log1p = log(1 + x); applied identically to both splits.
for frame in (train, test):
    frame[numeric_cols] = np.log1p(frame[numeric_cols])


- One hot encoding for categorical features.

In [133]:
# Expand the remaining string columns into indicator columns and cast every
# column (indicators included) to float.
train, test = (pd.get_dummies(frame).astype(float) for frame in (train, test))

In [134]:
# Inspect the training frame after encoding: every column is now numeric.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,0.0,3.0,0.0,3.135494,2.110213,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.0,1.0,3.663562,4.280593,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,3.0,1.0,3.295837,2.188856,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,1.0,1.0,3.583519,3.990834,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,3.0,0.0,3.583519,2.202765,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [135]:
# Inspect the test frame after encoding: same feature columns, no `Survived`.
test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Misc,Title_Miss,Title_Mr,Title_Mrs
0,3.0,0.0,3.569533,2.178064,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,1.0,3.871201,2.079442,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2.0,0.0,4.143135,2.369075,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3.0,0.0,3.332205,2.268252,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,3.0,1.0,3.135494,2.586824,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


- Split the training dataset to `X_train` and `y_train`
- Align `X_train` and `X_test` so that both have the same feature columns in the same order

In [136]:
# Separate the label from the feature columns.
train_features = train.drop(columns=['Survived'])
train_labels = train['Survived']

# Give the test frame exactly the training feature columns (same order);
# a column missing from test is created and filled with 0.
train_features, test_features = train_features.align(
    test, join='left', axis=1, fill_value=0
)

# The model code works on plain NumPy arrays.
X_train, y_train, X_test = (
    train_features.values,
    train_labels.values,
    test_features.values,
)


- Define function `fit` to optimize the parameters based on gradient descent 
- Define function `predict` to classify the `test` dataset

In [137]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    The naive formula overflows in ``np.exp(-z)`` for large negative ``z``
    (RuntimeWarning and an ``inf`` intermediate). Here the exponential is only
    ever taken of a non-positive number, so it stays within [0, 1].

    Parameters
    ----------
    z : scalar or array-like of numbers

    Returns
    -------
    NumPy float for scalar input, otherwise an ndarray with the shape of ``z``,
    with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # exp of a value <= 0: cannot overflow
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — the same
    # quantity, rewritten so that neither branch overflows.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # `[()]` unwraps a 0-d array to a scalar and is a no-op for n-d arrays.
    return result[()]

def fit(X, y, learning_rate=0.01, epochs=1000):
    """Fit logistic-regression parameters with full-batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix, one row per sample.
    y : array-like with m elements
        Binary labels (0/1). Flattened to 1-D, so a column vector of shape
        (m, 1) is accepted instead of silently broadcasting against the
        predictions into an (m, m) matrix.
    learning_rate : float, default 0.01
        Step size of each gradient update.
    epochs : int, default 1000
        Number of full passes over the data.

    Returns
    -------
    (weights, bias) : ndarray of shape (n,) and a float.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of rows in ``X``.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} rows but y has {y.shape[0]} labels"
        )

    weights = np.zeros(n)
    bias = 0.0
    for _ in range(epochs):
        predictions = sigmoid(np.dot(X, weights) + bias)
        # Residual is shared by both gradients; compute it once per epoch.
        error = predictions - y
        dw = (1 / m) * np.dot(X.T, error)
        db = (1 / m) * np.sum(error)
        weights -= learning_rate * dw
        bias -= learning_rate * db
    return weights, bias

def predict(X,weights,bias):
    """Classify each row of ``X`` as 1 when its predicted probability is at least 0.5, else 0."""
    scores = np.dot(X, weights)
    probabilities = sigmoid(scores + bias)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)


- Train with `epochs=500000`, reduce if needed 

In [138]:
# Train on the full training set, then show the learned parameters
# (weights on the first line(s), bias on the last).
weights, bias = fit(X_train, y_train, epochs=500_000)
print(weights, bias, sep="\n")

[-1.26983973  3.63072777 -0.4137324  -0.17476546  0.93074158  0.89364274
  0.32045106  2.31593002  0.11413606 -0.61773652  0.36084967 -0.02834385]
2.144835369307404


- Write to csv to submit to kaggle.

In [None]:
y_pred = predict(X_test, weights, bias)

# `PassengerId` was dropped from `test` during preprocessing. Read the real ids
# back from the source file instead of reconstructing them from the row index
# plus a hard-coded offset.
passenger_ids = pd.read_csv("test.csv", usecols=["PassengerId"])["PassengerId"]
assert len(passenger_ids) == len(y_pred), "prediction count does not match test.csv"

submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_pred})
submission.to_csv('submission.csv', index=False)

: 

### Screenshot here: https://github.com/phuongwhuynh/NLP_Lab/blob/main/Lab67/hw2/screenshot_hw2.png