<a href="https://colab.research.google.com/github/mori8/NLP-Pytorch-practice/blob/main/Chapter_8_DataLoader_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import torch
import numpy as np
import pandas as pd
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [85]:
class TitanicDataset(Dataset):
  """Titanic passenger table turned into tensors for a ``DataLoader``.

  The CSV is read once in the constructor and feature-engineered:

  * ``Initial``     - salutation extracted from ``Name``, normalised to
                      Mr / Mrs / Miss / Master / Other and integer-encoded.
  * ``Age_band``    - age bucket 0..4 (missing ages are first filled with a
                      fixed per-salutation value).
  * ``Family_Size`` - ``Parch + SibSp``; ``Alone`` is 1 when that sum is 0.
  * ``Fare_cat``    - fare bucket 0..3.
  * ``Sex`` and ``Embarked`` are integer-encoded.

  ``Name``, ``Age``, ``Ticket``, ``Fare`` and ``Cabin`` are dropped afterwards.

  Args:
    mode: ``'train'`` yields ``(features, label)`` pairs; any other value
      yields features only.
    csv_path: CSV file to read. Defaults to ``DEFAULT_CSV_PATH`` (the path
      that used to be hard-coded), so existing ``TitanicDataset()`` calls are
      unaffected.

  Raises:
    ValueError: if ``Sex``, ``Embarked`` or the extracted salutation contains
      a non-null value that has no integer code.

  NOTE(review): in ``'train'`` mode the features are ``columns[1:]`` of the
  processed frame, which *starts at* ``Survived`` - the label is therefore part
  of ``x_data`` (label leakage). The 11-column width is preserved here because
  models sized for 11 inputs depend on it; removing the label requires changing
  the model's input size at the same time.
  """

  DEFAULT_CSV_PATH = '/content/drive/MyDrive/NLP-Pytorch/data/titanic_train.csv'

  # Rare salutations folded into the five groups used downstream.
  TITLE_ALIASES = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other', 'Dona': 'Other',
  }
  # Age used to fill missing values, keyed by normalised salutation.
  AGE_BY_TITLE = {'Mr': 32, 'Mrs': 39, 'Master': 7, 'Miss': 22, 'Other': 42}

  SEX_CODES = {'male': 0, 'female': 1}
  EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
  TITLE_CODES = {'Mr': 0, 'Mrs': 1, 'Miss': 2, 'Master': 3, 'Other': 4}

  @staticmethod
  def _encode(column, codes):
    """Map a categorical column to integer codes, failing loudly on unknowns.

    Missing values stay missing; a non-null value absent from ``codes``
    raises ``ValueError`` instead of silently becoming NaN.
    """
    encoded = column.map(codes)
    unknown = column[encoded.isna() & column.notna()]
    if not unknown.empty:
      raise ValueError(
        'Unmapped values in column %r: %s'
        % (column.name, sorted(set(unknown.astype(str)))))
    return encoded

  def __init__(self, mode='train', csv_path=None):
    self.mode = mode
    if csv_path is None:
      csv_path = self.DEFAULT_CSV_PATH
    d = pd.read_csv(csv_path)
    self.len = d.shape[0]

    # Salutation = the run of letters directly before the first '.' in Name.
    # One vectorised call; expand=False returns a Series rather than a frame.
    d['Initial'] = d['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign results back instead of chained `inplace=True`, which does not
    # update the frame under pandas Copy-on-Write.
    d['Initial'] = d['Initial'].replace(self.TITLE_ALIASES)

    # Fill missing ages with the fixed value for the passenger's salutation.
    for title, age in self.AGE_BY_TITLE.items():
      d.loc[d['Age'].isnull() & (d['Initial'] == title), 'Age'] = age

    # Age buckets: <=16 -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3,
    # >64 -> 4. Rows whose age is still missing keep the initial 0.
    d['Age_band'] = 0
    d.loc[(d['Age'] > 16) & (d['Age'] <= 32), 'Age_band'] = 1
    d.loc[(d['Age'] > 32) & (d['Age'] <= 48), 'Age_band'] = 2
    d.loc[(d['Age'] > 48) & (d['Age'] <= 64), 'Age_band'] = 3
    d.loc[d['Age'] > 64, 'Age_band'] = 4

    d['Embarked'] = d['Embarked'].fillna('S')

    d['Family_Size'] = d['Parch'] + d['SibSp']
    d['Alone'] = (d['Family_Size'] == 0).astype(int)

    # Fare buckets: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
    # (31, 513] -> 3. Missing or larger fares keep the initial 0.
    d['Fare_cat'] = 0
    d.loc[(d['Fare'] > 7.91) & (d['Fare'] <= 14.454), 'Fare_cat'] = 1
    d.loc[(d['Fare'] > 14.454) & (d['Fare'] <= 31), 'Fare_cat'] = 2
    d.loc[(d['Fare'] > 31) & (d['Fare'] <= 513), 'Fare_cat'] = 3

    d['Sex'] = self._encode(d['Sex'], self.SEX_CODES)
    d['Embarked'] = self._encode(d['Embarked'], self.EMBARKED_CODES)
    d['Initial'] = self._encode(d['Initial'], self.TITLE_CODES)

    d = d.drop(columns=['Name', 'Age', 'Ticket', 'Fare', 'Cabin'])
    print(d.columns)

    if self.mode == 'train':
      # NOTE(review): columns[1:] skips only the first column, so 'Survived'
      # (the label) is included in the features - see the class docstring.
      features = d[d.columns[1:]].to_numpy(dtype=np.float32)
      self.x_data = torch.tensor(features, dtype=torch.float32)
      self.y_data = torch.tensor(d['Survived'].values, dtype=torch.long)
    else:
      # Every remaining column is used as a feature.
      self.x_data = torch.tensor(d.to_numpy(dtype=np.float32), dtype=torch.float32)

  def __getitem__(self, index):
    """Return ``(features, label)`` in train mode, otherwise ``features``."""
    if self.mode == 'train':
      return self.x_data[index], self.y_data[index]
    else:
      return self.x_data[index]

  def __len__(self):
    """Number of rows read from the CSV."""
    return self.len

In [86]:
# Build the training split and serve it in shuffled mini-batches of 64,
# loaded by two worker processes.
dataset = TitanicDataset()
train_loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=2)

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'SibSp', 'Parch',
       'Embarked', 'Initial', 'Age_band', 'Family_Size', 'Alone', 'Fare_cat'],
      dtype='object')


In [87]:
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
  """Fully connected binary classifier that outputs a probability.

  Layout: ``in_features -> 512 -> 1024 -> 512 -> 1`` with ReLU and a shared
  dropout (p=0.5) after each hidden layer, then a sigmoid on the single
  output unit.

  Args:
    in_features: width of the input feature vector. Defaults to 11, the
      value that was previously hard-coded.
  """

  def __init__(self, in_features=11):
    super().__init__()
    self.fc1 = nn.Linear(in_features, 512)
    self.fc2 = nn.Linear(512, 1024)
    self.fc3 = nn.Linear(1024, 512)
    self.fc4 = nn.Linear(512, 1)
    self.dropout = nn.Dropout(0.5)

  def forward(self, x):
    """Map a ``(batch, in_features)`` float tensor to ``(batch, 1)`` probabilities."""
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    x = F.relu(self.fc2(x))
    x = self.dropout(x)
    x = F.relu(self.fc3(x))
    # Must be an assignment: this line used to read `x - self.dropout(x)`,
    # which discarded the result so dropout never reached the last layer.
    x = self.dropout(x)
    # torch.sigmoid replaces the deprecated F.sigmoid.
    y_pred = torch.sigmoid(self.fc4(x))

    return y_pred

In [90]:
# Binary cross-entropy on the sigmoid probabilities produced by the classifier,
# optimised with plain SGD at a learning rate of 0.01.
criterion = nn.BCELoss()
model = Classifier()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

In [100]:
# Train for two passes over the data, printing "<epoch> <batch> <loss>" for
# every mini-batch.
model.train()  # make sure dropout is active, even if eval() was called earlier
for epoch in range(2):
  for i, (inputs, labels) in enumerate(train_loader):
    # Tensors track gradients themselves, so the deprecated
    # torch.autograd.Variable wrapper is not needed.

    # Forward pass: compute predicted y by passing x to the model
    y_pred = model(inputs)
    # BCELoss needs float targets shaped like the (batch, 1) predictions.
    labels = labels.unsqueeze(1).float()

    # Compute and print loss
    loss = criterion(y_pred, labels)
    print(epoch, i, loss.item())

    # Zero gradients, perform a backward pass, and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()



0 0 0.6575018763542175
0 1 0.6522390246391296
0 2 0.6644057631492615
0 3 0.6821078062057495
0 4 0.6469085812568665
0 5 0.6539075374603271
0 6 0.6867501139640808
0 7 0.6553294658660889
0 8 0.6650415062904358
0 9 0.6595397591590881
0 10 0.6599054932594299
0 11 0.6363183856010437
0 12 0.6575401425361633
0 13 0.6755999326705933
1 0 0.6281698346138
1 1 0.6391613483428955
1 2 0.6656372547149658
1 3 0.6362169981002808
1 4 0.6694052815437317
1 5 0.619015634059906
1 6 0.6954407691955566
1 7 0.667020857334137
1 8 0.6571271419525146
1 9 0.6468816995620728
1 10 0.668258786201477
1 11 0.6553488969802856
1 12 0.6316803097724915
1 13 0.6419953107833862
