In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('tweet_disaster/train.csv')
df = df[['text', 'target']]

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Print the first few rows of the training set
train_df.head()

# Print the first few rows of the validation set
val_df.head()

Unnamed: 0,text,target
2644,So you have a new weapon that can cause un-ima...,1
2227,The f$&amp;@ing things I do for #GISHWHES Just...,0
5448,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1
132,Aftershock back to school kick off was great. ...,0
6845,in response to trauma Children of Addicts deve...,0


In [3]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class DataPreprocessor:
    def __init__(self):
        self.stopwords = set(stopwords.words('english'))
        self.length = 0
    
    def preprocess_text(self, text):
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        
        # Tokenize text
        tokens = word_tokenize(text)
        
        # Remove stopwords
        tokens = [token for token in tokens if token.lower() not in self.stopwords]
        
        # Convert tokens to lowercase
        tokens = [token.lower() for token in tokens]
        
        self.length += len(tokens)

        return tokens
    
    def preprocess_dataframe(self, df):
        df['preprocessed_text'] = df['text'].apply(self.preprocess_text)
        return df

train_df = DataPreprocessor().preprocess_dataframe(train_df)
val_df = DataPreprocessor().preprocess_dataframe(val_df)

print(train_df['preprocessed_text'].iloc[0])
print(len(train_df))
train_df.head()

['courageous', 'honest', 'analysis', 'need', 'use', 'atomic', 'bomb', '1945', 'hiroshima70', 'japanese', 'military', 'refused', 'surrender']
6090


Unnamed: 0,text,target,preprocessed_text
4996,Courageous and honest analysis of need to use ...,1,"[courageous, honest, analysis, need, use, atom..."
3263,@ZachZaidman @670TheScore wld b a shame if tha...,0,"[zachzaidman, 670thescore, wld, b, shame, golf..."
4907,Tell @BarackObama to rescind medals of 'honor'...,1,"[tell, barackobama, rescind, medals, honor, gi..."
2855,Worried about how the CA drought might affect ...,1,"[worried, ca, drought, might, affect, extreme,..."
4716,@YoungHeroesID Lava Blast &amp; Power Red #Pan...,0,"[youngheroesid, lava, blast, amp, power, red, ..."


In [4]:
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0
    
    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
    
    def __len__(self):
        return len(self.word2idx)
    
    def build_vocab(self, df):
        for index, row in df.iterrows():
            for word in row['preprocessed_text']:
                self.add_word(word)
    
    def transform_text(self, words):
        tokens = []
        for word in words:
            if word in self.word2idx:
                tokens.append(self.word2idx[word])
        return tokens
    
    def transform_df(self, df):
        df['transformed_text'] = df['text'].apply(self.transform_text)
        return df
    

vocab = Vocabulary()
vocab.build_vocab(train_df)
vocab.build_vocab(val_df)

# print(f"Length of vocabulary: {len(vocab)}")
# print(f"Most common words: {vocab.word2idx}")

train_df = vocab.transform_df(train_df)
val_df = vocab.transform_df(val_df)

class Padder:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
    
    def fit(self, df):
        self.max_len = 0
        for index, row in df.iterrows():
            if len(row['transformed_text']) > self.max_len:
                self.max_len = len(row['transformed_text'])
    
    def transform(self, df):
        transformed_text = []
        for index, row in df.iterrows():
            text = row['transformed_text']
            if len(text) < self.max_len:
                text = np.append(text, [self.pad_idx] * (self.max_len - len(text)))
            else:
                text = text[:self.max_len]
            # print(len(text), self.max_len)
            transformed_text.append(text)
        df['padded_text'] = transformed_text
        return df
    


# Pad the sequences in the dataframe
pad_idx = len(vocab)
padder = Padder(pad_idx)
padder.fit(train_df)
padder.fit(val_df)
train_df = padder.transform(train_df)
val_df = padder.transform(val_df)


In [5]:
input = torch.tensor(train_df['padded_text'].iloc[:2].tolist())
input.shape

embedder = nn.Embedding(len(vocab) + 1, 100)
output = embedder(input)

input.shape, output.shape

  input = torch.tensor(train_df['padded_text'].iloc[:2].tolist())


(torch.Size([2, 69]), torch.Size([2, 69, 100]))

In [6]:
# Dataloader
class DisasterTweetsDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df['padded_text'].iloc[index]
        target = self.df['target'].iloc[index]
        return torch.tensor(text), torch.tensor(target)

train_dataset = DisasterTweetsDataset(train_df)
val_dataset = DisasterTweetsDataset(val_df)


batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

train_dataset[0], train_loader.dataset[0]

((tensor([  233,  3460,  2601,  2239,   233,  1400,  3566,  1400,  2239,  1400,
           6289,  4344,  1400,  2239,  2239,   233,  2239,  1267,    16,  1400,
            289,  1198,   442,    81,  3460,  3566,   290,  3569,  3188,  1400,
           2239,  2239,  6289,  3460,  3460,  2239,  4344,   233,  2239,   233,
           3460,  3460,  2239,  1400,  2239,  3460,  3566,  3188,  1267,  3566,
           3188, 17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971,
          17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971, 17971],
         dtype=torch.int32),
  tensor(1)),
 (tensor([  233,  3460,  2601,  2239,   233,  1400,  3566,  1400,  2239,  1400,
           6289,  4344,  1400,  2239,  2239,   233,  2239,  1267,    16,  1400,
            289,  1198,   442,    81,  3460,  3566,   290,  3569,  3188,  1400,
           2239,  2239,  6289,  3460,  3460,  2239,  4344,   233,  2239,   233,
           3460,  3460,  2239,  1400,  2239,  3460,  3566,  3188,  1267,  3566,
   

In [11]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        return self.fc(hidden.squeeze(0))

vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.5
num_epochs = 10
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

# Define the optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


The model has 4,107,857 trainable parameters


In [12]:

# Define the training function
def train(model, criterion, optimizer, train_loader, val_loader, num_epochs, device, clip=5):
    # Initialize the running values for printing training loss and validation loss
    running_loss = 0.0
    val_running_loss = 0.0
    
    model.train_losses = []
    # For each epoch
    for epoch in range(num_epochs):
        # Set model to training mode
        model.train()
        
        # For each batch in the dataloader
        for i, (tweets, labels) in enumerate(train_loader):
            # Zero out the gradients
            optimizer.zero_grad()
            
            # Get the outputs
            outputs = model(tweets)
            
            # Calculate the loss
            loss = criterion(outputs.squeeze(), labels)
            
            # Backpropagate the loss
            loss.backward()
            
            # Clip the gradients
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
            
            # Update the parameters
            optimizer.step()
            
            # Update the running loss
            running_loss += loss.item()
            
            # Print the epoch, batch, loss
            if (i+1) % 100 == 0:
                print('Epoch: [{}/{}],\tStep: [{}/{}],\tLoss: {}'.format(
                    epoch+1, num_epochs, i+1, len(train_loader), loss.item()))
            
            
        
        # Validate the model
        model.eval()
        
        # For each batch in the dataloader
        for i, (tweets, labels) in enumerate(val_loader):
            # Get the outputs
            outputs = model(tweets)
            
            # Calculate the loss
            loss = criterion(outputs.squeeze(), labels)

            # calculate accuracy
            pred = torch.round(torch.sigmoid(outputs.squeeze()))
            correct_tensor = pred.eq(labels.view_as(pred))
            correct = np.squeeze(correct_tensor.numpy())
            # calculate test accuracy for each object class
            # for i in range(len(labels)):
            #     label = labels[i]
            #     class_correct[label] += correct[i].item()

            
            # Update the running loss
            val_running_loss += loss.item()
        
        # Print the epoch, training loss, validation loss
        print('Epoch: [{}/{}],\tTraining Loss: {},\tValidation Loss: {},\tAccuracy: {}'.format(
            epoch+1, num_epochs, running_loss/len(train_loader), val_running_loss/len(val_loader), np.mean(correct)))
        
        # Append the training loss and validation loss
        model.train_losses.append(running_loss/len(train_loader))
        model.val_losses.append(val_running_loss/len(val_loader))
        
        # Reset the running loss
        running_loss = 0.0
        val_running_loss = 0.0


# Train the model
train(model, criterion, optimizer, train_loader, val_loader, num_epochs, device)


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.DoubleTensor instead (while checking arguments for embedding)

In [None]:
# Plot the training and validation losses
plt.plot(model.train_losses, label='Training loss')
plt.plot(model.val_losses, label='Validation loss')
plt.legend()
plt.show()

In [None]:
# precision on validation set
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

def evaluate(model, val_loader):
    y_true = []
    y_pred = []
    model.eval()
    for i, (tweets, labels) in enumerate(val_loader):
        outputs = model(tweets)
        y_pred.extend(torch.round(torch.sigmoid(outputs.squeeze())).tolist())
        y_true.extend(labels.tolist())
    print('precision: {}'.format(precision_score(y_true, y_pred, zero_division=1)))
    print('recall: {}'.format(recall_score(y_true, y_pred)))
    print('f1: {}'.format(f1_score(y_true, y_pred)))
    print('confusion matrix: {}'.format(confusion_matrix(y_true, y_pred)))

evaluate(model, train_loader)
evaluate(model, val_loader)

In [None]:

# Define the test dataset
test_df = pd.read_csv('tweet_disaster/test.csv')
test_df = test_df[['text']]
test_df = DataPreprocessor().preprocess_dataframe(test_df)
test_df = vocab.transform_df(test_df)
test_df = pad_sequences(test_df, max_len)
test_vectors = torch.FloatTensor(np.array(test_df['padded_text'].tolist()))

# Make predictions on the test dataset
with torch.no_grad():
    predictions = model(test_vectors)
    predictions = np.round(torch.sigmoid(predictions).numpy()).astype(int).reshape(-1)

# Create a dataframe with the tweet ids and their predictions
submission_df = pd.read_csv('tweet_disaster/sample_submission.csv')
submission_df['target'] = predictions
submission_df.head()

