We followed the guide about using CNNs for Sentence Classification from Chris Tran (https://chriskhanhtran.github.io/posts/cnn-sentence-classification/). Thanks to Jesica Ramírez for her recommendation and guidance. 

In [167]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing as pp
import torch
import random

In [169]:
SEED = 30255 # Specify a seed for reproducability

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [170]:
df = pd.read_csv('df_clean.csv')

In [171]:
df.sentimentnum.unique()

array([2, 1, 0], dtype=int64)

In [172]:
print('Max number of words for a given tweet in the Dataset:')
max_len = df.Text.apply(lambda x: len(x.split())).max()
print(max_len)

Max number of words for a given tweet in the Dataset:
68


In [173]:
# Balancing classes
from sklearn.model_selection import train_test_split

positive = df[df['sentimentnum'] == 1]
negative = df[df['sentimentnum'] == 2]
neutral = df[df['sentimentnum'] == 0]


positive, rejected = train_test_split(positive,
                                             train_size=(neutral.shape[0]/
                                                         positive.shape[0]),random_state=0)
negative, rejected = train_test_split(negative,
                                             train_size=(neutral.shape[0]/
                                                         negative.shape[0]),random_state=0)

data = positive.append(negative)

In [174]:
data.groupby('sentimentclass').size()


sentimentclass
negative    42680
positive    42680
dtype: int64

In [175]:
'''
dataclassCNN.py
File to create a Custom Data Class and Collate Function for PyTorch.
This file is for the CNN model.
'''
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer


class ProjectDataset(Dataset):

    def __init__(self, data, target_col=None, text_col=None):

        # Target first, then Inputs.
        self.samples = []
        tokenizer = get_tokenizer('basic_english')

        if not target_col and not text_col:
            targets = list(data[0])
            inputs = list(data[1])
            for idx in range(len(targets)):
                text = tokenizer(inputs[idx])
                self.samples.append([targets[idx], text])
        else:
            for _, row in data.iterrows():
                text = row[text_col]
                text = tokenizer(text)
                target = row[target_col]
                self.samples.append([target, text])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [176]:
data_obj = ProjectDataset(data, 'sentimentnum', 'Text')

In [177]:
data_obj

<__main__.ProjectDataset at 0x1472081abc8>

In [178]:
from collections import Counter
from torchtext.vocab import Vocab


counter_words = Counter()
for (label, text) in data_obj:
    counter_words.update(text)
    
vocab_words = Vocab(counter_words)

print('The vocabulary length is {} words'.format(len(vocab_words)))

The vocabulary length is 46655 words


In [181]:
def collate_for_cnn(batch, max_len=max_len):
    '''
    For each batch, develop the appropiate inputs from the models (i.e. embeddings)
    '''
    input_vector = []
    labels = []
    for i, (label, tokenized_sent) in enumerate(batch):
        
        #Append labels 
        labels.append(label)
        
        #Get tokenized sentence
        diff =  (max_len - len(tokenized_sent))
        tokenized_sent += ['<pad>'] * diff
        
       
        assert(len(tokenized_sent) == max_len), tokenized_sent
        
        wordstoidx = [vocab_words.stoi[w] for w in tokenized_sent]
        input_vector.append(wordstoidx)
    
    return torch.tensor(labels).to(device), torch.tensor(input_vector).to(device)

In [182]:
from torch.utils.data import DataLoader

In [183]:
from sklearn.model_selection import train_test_split

# Train with Validation Data and Test Data Split
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    data['Text'], data['sentimentnum'], test_size=0.1, random_state=42)

#Train and Validation Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.1, random_state=42)

In [184]:
def data_loader(train_tuple, val_tuple, test_tuple,
                batch_size=30):
    """Convert train, validation and test sets into Dataloaders :)
    """

    # Create DataLoader for training data
    train_data = ProjectDataset(train_tuple)
    train_dataloader = DataLoader(train_data, shuffle=True,
                                  batch_size=batch_size, 
                                 collate_fn=collate_for_cnn)
    
    # Create DataLoader for validation data
    valid_data = ProjectDataset(val_tuple)
    val_dataloader = DataLoader(valid_data, shuffle=False,
                                  batch_size=batch_size, 
                                 collate_fn=collate_for_cnn)

    # Create DataLoader for test data
    test_data = ProjectDataset(test_tuple)
    test_dataloader = DataLoader(test_data, shuffle=False,
                                  batch_size=batch_size, 
                                 collate_fn=collate_for_cnn)

    return train_dataloader, val_dataloader, test_dataloader

In [185]:
train_dataloader, val_dataloader, test_dataloader = data_loader([train_labels, train_texts],
                                                               [val_labels, val_texts],
                                                               [test_labels, test_texts])

In [186]:
for i in train_dataloader:
    labels, inputs = i[0], i[1]
    print(inputs.shape)
    break

torch.Size([30, 68])


In [158]:
FILTER_SIZES = [3,4,5]
N_FILTERS = [100, 100,100]
DROPOUT = 0.5
PAD_IDX = vocab_words.stoi['<pad>']
num_classes = 2 #Low, Medium, High

In [159]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_polarization(nn.Module):
    ''' An 1D Convulational Neural Network for Sentence Classification'''
    
    def __init__(self, pretrained_embedding=None, freeze_embedding=False,
                 vocab_size=None, embed_dim=None,
                 filter_sizes=FILTER_SIZES, num_filters=N_FILTERS,
                 num_classes=num_classes,pad_id=PAD_IDX,
                 dropout=DROPOUT):
        """
        pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (int): Number of filters. Default: 100
            n_classes (int): Number of classes. Default: 3
            dropout (float): Dropout rate. Default: 0.5
        """

        super(CNN_polarization, self).__init__()
        
        # 1. Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx= pad_id,
                                          )
        
        # 2. Convolutional Layers (for each filter size --> n-gram)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = self.embed_dim,
                                              out_channels = num_filters[i], 
                                              kernel_size = fs)
                                    for i, fs in enumerate(filter_sizes)
                                    ])
        
    
        # 3. Fully-connected layer
        self.linear = nn.Linear(in_features = np.sum(num_filters), 
                                out_features = num_classes
                               )
        
        
        # Additional Feature
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):

        x_embedded = self.embedding(inputs)
        
        x_embedded = x_embedded.permute(0, 2, 1)
        
        # Apply CNN and ReLU
        convs_list = [F.relu(conv(x_embedded)) for conv in self.convs]

        # Max pooling.
        pooled = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]).squeeze(2)
                  for x_conv in convs_list]
        
        # Concatenate Pool list to feed the fully connected layer
        input_fc = self.dropout(torch.cat(pooled, dim = 1))
        
        # Compute probabilities
        predictions_classes = self.linear(input_fc)
        
        return predictions_classes

In [160]:
def train_an_epoch(model, dataloader, loss_function, optimizer):
    
    model.train() # Sets the module in training mode.
    total_loss = 0
    
    for idx, batch in enumerate(dataloader):
        
        # Load batch to GPU
        labels, inputs = tuple(t.to(device) for t in batch)
        
        # Zero out any previously calculated gradients
        model.zero_grad()
        
        #Perform a forward pass.
        log_probs = model(inputs)
        
        # Compute loss and accumulate the loss values
        loss = loss_function(log_probs, labels.long())
        total_loss += loss.item()
        
        # Perform a backward pass to calculate gradients
        loss.backward()
        optimizer.step()
    
    return total_loss/len(dataloader)

In [161]:
def evaluate(model, dataloader, loss_fn):
    '''
    Evaluate the model on the given data (e.g. validation data or test data).
    '''

    #As we are now using dropout, we must remember to use model.eval() 
    #to ensure the dropout is "turned off" while evaluating.
    model.eval()
    
    total_accuracy = []
    total_loss = []
    
    with torch.no_grad():
        # No gradients need to be maintained during evaluation
        for i, batch in enumerate(dataloader):
            
            # Load batch to Device
            labels, inputs = batch[0], batch[1]
            if USE_CUDA:
                labels, inputs = labels.cuda(), inputs.cuda()
                
            # Obtain probabilities of each class per sentence
            output = model(inputs)
            
            # Compute loss
            loss = loss_fn(output, labels.long())
            total_loss.append(loss.item())

            # Get the predictions
            preds = torch.argmax(output, dim=1).flatten()
            
            # Calculate the accuracy rate
            accuracy = (preds == labels).sum()/len(preds)
            total_accuracy.append(accuracy)
            

    return np.mean(total_loss), np.mean(total_accuracy)
            

In [162]:
import datetime

def train_validate(model, optimizer, train_dataloader, val_dataloader, pretrained_embedding, epochs=20):
    
    """Train the CNN model."""
    
    #Loss function
    loss_function = nn.CrossEntropyLoss()
    
    # Tracking best model
    val_losses = []
    val_accuracies = []
    best_model = None
    
    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Time(secs)':^8} | {'Train Loss':^9}| {'Val Loss':^10}| {'Val Acc':^11}")
    print("-"*60)

    for epoch_i in range(epochs):
        start_time = datetime.datetime.now()
        # =======================================
        #               Training
        # =======================================
        
        avg_loss_train = train_an_epoch(model, train_dataloader, loss_function, optimizer)
        
        # =======================================
        #               Evaluation
        # =======================================
        
        val_loss, val_accuracy = evaluate(model, val_dataloader, loss_function)

        # Track the best model
        if len(val_losses) == 0 or val_loss < min(val_losses):
            best_model = type(model)(pretrained_embedding=pretrained_embedding,
                                     vocab_size=len(vocab_words), 
                                     embed_dim=model.embed_dim,
                                     filter_sizes=filter_sizes,
                                     num_filters=N_FILTERS, num_classes=num_classes)
            best_model.load_state_dict(model.state_dict())
            if USE_CUDA:
                best_model = best_model.cuda()
        
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        
        time_diff = (datetime.datetime.now() - start_time).seconds
        
        print(f"{epoch_i + 1:^7} |  {time_diff:^8}  | {avg_loss_train:^9.2f} | {val_loss:^10.2f}| {val_accuracy:^11.2f}")
        print('')
        
    return best_model, val_accuracies

In [187]:
num_classes = len(df.sentimentnum.unique())
filter_sizes = [2, 3, 6] # bi-grams, tri-grams, four-grams filters
N_FILTERS = [100,200,100]

In [188]:
cnn_model = CNN_polarization(pretrained_embedding=None,
                 vocab_size=len(vocab_words), embed_dim=400,
                        freeze_embedding=False,
                        filter_sizes=filter_sizes,
                        num_filters=N_FILTERS,
                        num_classes=num_classes,
                        dropout=0.5).to(device)

In [189]:
learning_rate = 0.001
optimizer = torch.optim.Adam(cnn_model.parameters(), 
                                 lr=learning_rate)

In [190]:
best_model, val_accuracies = train_validate(cnn_model, optimizer, 
                                            train_dataloader, val_dataloader, 
                                            pretrained_embedding=None,
                                            epochs=5)

Start training...

 Epoch  | Time(secs) | Train Loss|  Val Loss |   Val Acc  
------------------------------------------------------------
   1    |    791     |   0.50    |    0.32   |    0.86    

   2    |    1019    |   0.31    |    0.37   |    0.85    

   3    |    974     |   0.24    |    0.29   |    0.89    

   4    |    990     |   0.19    |    0.30   |    0.89    

   5    |    981     |   0.15    |    0.36   |    0.88    

