# Import

In [1]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import random
import os
import nltk
nltk.download('popular')
from spellchecker import SpellChecker
import seaborn as sns
import matplotlib.pyplot as plt
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
from regex import E


torch.manual_seed(1)

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/adam/nltk_data...
[nltk_data]    |   

## Configuration

In [2]:
# Hyper-parameters 
input_size = 49 # number of features
hidden_size = 200 
num_classes = 3
num_epochs = 10
batch_size = 4
learning_rate = 0.00001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Dataset Class

In [3]:

class FeaturesDataset(Dataset):

    def __init__(self, path=''):
        df_in = pd.read_csv(path)
        df_orig_train_metadata = pd.read_csv('../data/processed/train_w_extracted_features_v2.csv')
        df_in.set_index('discourse_id')
        df_orig_train_metadata.set_index('discourse_id')
        df_merged = pd.concat([df_in, df_orig_train_metadata], join='inner', axis=1)
        
        df_merged = df_merged[df_merged.columns.drop(list(df_merged.filter(regex='intersect')))]
        df_merged = df_merged.fillna(0)
        df_effectiveness = pd.get_dummies(df_in['discourse_effectiveness'])
        #discourse_type_one_hot = pd.get_dummies(df_in['discourse_type'])
        df_merged_with_discourse_type = df_merged
        print(df_effectiveness.columns)
        #df_merged_with_discourse_type[discourse_type_one_hot.columns] = discourse_type_one_hot
        
        df_merged_numeric = df_merged_with_discourse_type.select_dtypes(include=np.number)
        print(df_merged_numeric.shape)
        print(df_merged_numeric.columns)
        
        
        self.x_data = torch.from_numpy(df_merged_numeric.to_numpy().astype(dtype=np.float32)) # size [n_samples, n_features]
        self.y_data = torch.from_numpy(df_effectiveness[['Effective', 'Adequate', 'Ineffective']].to_numpy())
        self.n_samples = self.x_data.shape[0]
        
    # support indexing such that dataset[i] can be used to get i-th sample
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    # we can call len(dataset) to return the size
    def __len__(self):
        return self.n_samples


train_dataset = FeaturesDataset('../data/train_berkeley.csv')
test_dataset = FeaturesDataset('../data/test_berkeley.csv')

# get first sample and unpack
first_data = train_dataset[0]
features, labels = first_data
print(features, labels)

# Load whole dataset with DataLoader
# shuffle: shuffle data, good for training
# num_workers: faster loading with multiple subprocesses
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)

test_loader = DataLoader(dataset=test_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=0)


# convert to an iterator and look at one random sample
train_iter = iter(train_loader)
data = train_iter.next()
features, labels = data
pd.set_option('display.max_colwidth', None)
print('HEREH')
print(features, labels)

test_iter = iter(test_loader)


Index(['Adequate', 'Effective', 'Ineffective'], dtype='object')
(33297, 49)
Index(['Unnamed: 0', 'comma_count', 'NNP', ',', 'NN', 'VBP', 'VBG', 'TO', 'VB',
       'IN', 'WRB', 'DT', 'VBZ', 'JJ', 'CC', 'EX', 'NNS', 'VBD', 'PRP', '.',
       'VBN', 'RB', 'num_tokens', 'misspelled_count', 'PRP$', 'MD', 'CD', '``',
       '''', 'WDT', 'WP', 'POS', 'NNPS', 'JJR', '(', ')', 'RP', ':', 'JJS',
       'RBR', 'WP$', 'PDT', 'UH', '$', 'RBS', 'FW', '#', 'LS', 'SYM'],
      dtype='object')
Index(['Adequate', 'Effective', 'Ineffective'], dtype='object')
(3468, 49)
Index(['Unnamed: 0', 'comma_count', 'NNP', ',', 'NN', 'VBP', 'VBG', 'TO', 'VB',
       'IN', 'WRB', 'DT', 'VBZ', 'JJ', 'CC', 'EX', 'NNS', 'VBD', 'PRP', '.',
       'VBN', 'RB', 'num_tokens', 'misspelled_count', 'PRP$', 'MD', 'CD', '``',
       '''', 'WDT', 'WP', 'POS', 'NNPS', 'JJR', '(', ')', 'RP', ':', 'JJS',
       'RBR', 'WP$', 'PDT', 'UH', '$', 'RBS', 'FW', '#', 'LS', 'SYM'],
      dtype='object')
tensor([ 0.,  3.,  7.,  3., 12.,  2.,

## Model

In [4]:
# Fully connected neural network with one hidden layer
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.input_size = input_size         
        self.l1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.l1(x)
        out = self.relu(out)
        out = self.l2(out)
        return out

model = NeuralNet(input_size, hidden_size, num_classes).to(device)

## Loss & Optimizer

In [5]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss() # it automatically applies the softmax to the output of last layer. 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)  

## Training

In [6]:

total_samples = len(train_dataset)
n_iterations = math.ceil(total_samples/batch_size)
print("NUMBER OF SAMPLES = ",str(total_samples) + " NUMBER OF ITERATIONS = ",n_iterations)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        
        # here: 33297 samples, batch_size = 500, n_iters=33297/500=66.5 -> 67 iterations
        # Run the training process        
        
        #Fixing "RuntimeError: 1D target tensor expected, multi-target not supported"
        #print(outputs)
        #print(outputs.shape)
        #print(labels)
        #print(labels.shape)

        #for t in inputs:
        #    print(t.dtype)
        #break
        targets = np.argmax(labels, axis=1)
        #print(targets)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
        


        if (i+1) % 5000 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_iterations}], Loss: {loss.item():.4f}')
    # Test the model
    with torch.no_grad():
        n_correct = 0
        n_samples = 0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            # max returns (value, index)
            _, predicted = torch.max(outputs.data, 1)
            n_samples += labels.size(0)
            n_correct += (predicted == torch.max(labels, 1)[1]).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Model Accuracy : {acc} %')

NUMBER OF SAMPLES =  33297 NUMBER OF ITERATIONS =  8325
Epoch [1/10], Step [5000/8325], Loss: 2.5602
Model Accuracy : 35.870818915801614 %
Epoch [2/10], Step [5000/8325], Loss: 0.3853
Model Accuracy : 26.038062283737023 %
Epoch [3/10], Step [5000/8325], Loss: 2.0640
Model Accuracy : 57.67012687427912 %
Epoch [4/10], Step [5000/8325], Loss: 1.1917
Model Accuracy : 57.641291810841984 %
Epoch [5/10], Step [5000/8325], Loss: 0.9906
Model Accuracy : 51.528258362168394 %
Epoch [6/10], Step [5000/8325], Loss: 2.2077
Model Accuracy : 57.266435986159166 %
Epoch [7/10], Step [5000/8325], Loss: 1.7244
Model Accuracy : 57.67012687427912 %
Epoch [8/10], Step [5000/8325], Loss: 1.0037
Model Accuracy : 57.67012687427912 %
Epoch [9/10], Step [5000/8325], Loss: 1.4657
Model Accuracy : 57.67012687427912 %
Epoch [10/10], Step [5000/8325], Loss: 1.2843
Model Accuracy : 57.75663206459054 %


## Evaluation

In [7]:
# Test the model
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # max returns (value, index)
        _, predicted = torch.max(outputs.data, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == torch.max(labels, 1)[1]).sum().item()

    acc = 100.0 * n_correct / n_samples
    print(f'Model Accuracy : {acc} %')


Model Accuracy : 57.75663206459054 %
