## LSTM + Fully Connected Layer

This file streamlines the LSTM and fully connected layer process. It validates the for/against vectors based on results. 

1. Fine-tune embedding

2. Same initial hidden states (DONE)

3. Check dropout in evaluation (DONE, no dropout) 

4. Validation set (DONE)

4. Check fully connected weights 

6. Add f.c. layers (DOING)

7. Mean + std of several runs

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from point2mn import point2mn
from point2mn import get_main_points
torch.manual_seed(1)

<torch._C.Generator at 0x11b568f50>

In [2]:
import numpy as np
import pandas as pd

## Model Methods

### Word Embedding Fine tune

In [3]:
# Generates the glove embedding, fine-tune to be done.
def generate_embedding(filename='glove.6B.100d.txt'):
    embedding_index = {}
    f = open(filename)
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:],dtype='float32')
        embedding_index[word] = coefs
    f.close()
    print('Formed word vecs from', filename, ": ", len(embedding_index))
    return embedding_index

In [4]:
embedding_index = generate_embedding()

Formed word vecs from glove.6B.100d.txt :  400000


### LSTM

In [5]:
# LSTM methods
def run_lstm_one_sentence(sentence, title, lstm, hidden, embedding_index):
    inputs = point2mn(sentence, title, embedding_index)
    tensor_input = [torch.tensor([x]) for x in inputs]
    tensor_input = torch.cat(tensor_input).view(1, len(tensor_input), -1)
    #hidden = (torch.randn(1, 1, HIDDEN_DIM), torch.randn(1, 1, HIDDEN_DIM)) #???????
    tensor_output, hidden = lstm(tensor_input, hidden)
    # should return hidden state and cell state in 'hidden' instead of output
    return hidden

def run_lstm_on_fid(fid, embedding_index, lstm, hidden, combine_func = [torch.mean, torch.max]):
    """
    Given fid, find main points for the debate and for each sentence, pass corresponding 
    matching vectors to lstm and get hidden state in the end. Gather the hidden states
    in a list and do combine_func.
    -combine_func: the funtion applied to combine lstm outputs from one side elementwisely
    """
    
    title = main_points[main_points['id'] == fid].title.iloc[0]
    for_main_points, against_main_points = get_main_points(fid, main_points)
    for_output_list = []
    against_output_list = []
    
    for sentence in for_main_points:
        hidden_state, cell_state = run_lstm_one_sentence(sentence, title, lstm, hidden, embedding_index)
        for_output_list.append(hidden_state)
        
    for sentence in against_main_points:
        hidden_state, cell_state = run_lstm_one_sentence(sentence, title, lstm, hidden, embedding_index)
        against_output_list.append(hidden_state)
    for_torchs = []
    against_torchs = []
    for combine_f in combine_func:
        if combine_f == torch.mean:
            for_torch = combine_f(torch.stack(for_output_list), dim = 0, keepdim = True)#[0]
            against_torch = combine_f(torch.stack(against_output_list), dim = 0, keepdim = True)#[0]
        else:
            for_torch = combine_f(torch.stack(for_output_list), dim = 0, keepdim = True)[0]
            against_torch = combine_f(torch.stack(against_output_list), dim = 0, keepdim = True)[0]
        for_torchs.append(for_torch)
        against_torchs.append(against_torch)
    if len(for_torchs) == 2:
        t1 = torch.cat((for_torchs[0], for_torchs[1]), dim = 2)
        t2 = torch.cat((against_torchs[0], against_torchs[1]), dim = 2)
        return torch.cat((t1, t2), dim = 2)
    return torch.cat((for_torchs[0], against_torchs[0]), dim = 2)

### Fully Connected Layer

In [34]:
class LSTMNet(nn.Module):
    def __init__(self, output_size_1, output_size_2, n_layers, embedding_dim, hidden_dim, combine_funcs, hidden, drop_prob=0.5):
        super(LSTMNet, self).__init__()
        self.output_size_1 = output_size_1
        self.output_size_2 = output_size_2
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.combine_funcs = combine_funcs
        self.hidden = hidden

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(hidden_dim * 2 * len(self.combine_funcs), self.output_size_1)
        #self.fc2 = nn.Linear(output_size_1, self.output_size_2)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, fids):
        batch_size = len(fids)
        lstm_out = torch.stack([run_lstm_on_fid(fid, embedding_index, self.lstm, self.hidden, \
                                                combine_func = self.combine_funcs) for fid in fids])
    
        lstm_out = lstm_out.contiguous().view(-1, len(self.combine_funcs) * 2 * self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc1(out)
        #out = self.fc2(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out

In [35]:
def training_results(outputsize_1, outputsize_2, N_LAYERS, embedding_dim, hidden_dim, combine_funcs, drop_prob, lr, epochs):
    print('output size 1 =', outputsize_1, 'output size 2 =', outputsize_2,'n_layers =', N_LAYERS, 'embedding_dim = ', embedding_dim, \
         'hidden_dim = ', hidden_dim, 'combine_funcs = ', str(combine_funcs), 'drop_prob = ', drop_prob, \
          'lr = ', lr, 'epochs = ', epochs)
    return pd.DataFrame({'# Epoch': range(1,epochs + 1), 'training acc': train_acc,\
                         'validation acc': val_acc, 'testing acc': test_acc})

### Data Processing

In [36]:
# Meta Data of debates
main_points = pd.read_csv('../Meta Data/metadata_appended_main_points.csv') 
main_points.dropna(subset = ['For_Main_Points', 'against_Main_Points'], inplace = True)
main_points = main_points.reset_index(drop = True)

In [37]:
# Set the winner label
results = pd.read_csv('../results_data/final_online.csv') 
results = results.loc[results['winner'].apply(lambda x: x != 'undecided')]
results['winner'] = results['winner'].apply(lambda x: 1 if x == 'for' else 0)
id2winner = results[['id', 'winner']].set_index('id').to_dict()['winner']

main_points['label'] = main_points.id.apply(lambda x: id2winner[x] if x in id2winner else np.nan)
main_points = main_points.dropna(subset = ['label']).reset_index(drop = True)
main_points.head(2)

Unnamed: 0,id,title,date,for,against,For_Main_Points,against_Main_Points,label
0,d20191112,Capitalism Is a Blessing,2019-11-12,"['John Mackey', 'Katherine Mangu-Ward']","['Bhaskar Sunkara', 'Richard D. Wolff']",['By promoting market competition and rewardin...,['Capitalism serves the interests of large cor...,0.0
1,d20191029,Parenting Is Overrated,2019-10-29,"['Robert Plomin', 'Nancy Segal']","['Paige Harden', 'Ann Pleshette Murphy']","[""We're in the midst of a DNA revolution: Whil...","['While DNA is important, factors like familia...",1.0


In [38]:
# Input and Train/Vadidation/Test
inputs = main_points.id.tolist()
labels =  main_points.label.tolist()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size = 0.2, random_state = 5)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1, random_state = 0)

y_train = torch.tensor(y_train)
y_val = torch.tensor(y_val)
y_test = torch.tensor(y_test)

print("X Train:", len(X_train), "\nX Validation: ", len(X_val), "\nX Test: ", len(X_test))

X Train: 85 
X Validation:  10 
X Test:  24


### Performance Measureament

In [43]:
EMBEDDING_DIM = 200
HIDDEN_DIM = 50
# OUTPUT_SIZE_1 = 20
OUTPUT_SIZE_1 = 1
COMBINE_FUNCS =[torch.mean, torch.max]
OUTPUT_SIZE_2 = 1
N_LAYERS = 1
DROPOUT_PROB = 0.5
HIDDEN_INITIAL = (torch.randn(1, 1, HIDDEN_DIM), torch.randn(1, 1, HIDDEN_DIM))

In [44]:
model = LSTMNet(OUTPUT_SIZE_1, OUTPUT_SIZE_2, N_LAYERS, EMBEDDING_DIM, HIDDEN_DIM, COMBINE_FUNCS, HIDDEN_INITIAL,DROPOUT_PROB)

In [45]:
lr = 0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [86]:
epochs = 20
counter = 0
train_acc = []
train_acc2 = []

val_acc = []
test_acc = []
model.train()

for i in range(1):
    
    model.train()
    model.zero_grad()
    
    output = model(X_train)
    loss = criterion(output.squeeze(), y_train.float())
    #print(loss.item())
    loss.backward()
    optimizer.step()
    
    model.eval()
    outs = model(X_train)
    train_acc.append(torch.sum((outs > 0.5) == y_train).item() / len(y_train))

    outs = model(X_val)
    val_acc.append(torch.sum((outs > 0.5) == y_val).item() / len(y_val))
    
    outs = model(X_test)
    test_acc.append(torch.sum((outs > 0.5) == y_test).item() / len(y_test))

In [22]:
training_results(OUTPUT_SIZE_1, OUTPUT_SIZE_2, N_LAYERS, EMBEDDING_DIM, HIDDEN_DIM, [torch.mean, torch.max], DROPOUT_PROB, lr, epochs)

output size 1 = 20 output size 2 = 1 n_layers = 1 embedding_dim =  200 hidden_dim =  50 combine_funcs =  [<built-in method mean of type object at 0x11ae27eb0>, <built-in method max of type object at 0x11ae27eb0>] drop_prob =  0.5 lr =  0.005 epochs =  20


Unnamed: 0,# Epoch,training acc,validation acc,testing acc
0,1,0.658824,0.6,0.583333
1,2,0.8,0.6,0.708333
2,3,0.823529,0.6,0.666667
3,4,0.858824,0.6,0.666667
4,5,0.752941,0.7,0.5
5,6,0.858824,0.6,0.625
6,7,0.717647,0.7,0.583333
7,8,0.870588,0.6,0.541667
8,9,0.882353,0.6,0.541667
9,10,0.905882,0.7,0.583333


### Check the fully connected layer weight

In [84]:
def compute_weight_acc(weight_t, DIM=200):
    D = int(DIM / 2)
    positive = len([1 for i in weight_t[0][:D] if i > 0])
    negative = len([1 for i in weight_t[0][D:] if i < 0])
    print("TP: ", positive / 100, "TN: ", negative / 100)

In [85]:
epochs = 20
counter = 0

for i in range(epochs):
    
    model.train()
    model.zero_grad()
    
    # get weight
    compute_weight_acc(model.fc1.weight)
    #print(model.fc1.weight)

TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
TP:  0.5 TN:  0.53
