In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
from string import digits
from string import punctuation

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
# Every model and batch in this notebook is moved to this device.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(device)

cuda


In [7]:
def remove_stopwords(text: str) -> str:
    """Drop English stop words from ``text``.

    The text is split with NLTK's ``ToktokTokenizer``, each token is stripped
    of surrounding whitespace, and tokens whose lower-cased form appears in
    NLTK's English stop-word list are discarded.

    Args:
        text: Raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per token.
    stop_words = set(stopwords.words('english'))
    stripped = (token.strip() for token in ToktokTokenizer().tokenize(text))
    kept = [token for token in stripped if token.lower() not in stop_words]
    return ' '.join(kept)

def lemmatization(text: str) -> str:
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed through the WordNet lemmatizer four times in a row,
    as a noun ("n"), verb ("v"), adjective ("a") and adverb ("r"), with the
    result of each pass feeding the next.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # The previous version also called word_tokenize(text) and discarded the
    # result; only str.split() tokens were ever lemmatized, so that call is gone.
    wl = WordNetLemmatizer()
    lemma_tokens = []
    for token in text.split():
        lemma = token
        for pos in ("n", "v", "a", "r"):
            lemma = wl.lemmatize(lemma, pos=pos)
        lemma_tokens.append(lemma)
    return ' '.join(lemma_tokens)
    
def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept_chars = filter(lambda ch: ch not in punctuation, text)
    return ''.join(kept_chars)

def remove_digit(text: str) -> str:
    """Return ``text`` with every ASCII digit (``string.digits``) removed."""
    kept_chars = filter(lambda ch: ch not in digits, text)
    return ''.join(kept_chars)
    
def preprocessing_function(text: str) -> str:
    """Normalize an utterance before tokenization.

    Only lower-casing is applied. The other helpers defined alongside this
    function (lemmatization, stop-word, punctuation and digit removal) are
    deliberately not part of the active pipeline.

    Args:
        text: Raw utterance.

    Returns:
        The lower-cased utterance.
    """
    return text.lower()

In [8]:
def label_map(label):
    """Translate an emotion name into its integer class id.

    The id is the position of the name in the ordering below
    (``"neutral"`` -> 0 ... ``"fear"`` -> 6).

    Returns:
        The class id, or ``None`` when ``label`` is not a known emotion.
    """
    emotions = ("neutral", "anger", "joy", "surprise", "sadness", "disgust", "fear")
    for class_id, emotion in enumerate(emotions):
        if label == emotion:
            return class_id
    return None

def man_map(speaker):
    """Translate a speaker name into an integer id.

    The six named speakers get ids 1-6; every other speaker shares id 7.

    Args:
        speaker: Speaker name as it appears in the ``Speaker`` column.

    Returns:
        An int in ``1..7``.
    """
    speaker_ids = {
        "Ross": 1,
        "Joey": 2,
        "Rachel": 3,
        # Fix: the name was previously spelled "Phoeba", so the correctly
        # spelled "Phoebe" fell through to the catch-all id 7.
        "Phoebe": 4,
        "Phoeba": 4,  # legacy misspelling kept so existing callers see no change
        "Monica": 5,
        "Chandler": 6,
    }
    return speaker_ids.get(speaker, 7)

def encode(text, word2index, label, speaker, N):
    """Turn an utterance into a fixed-length list of vocabulary ids.

    Args:
        text: Utterance to encode; tokenized with ``word_tokenize``.
        word2index: Mapping from token to integer id.
        label: Value returned unchanged alongside the ids.
        speaker: Currently unused (speaker features are disabled).
        N: Output length; longer utterances are truncated, shorter ones are
            right-padded with 0.

    Returns:
        A ``(encoded, label)`` tuple where ``encoded`` is a list of ``N`` ids.
    """
    # Fix: tokens missing from the vocabulary used to be encoded as None,
    # which cannot be converted into an integer tensor. They now map to the
    # 'unk' id, the same fallback encode_test applies.
    unk_index = word2index.get('unk', 0)
    token_ids = [word2index.get(word, unk_index) for word in word_tokenize(text)]
    encoded = [0] * N
    length = min(N, len(token_ids))
    encoded[:length] = token_ids[:length]
    #encoded.insert(man_map(speaker) ,0)
    return (encoded, label)

def encode_test(text, word2index, speaker, N):
    """Encode an unlabeled utterance as a fixed-length list of vocabulary ids.

    Tokens that are not in ``word2index`` are replaced by the id stored under
    the ``'unk'`` key. The result is truncated or right-padded with 0 so that
    it always has ``N`` entries. ``speaker`` is accepted but not used.
    """
    unk_id = word2index.get('unk')
    token_ids = []
    for token in word_tokenize(text):
        token_id = word2index.get(token)
        token_ids.append(unk_id if token_id is None else token_id)
    keep = min(N, len(token_ids))
    padded = [0] * N
    padded[:keep] = token_ids[:keep]
    #encoded.insert(man_map(speaker) ,0)
    return padded

In [10]:
# Load the labeled splits and keep only the columns used below.
train_df = pd.read_csv('train_HW2dataset.csv')
dev_df = pd.read_csv('dev_HW2dataset.csv')

train_df = train_df[['Emotion','Utterance','Speaker']]
dev_df = dev_df[['Emotion','Utterance','Speaker']]

# Each record unpacks as (Emotion, Utterance, Speaker).
train_set = list(train_df.to_records(index=False))
dev_set = list(dev_df.to_records(index=False))

# Preprocess every utterance of both splits and count its tokens; train and
# dev are pooled into new_set because cross-validation re-splits them later.
counts = Counter()
new_set = []
for ds in [train_set, dev_set]:
    for label,text,speaker in ds:
        text = preprocessing_function(text)
        new_set.append((label,text,speaker))
        counts.update(word_tokenize(text))

# Index 0 is reserved for 'unk'; it is also the padding value used by encode().
word2index = {'unk':0}
for word in counts:
    # Fix: assigning i+1 unconditionally overwrote the reserved entry whenever
    # the corpus itself contained the token "unk", leaving index 0 unused and
    # pushing the largest index past the end of the embedding matrix.
    # setdefault keeps the reserved id and hands out contiguous ids otherwise.
    word2index.setdefault(word, len(word2index))
index2word = {v:k for k,v in word2index.items()}

emb_size = 300  # width of the vectors in glove.840B.300d.txt
c = 0           # number of vocabulary words for which a GloVe vector was found
# Rows of words that never appear in the GloVe file stay all-zero.
pretrained_weight = np.zeros((len(word2index), emb_size), dtype="float32")

with open('glove.840B.300d.txt','rt',encoding="utf-8") as fi:
    for line in fi:
        # Peel off the word first and only parse the 300 floats for words we
        # actually need; the file is far larger than the vocabulary, so
        # converting every line (and splitting it twice) was wasted work.
        i_word, _, vector_text = line.partition(' ')
        if i_word in word2index:
            i_embeddings = [float(val) for val in vector_text.split(' ')]
            pretrained_weight[word2index[i_word]] = i_embeddings
            c += 1

# Encode every (emotion, utterance, speaker) triple as (token ids, class id),
# padding or truncating each utterance to 20 tokens.
new_encoded = [
    encode(utterance, word2index, label_map(emotion), who, 20)
    for emotion, utterance, who in new_set
]

# x holds the id sequences, y the matching class ids.
x = np.array([token_ids for token_ids, _ in new_encoded])
y = np.array([class_id for _, class_id in new_encoded])


In [11]:
def train(model, iterator, optimizer, criterion):
    """Run one optimization pass over ``iterator``.

    Args:
        model: Network to train; switched to training mode here.
        iterator: Iterable of batches where ``batch[0]`` is the input tensor
            and ``batch[1]`` the target tensor.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as ``criterion(preds, target)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of batches.
    """
    model.train()
    running_loss = 0
    running_acc = 0
    for batch in iterator:
        inputs = batch[0].to(device)
        # The loss expects integer class indices on the same device as the logits.
        labels = batch[1].type(torch.LongTensor).to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        predicted = torch.max(logits, 1).indices
        running_loss += loss.item()
        running_acc += accuracy_score(predicted.tolist(), labels.tolist())
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: Network to evaluate; switched to eval mode here.
        iterator: Iterable of batches where ``batch[0]`` is the input tensor
            and ``batch[1]`` the target tensor.
        criterion: Loss function called as ``criterion(preds, target)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of batches.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    # Fix: evaluation previously ran with autograd enabled, recording a graph
    # for every forward pass that was never back-propagated. no_grad() skips
    # that bookkeeping; the computed loss and accuracy are unchanged.
    with torch.no_grad():
        for batch in iterator:
            text = batch[0].to(device)
            # The loss expects integer class indices on the same device as the logits.
            target = batch[1].type(torch.LongTensor).to(device)
            preds = model(text)
            loss = criterion(preds, target)
            _, pred = torch.max(preds, 1)
            acc = accuracy_score(pred.tolist(), target.tolist())
            epoch_loss += loss.item()
            epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [12]:
# Model and optimizer hyperparameters, read as globals by the LSTM class and
# the training loop.
src_vocab_size = len(word2index)  # number of entries in the vocabulary
dimension_model = 300             # LSTM input size; must equal the embedding width
num_layers = 5                    # stacked LSTM layers
hidden_size = 256                 # LSTM hidden state width
linear_hidden_size = 32           # width of the layer between the LSTM and the classifier
classes = 7                       # number of emotion classes
dropout = 0.2                     # dropout passed to the LSTM
lr = 1e-3                         # Adam learning rate

In [13]:
def cross_validation(sample_size, k=5, seed=None):
    """Build shuffled k-fold train/validation index splits.

    The indices ``0 .. sample_size-1`` are shuffled and cut into ``k``
    contiguous folds with ``np.array_split`` (when ``sample_size`` is not
    divisible by ``k`` the first folds get one extra element). Each fold
    serves once as the validation set while the remaining folds, in their
    original order, form the training set.

    Args:
        sample_size: Total number of samples.
        k: Number of folds; must be at least 2.
        seed: Optional seed for the shuffle. ``None`` (the default) draws
            fresh entropy, so the split differs between calls.

    Returns:
        A list of ``k`` ``[train_indices, val_indices]`` pairs of 1-D integer
        arrays.

    Raises:
        ValueError: If ``k`` is smaller than 2 (no training fold would remain).
    """
    if k < 2:
        raise ValueError("k must be at least 2 to form a train/validation split")
    # shuffle
    indexs = np.arange(sample_size)
    np.random.default_rng(seed).shuffle(indexs)
    # Fix: the folds are kept as a plain list. Wrapping them in np.array()
    # builds a ragged array whenever sample_size % k != 0, which NumPy
    # deprecated and now rejects with a ValueError.
    folds = np.array_split(indexs, k)
    # return training index and val index in each fold
    ret = []
    for i in range(k):
        train_parts = [fold for j, fold in enumerate(folds) if j != i]
        train_fold = np.concatenate(train_parts, axis=None)
        val_fold = folds[i]
        ret.append([train_fold, val_fold])
    return ret


In [14]:
class LSTM(torch.nn.Module):
    """Stacked LSTM classifier with dot-product attention over its outputs.

    Token ids are looked up in an embedding table initialised from the
    module-level ``pretrained_weight`` matrix, fed through a multi-layer LSTM,
    pooled by ``attention`` and mapped to ``classes`` logits by two linear
    layers. All sizes come from the module-level hyperparameters.
    """

    def __init__(self):
        super().__init__()
        # from_pretrained keeps the table frozen unless freeze=False is passed.
        glove_matrix = torch.from_numpy(pretrained_weight).float()
        self.embed = nn.Embedding.from_pretrained(glove_matrix)
        # batch_first is left at its default, so the LSTM expects (seq, batch, feature).
        self.lstm = nn.LSTM(
            input_size=dimension_model,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.linear = nn.Linear(hidden_size, linear_hidden_size)
        self.fc = nn.Linear(linear_hidden_size, classes)

    def attention(self, lstm_output, final_state):
        """Pool the LSTM outputs with weights derived from the final hidden states.

        The final hidden states are averaged over their first dimension and
        used as a query; its dot product with every time step is soft-maxed
        over time and used to form a weighted sum of the outputs.
        """
        per_step = lstm_output.permute(1, 0, 2)          # (batch, seq, hidden)
        query = final_state.mean(dim=0).unsqueeze(2)     # (batch, hidden, 1)
        scores = torch.bmm(per_step, query).squeeze(2)   # (batch, seq)
        alpha = torch.softmax(scores, dim=1).unsqueeze(2)
        context = torch.bmm(per_step.transpose(1, 2), alpha)
        return context.squeeze(2)                        # (batch, hidden)

    def forward(self, data):
        """Return class logits for a batch of token ids.

        Assumes ``data`` is laid out as (batch, seq_len); it is transposed to
        the sequence-first layout before entering the LSTM.
        """
        embedded = self.embed(data)
        outputs, (h_n, _) = self.lstm(embedded.transpose(0, 1))
        pooled = self.attention(outputs, h_n)
        return self.fc(self.linear(pooled))

In [15]:

# 10-fold cross-validation: a fresh model is trained for 5 epochs on each fold
# and a checkpoint is written whenever the fold's validation accuracy does not
# get worse. PATH always names the most recently written checkpoint.
kfold_data = cross_validation(sample_size = x.shape[0], k=10)
batch_size = 32

for fold, (train_index, val_index) in enumerate(kfold_data, start=1):
    train_x = x[train_index]
    train_y = y[train_index]
    dev_x = x[val_index]
    dev_y = y[val_index]
    train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    dev_ds = TensorDataset(torch.from_numpy(dev_x), torch.from_numpy(dev_y))

    train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
    # Fix: the validation loader used shuffle=True and drop_last=True, so a
    # different random subset of validation samples was skipped every epoch.
    # Every held-out sample is now scored, in a fixed order.
    dev_dl = DataLoader(dev_ds, shuffle=False, batch_size=batch_size, drop_last=False)

    best_acc = 0
    model = LSTM().to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=lr)
    for epoch in range(5):
        train_loss, train_acc = train(model, train_dl, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, dev_dl, criterion)
        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc * 100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc * 100:.2f}%')
        if best_acc <= valid_acc:
            best_acc = valid_acc
            # Fix: the file name advertised an accuracy but was formatted from
            # valid_loss, and it carried no fold id, so checkpoints from
            # different folds could overwrite each other.
            PATH = f"fold{fold}_epoch{epoch+1}_val_acc{valid_acc * 100:.2f}.pt"
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': valid_loss,
                }, PATH)

  # Remove the CWD from sys.path while we load stuff.


Epoch: 01, Train Loss: 1.498, Train Acc: 49.18%, Val. Loss: 1.393, Val. Acc: 52.84%
Epoch: 02, Train Loss: 1.373, Train Acc: 53.27%, Val. Loss: 1.323, Val. Acc: 54.62%
Epoch: 03, Train Loss: 1.306, Train Acc: 55.74%, Val. Loss: 1.268, Val. Acc: 56.53%
Epoch: 04, Train Loss: 1.239, Train Acc: 58.36%, Val. Loss: 1.230, Val. Acc: 57.74%
Epoch: 05, Train Loss: 1.189, Train Acc: 60.40%, Val. Loss: 1.204, Val. Acc: 58.66%
Epoch: 01, Train Loss: 1.501, Train Acc: 47.92%, Val. Loss: 1.446, Val. Acc: 49.22%
Epoch: 02, Train Loss: 1.384, Train Acc: 52.31%, Val. Loss: 1.363, Val. Acc: 53.76%
Epoch: 03, Train Loss: 1.324, Train Acc: 54.27%, Val. Loss: 1.318, Val. Acc: 53.84%
Epoch: 04, Train Loss: 1.285, Train Acc: 56.25%, Val. Loss: 1.283, Val. Acc: 55.54%
Epoch: 05, Train Loss: 1.244, Train Acc: 57.49%, Val. Loss: 1.244, Val. Acc: 56.61%
Epoch: 01, Train Loss: 1.568, Train Acc: 45.83%, Val. Loss: 1.572, Val. Acc: 45.31%
Epoch: 02, Train Loss: 1.447, Train Acc: 50.75%, Val. Loss: 1.366, Val. Acc:

In [None]:
# Load the unlabeled test split and apply the same preprocessing as in training.
test_df = pd.read_csv('test_HW2dataset.csv')
test_df = test_df[['Utterance','Speaker']]
test_set = test_df.values.tolist()

new_test_set = [(preprocessing_function(text), speaker) for text, speaker in test_set]

# Fix: test utterances were padded/truncated to 18 tokens although the model
# was trained on sequences of 20; both now use the same length.
test_encoded = [(encode_test(Utterance, word2index, speaker, 20)) for Utterance, speaker in new_test_set]
test_x = np.array(test_encoded)
test_ds = TensorDataset(torch.from_numpy(test_x))
# Default batch_size of 1: each batch yields exactly one prediction below.
test_dl = DataLoader(test_ds, shuffle=False)

# Rebuild the network and restore the checkpoint named by PATH (set by the
# training cell, which must have been run first).
model = LSTM().to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
checkpoint = torch.load(PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']

model.eval()
predict=[]
# Inference needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for deta in test_dl:
        text = deta[0].to(device)
        preds = model(text)
        _, pred = torch.max(preds, 1)
        predict.append(pred.item())

In [None]:
# Write one row per test utterance: its position and the predicted class id.
# The frame is built in a single call instead of being grown row by row with
# .loc, which reallocates on every insertion. `predict` comes from the
# inference cell, which must have been run first.
ans = pd.DataFrame({"index": range(len(predict)), "emotion": predict})
ans.to_csv("predict.csv",index=False)

NameError: name 'predict' is not defined