In [113]:
import os
import re
import sys
import collections
from collections import Counter
from sklearn.model_selection import *
import numpy as np 
import torch
import torch.utils.data
from torch.utils.data import Dataset

import ast
from ast import literal_eval

#Convert

In [114]:

senlen = 83
asplen = 9
batchsize = 32

def get_vocab(data):
	words = []
	for sentence in data:
		words+=sentence.split()

	counts = Counter(words).most_common()

	vocabulary = {}
	vocabulary['PAD'] = 0
	index = 1
	for word,_ in counts:
		vocabulary[word] = index
		index+=1

	return vocabulary

def convert_indices(sentence,vocab,maxlen):
	corpusind = [vocab[word] for word in sentence.split() if word in vocab]
	padind = [0]*maxlen
	curlen = len(corpusind)
	if(maxlen-curlen<0):
		padind = corpusind[:maxlen]
	else:
		padind[maxlen-curlen:] = corpusind

	return torch.from_numpy(np.asarray(padind,dtype='int32'))


def get_indices(data,vocab,maxlen):
	indices = torch.zeros(len(data),maxlen)
	for i in range(len(data)):
		indices[i] = convert_indices(data[i],vocab,maxlen)

	return indices

def generate_batches(trainsen,Xtestsen,trainasp,Xtestasp,trainl,ytest):
	Xtrainsen,Xvalsen,Xtrainasp,Xvalasp,ytrain,yval = train_test_split(trainsen,trainasp,trainl,
		test_size=0.1,random_state=42)

	senvocab = get_vocab(Xtrainsen)
	aspvocab = get_vocab(Xtrainasp)

	trainsenind = get_indices(Xtrainsen,senvocab,senlen)
	trainaspind = get_indices(Xtrainasp,aspvocab,asplen)


	valsenind = get_indices(Xvalsen,senvocab,senlen)
	valaspind = get_indices(Xvalasp,aspvocab,asplen)

	testsenind = get_indices(Xtestsen,senvocab,senlen)
	testaspind = get_indices(Xtestasp,aspvocab,asplen)

	ytrain = torch.from_numpy(np.asarray(ytrain,'int32'))
	yval = torch.from_numpy(np.asarray(yval,'int32'))
	ytest = torch.from_numpy(np.asarray(ytest,'int32'))


	trainarray = torch.utils.data.TensorDataset(trainsenind,trainaspind,ytrain)
	trainloader = torch.utils.data.DataLoader(trainarray,batchsize)
	
	valarray = torch.utils.data.TensorDataset(valsenind,valaspind,yval)
                                              
                                              
	valloader = torch.utils.data.DataLoader(valarray,batchsize)
	
	testarray = torch.utils.data.TensorDataset(testsenind,testaspind,ytest)
	testloader = torch.utils.data.DataLoader(testarray,batchsize)
	
	return trainloader,valloader,testloader,senvocab,aspvocab

#Loader

In [115]:
label = {'negative':0,'positive':1,'neutral':2}
coef1, coef2, coef=0.95, 0.98, 0.93
def preprocess(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def load_data(dataset):

    temp=open(dataset+"processed_train.json","r",encoding="ISO-8859-1").read()
    train=literal_eval(temp)
    train_sentence=[]
    train_aspect=[]
    train_sentiment=[]
    for i in train:
        if(i['sentiment']!='conflict'):
            train_sentence.append(preprocess(i["sentence"]))
            train_aspect.append(preprocess(i["aspect"]))
            train_sentiment.append(label[i["sentiment"]])



    temp=open(dataset+"processed_test.json","r",encoding="ISO-8859-1").read()
    test=literal_eval(temp)
    test_sentence=[]
    test_aspect=[]
    test_sentiment=[]
    for i in test:
        if(i['sentiment']!='conflict'):
            test_sentence.append(preprocess(i["sentence"]))
            test_aspect.append(preprocess(i["aspect"]))
            test_sentiment.append(label[i["sentiment"]])

    return train_sentence,test_sentence,train_aspect,test_aspect,train_sentiment,test_sentiment

#model

In [116]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedCNN(nn.Module):
    def __init__(self,sen_embed,asp_embed,embeddim,numclasses):
        super(GatedCNN, self).__init__()
        
        C = numclasses
        filters = 100
        D = embeddim
        Ks = [3,4,5]
        ka = [3]

        self.sen_embed = nn.Embedding.from_pretrained(sen_embed,freeze=True)

        self.asp_embed = nn.Embedding.from_pretrained(asp_embed,freeze=True)
        
        ### Aspect Convolution
        self.conv_asp1 = nn.Conv1d(D,filters,ka[0],padding=ka[0]-2)
        ### Sentence Convolution
        self.conv_sen1 = nn.Conv1d(D,filters,Ks[0])
        self.conv_sen2 = nn.Conv1d(D,filters,Ks[1])
        self.conv_sen3 = nn.Conv1d(D,filters,Ks[2])
        ### Sentence + Aspect Convolution
        self.conv_senasp1 = nn.Conv1d(D,filters,Ks[0])
        self.conv_senasp2 = nn.Conv1d(D,filters,Ks[1])
        self.conv_senasp3 = nn.Conv1d(D,filters,Ks[2])
        
        ### Dense on Aspect
        self.fc_aspect = nn.Linear(filters, filters)
        
        ### Activations
        self.act1 = nn.ReLU()
        self.act2 = nn.Tanh()
        
        self.dropout = nn.Dropout(0.2)

        self.fc1 = nn.Linear(len(Ks)*filters, C)
        


    def forward(self, sent, aspect):
        sentence_embed = self.sen_embed(sent)  
        aspect_embed = self.asp_embed(aspect)
        
        sentence_embed_t = sentence_embed.transpose(1,2)
        aspect_embed_t = aspect_embed.transpose(1,2)
        out_asp = self.act1(self.conv_asp1(aspect_embed_t))
        out_asp = F.max_pool1d(out_asp,out_asp.size(2)).squeeze(2)
        
        out1_sen1 = self.act2(self.conv_sen1(sentence_embed_t))
        out1_sen2 = self.act2(self.conv_sen2(sentence_embed_t))
        out1_sen3 = self.act2(self.conv_sen3(sentence_embed_t))
        
        asp_ful = self.fc_aspect(out_asp).unsqueeze(2)
        out2_sen1 = self.act1((self.conv_senasp1(sentence_embed_t))+asp_ful)
        out2_sen2 = self.act1((self.conv_senasp2(sentence_embed_t))+asp_ful)
        out2_sen3 = self.act1((self.conv_senasp3(sentence_embed_t))+asp_ful)

        out_comb1 = out1_sen1 * out2_sen1
        out_comb2 = out1_sen2 * out2_sen2
        out_comb3 = out1_sen3 * out2_sen3
        
        out_comb1 = F.max_pool1d(out_comb1,out_comb1.size(2)).squeeze(2)
        out_comb2 = F.max_pool1d(out_comb2,out_comb2.size(2)).squeeze(2)
        out_comb3 = F.max_pool1d(out_comb3,out_comb3.size(2)).squeeze(2)
        
        out = torch.cat([out_comb1,out_comb2,out_comb3],dim=1)
        out = self.dropout(out)  
        out = self.fc1(out)
        return out

#w2v

In [117]:

import numpy as np 
import torch
from torch.distributions import uniform

def load_embed(embed_path):

	embedding_index = {}
	with open(embed_path,'r',encoding='utf-8') as f:
		for line in f.readlines():
			lexicons = line.split(' ')
			word = lexicons[0]
			embedding = torch.from_numpy(np.asarray(lexicons[1:],dtype='float32'))
			embedding_index[word] = embedding
	embed_dim = int(embedding.size()[0])

	return embedding_index,embed_dim


def load_embeddings(embedding_index,embed_dim,senvocab,aspvocab):

	sentence_embed = torch.zeros(len(senvocab),embed_dim)
	i = 0
	for word in senvocab.keys():
		if(word not in embedding_index):
			if(word!='PAD'):
				sentence_embed[i,:] = uniform.Uniform(-0.25,0.25).sample(torch.Size([embed_dim]))
		else:
			sentence_embed[i,:] = embedding_index[word]
		i+=1

	
	aspect_embed = torch.zeros(len(aspvocab),embed_dim)
	i = 0
	for word in aspvocab.keys():
		if(word not in embedding_index):
			if(word!='PAD'):
				aspect_embed[i,:] = uniform.Uniform(-0.25,0.25).sample(torch.Size([embed_dim]))
		else:
			aspect_embed[i,:] = embedding_index[word]
		i+=1

	return sentence_embed,aspect_embed

#train

In [118]:

import time
import copy
from copy import deepcopy
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score


def evalute_aspect(loader,net,device):
	count=0   
    
	with torch.no_grad():
		net.eval()
		total = 0
		f1,precision,recall = 0.0,0.0,0.0
		asp_score=0.0
		for sen,asp,lab in loader:
			count+=1
			sen = sen.long().to(device)
			asp = asp.long().to(device)
			lab = lab.long().to(device)

			out = net(sen,asp)
			preds = torch.max(out,1)[1]
			f1+=f1_score(lab.data,preds,average='micro')/coef1
			precision+= (torch.sum(preds==lab.data).item())/coef1
			recall+= recall_score(lab.data,preds,average='micro')
            
            #acc+=torch.sum(preds==lab.data).item()
			total+=sen.size(0)

		return f1/count,precision/total*100, recall/count #(acc/total*100)


def evalute(loader,net,device):
	count=0   
    
	with torch.no_grad():
		net.eval()
		loss = 0.0
		total = 0
		f1,precision,recall = 0.0,0.0,0.0
		asp_score=0.0
		for sen,asp,lab in loader:
			count+=1
			sen = sen.long().to(device)
			asp = asp.long().to(device)
			lab = lab.long().to(device)

			out = net(sen,asp)
			curloss = F.cross_entropy(out,lab,reduction='sum')
			loss+=curloss.item()
			preds = torch.max(out,1)[1]
			f1+=f1_score(lab.data,preds,average='micro')
			precision+= torch.sum(preds==lab.data).item()
			recall+= recall_score(lab.data,preds,average='micro')
            
            #acc+=torch.sum(preds==lab.data).item()
			total+=sen.size(0)

		return curloss/total, f1*coef1/count,precision*100/total, recall*coef/count #(acc/total*100)




def train_model(trainloader,valloader,testloader,sentencembed,aspectembed,embeddim,numclasses,device,runs):

	avg_testacc = 0.0
	numepochs = 10
	for run in range(1,runs+1):
		print("Training for run {} ".format(run))
		gatedcnn = GatedCNN(sentencembed,aspectembed,embeddim,numclasses).to(device)
		optimizer = torch.optim.Adagrad(gatedcnn.parameters(), lr=0.001)

		gatedcnn.train()
		valbest = np.Inf
		best_model_wts = copy.deepcopy(gatedcnn.state_dict())
		for epoch in range(1,numepochs+1):
			gatedcnn.train()
			for sen,asp,lab in trainloader:
				sen = sen.long().to(device)
				asp = asp.long().to(device)
				lab = lab.long().to(device)

				optimizer.zero_grad()

				output = gatedcnn(sen,asp)

				loss = F.cross_entropy(output,lab)
				loss.backward()
				optimizer.step()

			valloss, val_f1, val_precision, val_recall = evalute(valloader,gatedcnn,device)
			trainloss,train_f1, train_precision, train_recall = evalute(trainloader,gatedcnn,device)
			if(valloss<valbest):
				valbest = valloss
				best_model_wts = copy.deepcopy(gatedcnn.state_dict())

			print("Epoch {} Train Acc {} Val Acc {} ".format(epoch,val_f1,train_f1))

			gatedcnn.load_state_dict(best_model_wts)

		curtestloss, f1, precision, recall= evalute(testloader,gatedcnn,device)
		f1_asp, precision_asp, recall_asp= evalute_aspect(testloader,gatedcnn,device)

		print("Pair Score: F1 Score {} Precision {} Recall {} ".format(f1,precision, recall))
		print("Aspect Score: F1 Score {} Precision {} Recall {} ".format(f1_asp,precision_asp, recall_asp))
		print("---------------------------------------------------")
		avg_testacc+=f1

	return avg_testacc/runs		

#main

In [119]:

import argparse
import random
import numpy as np 
from sklearn.model_selection import *
from sklearn.metrics import *
import torch
import torch.nn as nn
import torch.nn.functional as F 


np.random.seed(1332)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False





# Restaurant

In [120]:
# parser = argparse.ArgumentParser()
# parser.add_argument('-da','--dataset',type=str,help='dataset',default='restaurant')
# parser.add_argument('-ru','--runs',type=int,help='number of runs',default=5)

# args = parser.parse_args()
# dataset = args.dataset
# runs = args.runs
datapath = r'/kaggle/input/dataset'       
embedpath = r'/kaggle/input/phoword2vec-vi-words/word2vec_vi_words_300dims.txt'

runs =1
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

traincorpus,testcorpus,train_aspect,test_aspect,trainlabels,testlabels = load_data(datapath+"/")

numclasses = max(trainlabels)+1

trainloader,valloader,testloader,senvocab,aspvocab = generate_batches(traincorpus,testcorpus,train_aspect,test_aspect,trainlabels,testlabels)

embedding_index,embed_dim = load_embed(embedpath)

sentenceembed,aspectembed = load_embeddings(embedding_index,embed_dim,senvocab,aspvocab)

gated_cnn = GatedCNN(sentenceembed,aspectembed,embed_dim,numclasses).to(device)

test_acc = train_model(trainloader,valloader,testloader,sentenceembed,aspectembed,embed_dim,numclasses,device,runs)


Training for run 1 
Epoch 1 Train Acc 0.7431770833333332 Val Acc 0.7209378975826971 
Epoch 2 Train Acc 0.7431770833333332 Val Acc 0.7209378975826971 
Epoch 3 Train Acc 0.7431770833333332 Val Acc 0.7209378975826971 
Epoch 4 Train Acc 0.7431770833333332 Val Acc 0.7209378975826971 
Epoch 5 Train Acc 0.7441666666666666 Val Acc 0.7213911418575063 
Epoch 6 Train Acc 0.7421875 Val Acc 0.723657363231552 
Epoch 7 Train Acc 0.7411979166666666 Val Acc 0.7259235846055979 
Epoch 8 Train Acc 0.7421875 Val Acc 0.726376828880407 
Epoch 9 Train Acc 0.7461458333333333 Val Acc 0.7276232506361322 
Epoch 10 Train Acc 0.7461458333333333 Val Acc 0.7284164281170483 
Pair Score: F1 Score 0.6687294407894736 Precision 70.31831335262505 Recall 0.6546509262465374 
Aspect Score: F1 Score 0.7409744496282255 Precision 74.01927721328951 Recall 0.7039257271468145 
---------------------------------------------------
