In [1]:
import pandas as pd
from sklearn import feature_extraction, model_selection, metrics, naive_bayes
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
# import gensim
# from gensim.models.word2vec import Word2Vec
import warnings
import string
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')
# from senticnet.senticnet import SenticNet
from sentic import SenticPhrase
import collections

In [2]:
# Read lexicon file 
fpositive = open('positive-words.txt','r',encoding = "ISO-8859-1")
positive_words = []
for each in fpositive:
    positive_words.append(each[:-1])
negative_words = []
fnegative = open('negative-words.txt','r', encoding = "ISO-8859-1")
for each1 in fnegative:
    negative_words.append(each1[:-1])
# print(positive_words[0])
# print(negative_words[0])

In [3]:
def readTrainDataandSample(data,dosampling,rate):
    ## Import csv using pandas
    df = pd.read_csv(data,index_col=0)
    df.dropna(inplace=True)
    df.reset_index(drop=True,inplace=True)
    df.info()
    
    if(dosampling):
        target_count = df[' class'].value_counts()
        print("Before Sampling: ",target_count)
        
        max_size = (df[' class'].value_counts().max())

        lst = [df]
        for class_index, group in df.groupby(' class'):
            lst.append(group.sample(int(abs(max_size-len(group))/rate), replace=True))
        frame_new = pd.concat(lst)

        target_count = frame_new[' class'].value_counts()
        print("After sampling:",target_count)

#         target_count.plot(kind='bar', title='Count (target)')

        sentence = frame_new[' text']
        aspectLocation = frame_new[' term_location']
        aspectTerm = frame_new[' aspect_term']
        y = frame_new[' class']

        df = pd.concat([sentence,aspectLocation,aspectTerm,y],axis=1)
        df.reset_index()
        df
        df.to_csv('finalizedText.csv')
    else:
        target_count = df[' class'].value_counts()
#         target_count.plot(kind='bar', title='Count (target)')
        return df

In [4]:
def dataCleaning(sentence):
    # def dataCleaning(sentence):
    sentenceCopy = sentence.copy()
    # y = y.copy()
    # Lower case words
    sentenceCopy = sentenceCopy.apply(lambda x: x.lower())
    # Replace comma with ,
    sentenceCopy = sentenceCopy.apply(lambda x: x.replace('[comma]', ','))
    # Remove numbers
    table = str.maketrans(dict.fromkeys(string.digits))
    sentenceCopy = sentenceCopy.apply(lambda x: x.translate(table))
    # Remove Punctuations
    table = str.maketrans(dict.fromkeys(string.punctuation))
    sentenceCopy = sentenceCopy.apply(lambda x: x.translate(table))
    
    return sentenceCopy

In [5]:
# Remove stop words and store each word
def removeStopwords(sentence):
    global df1
    sentenceCopy = sentence
    stop_words = set(stopwords.words('english')) 
    inputSentenceWords = []
    # aspect = [] 
    for i,each in enumerate(sentenceCopy):
        tokendWords = word_tokenize(each)
#         if(len(tokendWords) <=500):
        finalWords = [w for w in tokendWords if w not in stop_words]
        inputSentenceWords.append(finalWords)
#         else:
#             print(i)
#             df1.drop(df1[' class'][i])
    return inputSentenceWords

In [6]:
# Convert words to input sentence
def convertListofwords(inputSentenceWords):
    finalInputSentence = []
    for eachArray in inputSentenceWords:
        x = ' '.join(word for word in eachArray)
        finalInputSentence.append(x)
    return finalInputSentence

In [7]:
def lexicon(inputSentenceWords):
    global aspectTerm
    lexicon = []
    mat = []
    for i,eachSentence in enumerate(inputSentenceWords):
        total = 0
        as1 = str(aspectTerm[i])
        modifiedAspect = as1.split(' ')
        q = 0
        c = 0
    #     print(i)
        for eachWord in eachSentence:
            if(eachWord not in modifiedAspect):
                sp = SenticPhrase(eachWord)
                q += sp.get_polarity(eachWord)
                c += 1 
                if(eachWord in positive_words):
                    total += 1
                elif(eachWord in negative_words):
                    total -= 1
                else:
                    total += 0
        lexicon.append(total)
    #     q = 1
    #     c = 1
        if(c != 0):
            mat.append(total)
        else:
            mat.append(0)
    return mat

In [8]:
def getTFIDFVector(finalInputSentence,mind,nrange):
    vec = feature_extraction.text.TfidfVectorizer(min_df = mind, max_df = 0.8, sublinear_tf=True, use_idf=True, analyzer= 'word', ngram_range=(1,nrange),lowercase=True)
    trainDataVecs = vec.fit_transform(finalInputSentence)
#     print(type(trainDataVecs))
    tempMatrix = trainDataVecs.toarray()
    print(tempMatrix.shape)
#     print(type(tempMatrix))
    return trainDataVecs , tempMatrix , vec

In [9]:
def write(idno,preds,oneortwo):
    a = collections.Counter(preds)
    print("0 count: " , a[0])
    print("1 count: " , a[1])
    print("-1 count: " , a[-1])
    
    if (oneortwo == 1):
        output = open('Mrudula_Borkar_GopiKrishnan_NarasimhaGuptha_Data-1.txt','w')
    else:
        output = open('Mrudula_Borkar_GopiKrishnan_NarasimhaGuptha_Data-2.txt','w')    
    for i,y in enumerate(preds):
        output.write(idno[i]+";;"+str(y)+'\n')  

# Train Data 1

In [10]:
readTrainDataandSample("data-1_train.csv",True,5)

df1 = pd.read_csv("finalizedText.csv")
print(df1.head())

# df1 = readTrainDataandSample("data-1_train.csv",False,0)

sentence = df1[' text']
sentenceCopy = dataCleaning(sentence)

inputSentenceWords = removeStopwords(sentenceCopy)

sentence = df1[' text']
aspectTerm = df1[' aspect_term']
y = df1[' class']

print(y.shape)
print(aspectTerm.shape)
print(len(inputSentenceWords))

print(sentenceCopy[6])

print(inputSentenceWords[6])

finalInputSentence = convertListofwords(inputSentenceWords)

print(finalInputSentence[6])

matasarr = np.asarray(lexicon(inputSentenceWords))
# trainDataVecs , tempMatrix, idfmode11 = getTFIDFVector(finalInputSentence,0.000125,4)
trainDataVecs , tempMatrix, idfmode11 = getTFIDFVector(finalInputSentence,0.000125,4)


Xtrain = np.zeros((tempMatrix.shape[0],tempMatrix.shape[1]+1),dtype=float)

for i in range(matasarr.shape[0]):
    Xtrain[i] = np.hstack((tempMatrix[i],matasarr[i]))
print(Xtrain.shape)
print(y.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2203 entries, 0 to 2202
Data columns (total 4 columns):
 text             2203 non-null object
 aspect_term      2203 non-null object
 term_location    2203 non-null object
 class            2203 non-null int64
dtypes: int64(1), object(3)
memory usage: 68.9+ KB
Before Sampling:   1    939
-1    828
 0    436
Name:  class, dtype: int64
After sampling:  1    939
-1    850
 0    536
Name:  class, dtype: int64
   Unnamed: 0                                               text  \
0           0  Obviously one of the most important features o...   
1           1     Good for every day computing and web browsing.   
2           2  while the keyboard itself is alright[comma] th...   
3           3  Again[comma] the same problem[comma] the right...   
4           4         My problem was with DELL Customer Service.   

   term_location            aspect_term   class  
0         69--84        human interface       0  
1          9--28    every day c

In [11]:
# SVM on data 1

clfData1=LinearSVC(multi_class='crammer_singer',random_state=0)
print("Fitting SVM to training data....")    
preds = model_selection.cross_val_predict(clfData1, Xtrain, y, cv=10)
# print(preds)
accScore = metrics.accuracy_score(y,preds)
labels = [-1, 0, 1]
precision = metrics.precision_score(y,preds,average=None,labels=labels)
recall = metrics.recall_score(y,preds,average=None,labels=labels)
f1Score = metrics.f1_score(y,preds,average=None,labels=labels)
print("\nOverall Acurracy - SVM: ",accScore,"\n")
for i in range(len(labels)):
    print("Precision of %s class: %f" %(labels[i],precision[i]))
    print("Recall of %s class: %f" %(labels[i],recall[i]))
    print("F1-Score of %s class: %f" %(labels[i],f1Score[i]),"\n")
clfData1.fit(Xtrain, y)

Fitting SVM to training data....

Overall Acurracy - SVM:  0.753978494623656 

Precision of -1 class: 0.725079
Recall of -1 class: 0.812941
F1-Score of -1 class: 0.766500 

Precision of 0 class: 0.735714
Recall of 0 class: 0.576493
F1-Score of 0 class: 0.646444 

Precision of 1 class: 0.790966
Recall of 1 class: 0.801917
F1-Score of 1 class: 0.796404 



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l2', random_state=0,
     tol=0.0001, verbose=0)

# Train data 2

In [12]:
readTrainDataandSample("data-2_train.csv",True,4)

df1 = pd.read_csv("finalizedText.csv")
print(df1.head())

# df1 = readTrainDataandSample("data-2_train.csv",False,0)

sentence = df1[' text']
sentenceCopy = dataCleaning(sentence)

inputSentenceWords = removeStopwords(sentenceCopy)

sentence = df1[' text']
aspectTerm = df1[' aspect_term']
y = df1[' class']

print(y.shape)
print(aspectTerm.shape)
print(len(inputSentenceWords))

print(sentenceCopy[6])

print(inputSentenceWords[6])

finalInputSentence = convertListofwords(inputSentenceWords)

print(finalInputSentence[6])

matasarr = np.asarray(lexicon(inputSentenceWords))
trainDataVecs , tempMatrix , idfmode12 = getTFIDFVector(finalInputSentence,0.000125,3)

Xtrain = np.zeros((tempMatrix.shape[0],tempMatrix.shape[1]+1),dtype=float)

for i in range(matasarr.shape[0]):
    Xtrain[i] = np.hstack((tempMatrix[i],matasarr[i]))
print(Xtrain.shape)
print(y.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3602 entries, 0 to 3601
Data columns (total 4 columns):
 text             3602 non-null object
 aspect_term      3602 non-null object
 term_location    3602 non-null object
 class            3602 non-null int64
dtypes: int64(1), object(3)
memory usage: 112.6+ KB
Before Sampling:   1    2164
-1     805
 0     633
Name:  class, dtype: int64
After sampling:  1    2164
-1    1144
 0    1015
Name:  class, dtype: int64
   Unnamed: 0                                               text  \
0           0               But the staff was so horrible to us.   
1           1  To be completely fair[comma] the only redeemin...   
2           2  The food is uniformly exceptional[comma] with ...   
3           3  The food is uniformly exceptional[comma] with ...   
4           4  The food is uniformly exceptional[comma] with ...   

   term_location  aspect_term   class  
0          8--13        staff      -1  
1         57--61         food       1  
2   

In [13]:
# SVM on data 2

clfData2=LinearSVC(multi_class='crammer_singer',random_state=0)
print("Fitting SVM to training data....")    
preds = model_selection.cross_val_predict(clfData2, Xtrain, y, cv=10)
# print(preds)
accScore = metrics.accuracy_score(y,preds)
labels = [-1, 0, 1]
precision = metrics.precision_score(y,preds,average=None,labels=labels)
recall = metrics.recall_score(y,preds,average=None,labels=labels)
f1Score = metrics.f1_score(y,preds,average=None,labels=labels)
print("\nOverall Acurracy - SVM: ",accScore,"\n")
for i in range(len(labels)):
    print("Precision of %s class: %f" %(labels[i],precision[i]))
    print("Recall of %s class: %f" %(labels[i],recall[i]))
    print("F1-Score of %s class: %f" %(labels[i],f1Score[i]),"\n")
clfData2.fit(Xtrain, y)

Fitting SVM to training data....

Overall Acurracy - SVM:  0.7228776312745778 

Precision of -1 class: 0.661803
Recall of -1 class: 0.673951
F1-Score of -1 class: 0.667822 

Precision of 0 class: 0.636457
Recall of 0 class: 0.608867
F1-Score of 0 class: 0.622356 

Precision of 1 class: 0.793781
Recall of 1 class: 0.802218
F1-Score of 1 class: 0.797977 



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l2', random_state=0,
     tol=0.0001, verbose=0)

# Testing on Data1

In [14]:
# df = pd.read_csv(data,index_col=0)
df1test = pd.read_csv("Data-1_test.csv",index_col=0)
df1test[:5]
idno = df1test.index.values
sentence1 = df1test[' text']
aspectTerm1 = df1test[' aspect_term']
print(sentence1.shape)

sentenceCopy1 = dataCleaning(sentence1)

inputSentenceWords1 = removeStopwords(sentenceCopy1)

print(aspectTerm1.shape)
print(len(inputSentenceWords1))

print(sentenceCopy1[6])

print(inputSentenceWords1[6])

finalInputSentence1 = convertListofwords(inputSentenceWords1)

print(finalInputSentence1[6])


matasarr = np.asarray(lexicon(inputSentenceWords1))

testDataVecs1 = idfmode11.transform(finalInputSentence1)
tempMatrix = testDataVecs1.toarray()
print(tempMatrix.shape)

Xtest1 = np.zeros((tempMatrix.shape[0],tempMatrix.shape[1]+1),dtype=float)

for i in range(matasarr.shape[0]):
    Xtest1[i] = np.hstack((tempMatrix[i],matasarr[i]))
print(Xtest1.shape)

(638,)
(638,)
638
chatting with acer support i was advised the problem was corrupted operating system files
['chatting', 'acer', 'support', 'advised', 'problem', 'corrupted', 'operating', 'system', 'files']
chatting acer support advised problem corrupted operating system files
(638, 31782)
(638, 31783)


In [15]:
# predict on SVM
preds = clfData1.predict(Xtest1)
write(idno,preds,1)

0 count:  51
1 count:  373
-1 count:  214


# Testing on Data2

In [16]:
df2test = pd.read_csv("Data-2_test.csv",index_col=0)
df2test[:5]
idno2 = df2test.index.values
sentence2 = df2test[' text']
aspectTerm2 = df2test[' aspect_term']
print(sentence2.shape)

sentenceCopy2 = dataCleaning(sentence2)

inputSentenceWords2 = removeStopwords(sentenceCopy2)

print(aspectTerm2.shape)
print(len(inputSentenceWords2))

print(sentenceCopy2[6])

print(inputSentenceWords2[6])

finalInputSentence2 = convertListofwords(inputSentenceWords2)

print(finalInputSentence2[6])


matasarr = np.asarray(lexicon(inputSentenceWords2))

testDataVecs2 = idfmode12.transform(finalInputSentence2)
tempMatrix = testDataVecs2.toarray()
print(tempMatrix.shape)

Xtest2 = np.zeros((tempMatrix.shape[0],tempMatrix.shape[1]+1),dtype=float)

for i in range(matasarr.shape[0]):
    Xtest2[i] = np.hstack((tempMatrix[i],matasarr[i]))
print(Xtest2.shape)

(1120,)
(1120,)
1120
food is excellent and they also have empenadas and plaintains which are good for an afternoon snack
['food', 'excellent', 'also', 'empenadas', 'plaintains', 'good', 'afternoon', 'snack']
food excellent also empenadas plaintains good afternoon snack
(1120, 26932)
(1120, 26933)


In [17]:
# predict on SVM
preds = clfData2.predict(Xtest2)
write(idno2,preds,2)

0 count:  65
1 count:  834
-1 count:  221
