In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

## Lexicon labelling

Label review based on [lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm).

In [None]:
lexicon = pd.read_csv("NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt",
                      sep='\t', names=['Word', 'Emotion', 'Yes'])
review = pd.read_excel('XunWei Data Annotation.xlsx')

sentiment = ['negative', 'positive']
emotion = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

In [None]:
def generateLexi(col):
    df = lexicon[(lexicon['Emotion'] == col) & (lexicon['Yes'] == 1)]
    return df

def compareSenti(word, label=sentiment):
    for j in label:
        lst = generateLexi(j)['Word']
    
        for i in lst:
            if " " + i.strip() + " " in word:
                return j
            
def compareEmo(word, label=emotion):
    for j in label:
        lst = generateLexi(j)['Word']
    
        for i in lst:
            if " " + i.strip() + " " in word:
                return j

In [None]:
review['Sentiment'] = review['Review'].apply(compareSenti)
review['Emotions'] = review['Review'].apply(compareEmo)

In [None]:
review.head(30)

In [None]:
review = review.sort_values(by=['Emotions'])

In [None]:
review.to_excel('categorised1.xlsx')

In [None]:
review[review['Sentiment'] == 'positive']

## Model

https://www.kaggle.com/oumaimahourrane/sentiment-analysis-ml-models-comparison

Automate labelling by building a model.

In [15]:
data = pd.read_excel('predicted.xlsx')

# data = data.rename(columns = {'Unnamed: 0': 'Number'})
data.drop("Unnamed: 0", axis=1, inplace=True)
# data = data[['Number', 'Review ID', 'Review', 'Sentiment', 'Emotions', 'Unnamed: 6']]
data.head()

Unnamed: 0,Number,Review ID,Review,Sentiment,Emotions,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,1309,R2910,This is No #1 useless scam app in the world. I...,negative,anger,,,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
1,729,R2330,Easy to use. But the most important thing boos...,positive,joy,,,positive,negative,neutral,,,,,
2,951,R2552,I'm recently opted for secure2u for transactio...,negative,sadness,,,,,,,,,,
3,497,R498,what happen to boost app??? I can't log in and...,negative,fear,,,,,,,,,,
4,732,R2333,Worst apps I ever seen. When you are pay toll ...,negative,anger,,,,,,,,,,


In [10]:
data[data['Unnamed: 6'] == 12]

Unnamed: 0,Number,Review ID,Review,Sentiment,Emotions,Unnamed: 6
727,9,R010,Only can use at certain tols surrounding KL ci...,negative,anger,12


In [52]:
current_x = 727

train_senti = data[['Review', 'Sentiment']].loc[:current_x]
train_senti.tail()

Unnamed: 0,Review,Sentiment
723,the best bank service ever,positive
724,"I register new account, the apps show this num...",negative
725,Wish that I could perform Tabung Haji transact...,negative
726,"Needed more merchants to participate, such as ...",negative
727,Only can use at certain tols surrounding KL ci...,negative


In [39]:
train_senti = data[['Review', 'Sentiment']].loc[:current_x]
train_emo = data[['Review', 'Emotions']].loc[:current_x]
train_emo['Emotions'].fillna('neutral', inplace=True)
train_senti['Sentiment'].fillna('neutral', inplace=True)
# train.drop(train[train.Sentiment == 'neutral'].index, axis=0, inplace=True)

### Wordcloud

In [6]:
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

def plot_cloud(train, label):
    label_tweets = train[train.iloc[:, 1] == label]
    label_string = []
    for t in label_tweets.Review:
        label_string.append(t)
    label_string = pd.Series(label_string).str.cat(sep=' ')
    from wordcloud import WordCloud

    wordcloud = WordCloud(width=1600, height=600,max_font_size=200, background_color='white').generate(label_string)
    plt.figure(figsize=(12,10))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    
plot_cloud(train_emo, 'anger')

### Build model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
SEED = 2000

In [41]:
def preprocess(label):
    if label == 'senti':
        lb = LabelBinarizer()
        x_train, x_validation, y_train, y_validation = train_test_split(
            train_senti.Review,lb.fit_transform(train_senti[['Sentiment']]), test_size=.2, random_state=SEED)
    elif label == 'emo':
        lb = LabelBinarizer()
        x_train, x_validation, y_train, y_validation = train_test_split(
            train_emo.Review,lb.fit_transform(train_emo[['Emotions']]), test_size=.2, random_state=SEED)
    return lb, x_train, x_validation, y_train, y_validation

### Prediction

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Multi class
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

vec = TfidfVectorizer()

In [23]:
def train_model(x_train, y_train):
    model = KNeighborsClassifier()
    pipeline = Pipeline([
                ('vectorizer', vec),
                ('classifier', model)
            ])
    pipeline.fit(x_train, y_train)
    return pipeline

def predict_label(review):
    return lb.inverse_transform(pipeline.predict(review))

In [48]:
work = data.iloc[current_x -1:]

# sentiment
lb, x_train, x_validation, y_train, y_validation = preprocess('senti')
pipeline = train_model(x_train, y_train)
work['Sentiment'] = work[['Review']].apply(predict_label)

# emotion
lb, x_train, x_validation, y_train, y_validation = preprocess('emo')
pipeline = train_model(x_train, y_train)
work['Emotions'] = work[['Review']].apply(predict_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [50]:
work[work['Sentiment'] == 'negative'].head(3)

Unnamed: 0,Number,Review ID,Review,Sentiment,Emotions,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
726,276,R277,"Needed more merchants to participate, such as ...",negative,anger,,,,,,,,,,
727,9,R010,Only can use at certain tols surrounding KL ci...,negative,anger,12.0,,,,,,,,,
728,51,R052,why it says im already registered my account b...,negative,anger,,,,,,,,,,


In [49]:
work[work['Emotions'] == 'anticipation'].head(3)

Unnamed: 0,Number,Review ID,Review,Sentiment,Emotions,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
747,217,R218,Please include the function for us to authenti...,negative,anticipation,,,,,,,,,,
828,57,R058,"After updated, your mission page can't get in!...",negative,anticipation,,,,,,,,,,
864,160,R161,Perfect! Please keep it as minimal as it is now.,negative,anticipation,,,,,,,,,,


In [51]:
result = pd.concat([data.loc[:current_x], work])
result.to_excel('predicted2.xlsx')

### Match data

Match original data with annotated data.

In [11]:
ori = pd.read_excel('XunWei Data Annotation.xlsx')
ori = ori[['Review ID', 'Review']]

annotated = pd.read_excel('predicted.xlsx')
annotated = annotated[['Review ID', 'Sentiment', 'Emotions']]

In [12]:
ori.head(2)

Unnamed: 0,Review ID,Review
0,R001,so helpful..
1,R002,Not functioning at all after upgrading/update....


In [14]:
merged = pd.merge(ori, annotated, on='Review ID', how='left')

merged['Emotions'].fillna('neutral', inplace=True)
merged['Sentiment'].fillna('neutral', inplace=True)

In [17]:
merged.head()

Unnamed: 0,Review ID,Review,Sentiment,Emotions
0,R001,so helpful..,neutral,neutral
1,R002,Not functioning at all after upgrading/update....,neutral,neutral
2,R003,This is the biggest SCAM in this country at th...,negative,anger
3,R004,horrible apps. took me such a long time to loa...,negative,anger
4,R005,too frequent update version. troublesome,negative,anger


In [19]:
merged.to_excel('Xun Wei annotated.xlsx', index=False)

### Determine the best k

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
Sum_of_squared_distances = []
K = range(1,30)
for k in K:
    km = KNeighborsClassifier(n_neighbors=k)
    pipeline = Pipeline([
            ('vectorizer', vec),
            ('classifier', model)
        ])
    pipeline.fit(x_train, y_train)
    pred = pipeline.predict(x_validation)
    w = accuracy_score(pred, y_validation)
    Sum_of_squared_distances.append(w)

In [None]:
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

#### Features extraction

In [None]:
# Binary class
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier, PassiveAggressiveClassifier, Perceptron, LogisticRegression
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import accuracy_score, precision_recall_curve, average_precision_score, roc_curve, auc
import numpy as np
from time import time

In [None]:
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
  ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
    ]
zipped_clf = zip(names,classifiers)

vec = TfidfVectorizer()

In [None]:
def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
  
    #Compute the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    #Compute the precision and recall
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    #Compute the average precision
    average_precision = average_precision_score(y_test, y_pred)
  
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print ("accuracy score: {0:.2f}%".format(accuracy*100))
    print ("-"*80)
    return accuracy, precision, recall, average_precision, fpr, tpr

In [None]:
def classifier_comparator(vectorizer=vec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n,c in classifier:
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        print ("Validation result for {}".format(n))
        print (c)
        clf_acc, prec, rec, avg, fp, tp = acc_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n,clf_acc,prec, rec, avg, fp, tp))
    return result

result = classifier_comparator(n_features=100000,ngram_range=(1,3))