#Author :Pankaj D Gaikwad


# Movie Review Example

In [1]:

import nltk,random

In [2]:
from nltk.corpus import  movie_reviews

In [3]:
data=[(list(movie_reviews.words(fileid)),category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

#Preprocessing on dataset


In [4]:
from nltk.corpus import stopwords

In [86]:
import string
from itertools import chain

from nltk.corpus import movie_reviews as mr
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc

stop=list(set(stopwords.words('english')))
#stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]


In [8]:
random.shuffle(documents)
train=documents[:1500]
test=documents[1500:]

In [70]:
vocabulary=[]
for i in range(0,len(train)):
    vocabulary.extend(train[i][0])

In [71]:
print(len(vocabulary))
vocabulary=list(set(vocabulary))
vocabulary.sort()
print(len(vocabulary))

535214
35468


#Feature Extraction

## Preparing unigram feature vectror based on presence/absence of words in vocabulary


In [115]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0][0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

## Add sentiment scores from sentiwordnet, here we take the average sentiment scores of all word

In [162]:
from nltk.corpus import sentiwordnet as swn

In [163]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        words = tup[0][0].lower()
        #words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [166]:
#Merge two scores
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [167]:
#extract the sentiment labels by making positive reviews as class 1 and negative reviews as class 2
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [168]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [169]:
training_unigram_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_swn_features = get_senti_wordnet_features(train)

training_features = merge_features(training_unigram_features,training_swn_features)

training_labels = get_lables(train)

test_unigram_features = get_unigram_features(test,vocabulary)
test_swn_features=get_senti_wordnet_features(test)
test_features= merge_features(test_unigram_features,test_swn_features)

test_gold_labels = get_lables(test)

In [170]:
# SVM Classifier
#Refer to : http://scikit-learn.org/stable/modules/svm.html
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC(penalty='l2', C=0.01).fit(training_features,training_labels)
predictions = svm_classifier.predict(training_features)

print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of linear SVM classifier is:
Training data	0.6993333333333334
Test data	0.528


In [181]:
#logistic Regression
from sklearn import linear_model
Logreg=linear_model.LogisticRegression(C=1e5)
LogClassifier=Logreg.fit(training_features,training_labels)
predictions = LogClassifier.predict(training_features)

print("Precision of linear Logistic Regression is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = LogClassifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of linear Logistic Regression is:
Training data	0.8386666666666667
Test data	0.54


In [184]:
#Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
NBClassfier=clf.fit(training_features,training_labels)
predictions = NBClassfier.predict(training_features)

print("Precision of linear Naive Bayes Regression is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = NBClassfier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of linear Naive Bayes Regression is:
Training data	0.78
Test data	0.522


In [188]:
#Deicision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier()
DTClassfier=clf.fit(training_features,training_labels)
predictions = DTClassfier.predict(training_features)

print("Precision of  Decisoin Tree Classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = DTClassfier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))

Precision of  Decisoin Tree Classifier is:
Training data	0.8386666666666667
Test data	0.562


In [189]:
#Vader Analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()



In [208]:
#list of movie reviews
sents=[]
for i in range(0,len(documents)):
    sents.append(' '.join(documents[i][0]))


In [229]:
#Vader on Training Data
pol=[]
for i in range(0,len(train)):
    pol.append(sid.polarity_scores(' '.join(train[i][0])))
    
    

In [246]:
labels_train=[]
for i in range(0,len(pol)):
    if pol[i]['pos'] >= 0.5:
        lables_train.append(1)
    else:
        labels_train.append(-1)
               

In [232]:
#Vader on Test data
polt=[]
for i in range(0,len(test)):
    polt.append(sid.polarity_scores(' '.join(test[i][0])))

In [239]:
labels=[]
for i in range(0,len(polt)):
    if polt[i]['pos'] >= 0.5:
        lables.append(1)
    else:
        labels.append(-1)
               
                     

In [252]:
acc_test=calculate_precision(labels,test_gold_labels)

In [253]:
acc_train=calculate_precision(labels_train,training_labels)

In [257]:
print('Accracy of vader analysis on training data',':',acc_train)
print('Accracy of vader analysis on test data',':',acc_test)

Accracy of vader analysis on training data : 0.168
Accracy of vader analysis on test data : 0.562


# Twitter samples example

In [4]:
#http://www.nltk.org/_modules/nltk/sentiment/util.html
#http://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost
import nltk

In [2]:
from nltk.corpus import twitter_samples

In [7]:
from nltk.corpus import stopwords
stop=list(set(stopwords.words('English')))

In [None]:
import re
from nltk.tokenize import word_tokenize

In [24]:
categorized_tweets = ([(t, "pos") for t in twitter_samples.strings("positive_tweets.json")] +
                            [(t, "neg") for t in twitter_samples.strings("negative_tweets.json")])


smilies = [':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3', ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(', '(', ')', 'via']


'''categorized_tweets_tokens = []
for tweet in categorized_tweets:
    text = tweet[0]
    for smiley in smilies:
        text = re.sub(re.escape(smiley), '', text)
    categorized_tweets_tokens.append((word_tokenize(text), tweet[1]))
'''

In [None]:
import re
def processTweet(tweet):
    # process the tweets

    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    #remove numbers
    tweet=re.sub('[0-9]','',tweet)
    return tweet

#end

In [134]:
for i in range(0,len(categorized_tweets)):
    categorized_tweets_remove=processTweet(categorized_tweets[i][0])

In [137]:
categorized_tweets = ([[t, "pos"] for t in twitter_samples.strings("positive_tweets.json")] +
                            [[t, "neg"] for t in twitter_samples.strings("negative_tweets.json")])

In [148]:
clean_tweets=[]
for i in range(0,len(categorized_tweets)):
    clean_tweets.append([processTweet(categorized_tweets[i][0]),categorized_tweets[i][1]])

In [168]:
from nltk import word_tokenize

In [179]:
vocabulary=[w.lower() for i in range(0,len(clean_tweets)) for w in word_tokenize(clean_tweets[i][0]) if w.lower() not in stop and w.lower() not in smilies]
vocabulary=list(set(vocabulary))

In [181]:
vocabulary.sort()

In [183]:
random.shuffle(clean_tweets)

In [187]:
train=clean_tweets[:7000]
test=clean_tweets[7000:]

In [188]:
def get_unigram_features(data,vocab):
    fet_vec_all = []
    for tup in data:
        single_feat_vec = []
        sent = tup[0].lower() #lowercasing the dataset
        for v in vocab:
            if sent.__contains__(v):
                single_feat_vec.append(1)
            else:
                single_feat_vec.append(0)
        fet_vec_all.append(single_feat_vec)
    return fet_vec_all

In [195]:
from nltk.corpus import sentiwordnet as swn

In [190]:
def get_senti_wordnet_features(data):
    fet_vec_all = []
    for tup in data:
        sent = tup[0].lower()
        words = sent.split()
        pos_score = 0
        neg_score = 0
        for w in words:
            senti_synsets = swn.senti_synsets(w.lower())
            for senti_synset in senti_synsets:
                p = senti_synset.pos_score()
                n = senti_synset.neg_score()
                pos_score+=p
                neg_score+=n
                break #take only the first synset (Most frequent sense)
        fet_vec_all.append([float(pos_score),float(neg_score)])
    return fet_vec_all

In [191]:
def merge_features(featureList1,featureList2):
    # For merging two features
    if featureList1==[]:
        return featureList2
    merged = []
    for i in range(len(featureList1)):
        m = featureList1[i]+featureList2[i]
        merged.append(m)
    return merged

In [192]:
def get_lables(data):
    labels = []
    for tup in data:
        if tup[1].lower()=="neg":
            labels.append(-1)
        else:
            labels.append(1)
    return labels

In [193]:
def calculate_precision(prediction, actual):
    prediction = list(prediction)
    correct_labels = [predictions[i]  for i in range(len(predictions)) if actual[i] == predictions[i]]
    precision = float(len(correct_labels))/float(len(prediction))
    return precision

In [196]:
training_unigram_features = get_unigram_features(train,vocabulary) # vocabulary extracted in the beginning
training_swn_features = get_senti_wordnet_features(train)

training_features = merge_features(training_unigram_features,training_swn_features)

training_labels = get_lables(train)

test_unigram_features = get_unigram_features(test,vocabulary)
test_swn_features=get_senti_wordnet_features(test)
test_features= merge_features(test_unigram_features,test_swn_features)

test_gold_labels = get_lables(test)

In [197]:
# SVM Classifier
#Refer to : http://scikit-learn.org/stable/modules/svm.html
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC(penalty='l2', C=0.01).fit(training_features,training_labels)
predictions = svm_classifier.predict(training_features)

print("Precision of linear SVM classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = svm_classifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of linear SVM classifier is:
Training data	0.8275714285714286
Test data	0.7636666666666667


In [198]:

#logistic Regression
from sklearn import linear_model
Logreg=linear_model.LogisticRegression(C=1e5)
LogClassifier=Logreg.fit(training_features,training_labels)
predictions = LogClassifier.predict(training_features)

print("Precision of  Logistic Regression is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = LogClassifier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of linear Logistic Regression is:
Training data	0.993
Test data	0.6633333333333333


In [199]:
#Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
clf=GaussianNB()
NBClassfier=clf.fit(training_features,training_labels)
predictions = NBClassfier.predict(training_features)
print("Precision of  Naive Bayes Regression is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = NBClassfier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of  Naive Bayes Regression is:
Training data	0.8124285714285714
Test data	0.5936666666666667


In [200]:
#Deicision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
clf=DecisionTreeClassifier()
DTClassfier=clf.fit(training_features,training_labels)
predictions = DTClassfier.predict(training_features)

print("Precision of  Decision Tree Classifier is:")
precision = calculate_precision(predictions,training_labels)
print("Training data\t" + str(precision))
predictions = DTClassfier.predict(test_features)
precision = calculate_precision(predictions,test_gold_labels)
print("Test data\t" + str(precision))


Precision of  Decision Tree Classifier is:
Training data	0.9977142857142857
Test data	0.6776666666666666


In [201]:
#Vader Analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()




In [202]:
#list of tweets
sents=[]
for i in range(0,len(clean_tweets)):
    sents.append(' '.join(clean_tweets[i][0]))

#Vader on Training Data
pol=[]
for i in range(0,len(train)):
    pol.append(sid.polarity_scores(' '.join(train[i][0])))


labels_train=[]
for i in range(0,len(pol)):
    if pol[i]['pos'] >= 0.5:
        lables_train.append(1)
    else:
        labels_train.append(-1)

#Vader on Test data
polt=[]
for i in range(0,len(test)):
    polt.append(sid.polarity_scores(' '.join(test[i][0])))

labels=[]
for i in range(0,len(polt)):
    if polt[i]['pos'] >= 0.5:
        lables.append(1)
    else:
        labels.append(-1)
               
acc_test=calculate_precision(labels,test_gold_labels)
acc_train=calculate_precision(labels_train,training_labels)
print('Accracy of vader analysis on training data',':',acc_train)
print('Accracy of vader analysis on test data',':',acc_test)


Accracy of vader analysis on training data : 0.21842857142857142
Accracy of vader analysis on test data : 0.6776666666666666


# OUTPUT


In [None]:
'''
Datasets        Naïve Bayes    SVM    Decision Tree   Logistic Regression     Vader Sentiment Analysis
Movie_Reviews      52.2         52.8       56.2                 54                    56.2
Twitter_Samples    59.37       76.37        67.77              66.33                   67.77
'''