# Natural Language Processing

# Part 1 - Using the TextBlob Sentiment Analyzer

In [66]:
import pandas as pd
import numpy as np
import nltk

In [67]:
#load dataframe
df = pd.read_csv('labeledTrainData.tsv',sep = '\t')

In [68]:
#view df to ensure loaded correctly
df.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [69]:
#install textblob
! pip install -U textblob



In [70]:
from textblob import TextBlob

#2

In [71]:
#find counts of negative and positive reviews
df.groupby('sentiment').count()

Unnamed: 0_level_0,id,review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12500,12500
1,12500,12500


#3

In [72]:
#create functions to calculate polarity and subjectivity

def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [73]:
#create new columns for subjectivity, polarity and Analysis 
df['subjectivity'] = df['review'].apply(getSubjectivity)
df['polarity'] = df['review'].apply(getPolarity)

def getAnalysis(score):
    if score < 0:
        return 'Negative'
    else:
        return 'Positive'
     
df['Analysis'] = df['polarity'].apply(getAnalysis)

In [74]:
df.head(10)

Unnamed: 0,id,sentiment,review,subjectivity,polarity,Analysis
0,5814_8,1,With all this stuff going down at the moment w...,0.606746,0.001277,Positive
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.531111,0.256349,Positive
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0.562933,-0.053941,Negative
3,3630_4,0,It must be assumed that those who praised this...,0.492901,0.134753,Positive
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0.459818,-0.024842,Negative
5,8196_8,1,I dont know why people think this is such a ba...,0.664052,0.105882,Positive
6,7166_2,0,"This movie could have been very good, but come...",0.559464,-0.027054,Negative
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.530556,0.06875,Positive
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.45119,0.09881,Positive
9,8713_10,1,<br /><br />This movie is full of references. ...,0.409722,0.258333,Positive


#4

In [75]:
#create numeric function for analysis to use when checking accuracy, create a column to hold it
def getAnalysisNumeric(score):
    if score < 0:
        return 0
    else:
        return 1

df['Numeric Analysis'] = df['polarity'].apply(getAnalysisNumeric)

In [76]:
df.head(10)

Unnamed: 0,id,sentiment,review,subjectivity,polarity,Analysis,Numeric Analysis
0,5814_8,1,With all this stuff going down at the moment w...,0.606746,0.001277,Positive,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.531111,0.256349,Positive,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0.562933,-0.053941,Negative,0
3,3630_4,0,It must be assumed that those who praised this...,0.492901,0.134753,Positive,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0.459818,-0.024842,Negative,0
5,8196_8,1,I dont know why people think this is such a ba...,0.664052,0.105882,Positive,1
6,7166_2,0,"This movie could have been very good, but come...",0.559464,-0.027054,Negative,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.530556,0.06875,Positive,1
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.45119,0.09881,Positive,1
9,8713_10,1,<br /><br />This movie is full of references. ...,0.409722,0.258333,Positive,1


In [77]:
#create column for accuracy
df['accuracy'] = np.where(df['Numeric Analysis'] == df['sentiment'], 1,0)
df.head(10)

Unnamed: 0,id,sentiment,review,subjectivity,polarity,Analysis,Numeric Analysis,accuracy
0,5814_8,1,With all this stuff going down at the moment w...,0.606746,0.001277,Positive,1,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.531111,0.256349,Positive,1,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0.562933,-0.053941,Negative,0,1
3,3630_4,0,It must be assumed that those who praised this...,0.492901,0.134753,Positive,1,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0.459818,-0.024842,Negative,0,0
5,8196_8,1,I dont know why people think this is such a ba...,0.664052,0.105882,Positive,1,1
6,7166_2,0,"This movie could have been very good, but come...",0.559464,-0.027054,Negative,0,1
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.530556,0.06875,Positive,1,0
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.45119,0.09881,Positive,1,0
9,8713_10,1,<br /><br />This movie is full of references. ...,0.409722,0.258333,Positive,1,1


In [78]:
#calculate accuracy of model
df['accuracy'].mean() * 100

68.524

The model appears to be better than random guessing (50%) at 68.5%

# Using Vader

In [79]:
#create SentimentIntensity Analyzer object and function to generate scores
sid_obj = SentimentIntensityAnalyzer()

def sentiment_vader(text):
    return sid_obj.polarity_scores(text)['compound']

In [80]:
#create column of vader scores
df['vader_compound'] = df['review'].apply(sentiment_vader)

In [81]:
#create vader sentiment column dictating whether sentiment is postive or negative
df['vader_sentiment'] = np.where(df['vader_compound'] >= 0.05, 1, np.where(df['vader_compound'] <= -0.05, 0, 1))
df.head(10)

Unnamed: 0,id,sentiment,review,subjectivity,polarity,Analysis,Numeric Analysis,accuracy,vader_compound,vader_sentiment
0,5814_8,1,With all this stuff going down at the moment w...,0.606746,0.001277,Positive,1,1,-0.8879,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.531111,0.256349,Positive,1,1,0.9736,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0.562933,-0.053941,Negative,0,1,-0.9883,0
3,3630_4,0,It must be assumed that those who praised this...,0.492901,0.134753,Positive,1,0,-0.1202,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0.459818,-0.024842,Negative,0,0,0.6115,1
5,8196_8,1,I dont know why people think this is such a ba...,0.664052,0.105882,Positive,1,1,0.3935,1
6,7166_2,0,"This movie could have been very good, but come...",0.559464,-0.027054,Negative,0,1,-0.6863,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,0.530556,0.06875,Positive,1,0,-0.4517,0
8,319_1,0,"A friend of mine bought this film for £1, and ...",0.45119,0.09881,Positive,1,0,0.9707,1
9,8713_10,1,<br /><br />This movie is full of references. ...,0.409722,0.258333,Positive,1,1,0.7184,1


In [82]:
#create a column to check for accurary
df['vader_accuracy'] = np.where(df['vader_sentiment'] == df['sentiment'], 1, 0)
df.head(5)

Unnamed: 0,id,sentiment,review,subjectivity,polarity,Analysis,Numeric Analysis,accuracy,vader_compound,vader_sentiment,vader_accuracy
0,5814_8,1,With all this stuff going down at the moment w...,0.606746,0.001277,Positive,1,1,-0.8879,0,0
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.531111,0.256349,Positive,1,1,0.9736,1,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,0.562933,-0.053941,Negative,0,1,-0.9883,0,1
3,3630_4,0,It must be assumed that those who praised this...,0.492901,0.134753,Positive,1,0,-0.1202,0,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,0.459818,-0.024842,Negative,0,0,0.6115,1,1


In [84]:
#calculate accuracy
df['vader_accuracy'].mean()

0.69224

At 69.2% the Vader model is slightly more accurate than TextBlob (68.5%) and is more accurate than guessing (50%)

In [85]:
# load in new df for simplicity
df_two = pd.read_csv('labeledTrainData.tsv',sep = '\t')
df_two.head(10)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come..."
7,10633_1,0,I watched this video at a friend's house. I'm ...
8,319_1,0,"A friend of mine bought this film for £1, and ..."
9,8713_10,1,<br /><br />This movie is full of references. ...


In [86]:
#import regular expressions and create function to do text cleaning
import re

def clean_text(text):
    text = text.lower() #converts text to lowercase
    text = re.sub(r'[^\w\s\d]+','', text) #remove special characters

    return text

In [87]:
#create column for cleaned review
df_two['review_cleaned'] = df_two['review'].apply(clean_text)

In [88]:
df_two.head(10)

Unnamed: 0,id,sentiment,review,review_cleaned
0,5814_8,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hines...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...
3,3630_4,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...
5,8196_8,1,I dont know why people think this is such a ba...,i dont know why people think this is such a ba...
6,7166_2,0,"This movie could have been very good, but come...",this movie could have been very good but comes...
7,10633_1,0,I watched this video at a friend's house. I'm ...,i watched this video at a friends house im gla...
8,319_1,0,"A friend of mine bought this film for £1, and ...",a friend of mine bought this film for 1 and ev...
9,8713_10,1,<br /><br />This movie is full of references. ...,br br this movie is full of references like ma...


In [89]:
# load stopwords
from nltk.corpus import stopwords

In [90]:
# remove stop words
stop = stopwords.words('English')
df_two['review_stop'] = df_two['review_cleaned'].apply(lambda x:' '.join([word for word in x.split() if word not in (stop)]))
df_two.head(5)

Unnamed: 0,id,sentiment,review,review_cleaned,review_stop
0,5814_8,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...,stuff going moment mj ive started listening mu...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hines...,classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious 80s e...


#4

In [91]:
#import tokenizer and PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

In [92]:
#create Porter variable
Porter = PorterStemmer()

In [93]:
#create df columns for tokenized and stemmed
df_two['review_tokenized'] = df_two['review_stop'].apply(lambda x: filter(None,x.split()))
df_two['review_stemmed'] = df_two['review_tokenized'].apply(lambda x: [Porter.stem(y) for y in x])
df_two['review_stem_sentence'] = df_two['review_stemmed'].apply(lambda x:" ".join(x))
df_two.head(5)

Unnamed: 0,id,sentiment,review,review_cleaned,review_stop,review_tokenized,review_stemmed,review_stem_sentence
0,5814_8,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...,stuff going moment mj ive started listening mu...,<filter object at 0x000001A2A131F610>,"[stuff, go, moment, mj, ive, start, listen, mu...",stuff go moment mj ive start listen music watc...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hines...,classic war worlds timothy hines entertaining ...,<filter object at 0x000001A2C7D00580>,"[classic, war, world, timothi, hine, entertain...",classic war world timothi hine entertain film ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...,film starts manager nicholas bell giving welco...,<filter object at 0x000001A2C7D00DC0>,"[film, start, manag, nichola, bell, give, welc...",film start manag nichola bell give welcom inve...
3,3630_4,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...,must assumed praised film greatest filmed oper...,<filter object at 0x000001A2C7D00460>,"[must, assum, prais, film, greatest, film, ope...",must assum prais film greatest film opera ever...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious 80s e...,<filter object at 0x000001A2C7D00070>,"[superbl, trashi, wondrous, unpretenti, 80, ex...",superbl trashi wondrous unpretenti 80 exploit ...


In [94]:
#create BOW
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag_of_words = count.fit_transform(df_two['review_stem_sentence'])

In [95]:
#BOW dimensions
bag_of_words.shape

(25000, 92532)

In [96]:
#create Tfidf Matrix and show dimensions
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(df_two['review_stem_sentence'])
feature_matrix.shape

(25000, 92532)