In [1]:
import pandas as pd 
import re
import nltk
from nltk.corpus import stopwords,wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import spacy
from spacy import displacy

<h2>
Load data as pandas dataframe.
</h2>

In [2]:
df_amazon = pd.read_csv('./datasets/archive/amazon_cells_labelled.txt',sep='\t',header=None)
df_imdb = pd.read_csv('./datasets/archive/imdb_labelled.txt',sep='\t',header=None)
df_yelp = pd.read_csv('./datasets/archive/yelp_labelled.txt',sep='\t',header=None)

print('df_amazon shape: \t',df_amazon.shape)
print('df_imdb shape: \t',df_imdb.shape)
print('df_yelp shape: \t',df_yelp.shape)

df_amazon shape: 	 (1000, 2)
df_imdb shape: 	 (748, 2)
df_yelp shape: 	 (1000, 2)


In [3]:
col_names = ['reviews','sentiment'] # new column names
df_amazon.columns = df_imdb.columns = df_yelp.columns = col_names   # changing column names

In [4]:
df_amazon.head()

Unnamed: 0,reviews,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [5]:
df_imdb.head()

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [6]:
df_yelp.head()

Unnamed: 0,reviews,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
df = pd.concat([df_amazon,df_imdb,df_yelp], ignore_index=True)  # Combine all dataset into one dataset
print('df shape: \t',df.shape)

df shape: 	 (2748, 2)


In [8]:
df = df.sample(frac=1)  # Shuffle the dataframe
df.head()

Unnamed: 0,reviews,sentiment
1776,Took an hour to get our food only 4 tables in ...,0
1926,"Not a weekly haunt, but definitely a place to ...",1
154,I've bought $5 wired headphones that sound bet...,0
870,Works fine.,1
1585,But the duet between the astronaut and his doc...,1


<h2>
Remove punctuations, special characters and stopwords from the text column. Convert the text to lower case.
</h2>

In [9]:
sw = stopwords.words('english') # list of stopwords
print("stopwords:\n",sw)

stopwords:
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so

In [10]:
def text_cleaning(reviews):
    result = []
    punc_tokenizer = nltk.RegexpTokenizer(r"\w+") # punctuations tokenizer.

    for review in reviews:
        text = review.lower()   # Convert the text to lower case.
        text = punc_tokenizer.tokenize(text)    # Removing punctuations, special characters.
        text = [word for word in text if word not in sw] # Removing stopwords

        text = " ".join(text) 
        result.append(text)
    
    return result


In [11]:
print(text_cleaning(df.reviews[0:4]))

['took hour get food 4 tables restaurant food luke warm sever running around like totally overwhelmed', 'weekly haunt definitely place come back every', 'bought 5 wired headphones sound better', 'works fine']


<h3>
Create two objects X and y. X will be the 'text' column dataframe and y will be the “class” column. create a CountVectorizer object and split the data into training and testing sets. Train a MultinomialNB model and Display the confusion Matrix
</h3>

In [12]:
X = df.reviews
y = df.sentiment

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=7)  # Train Test Split

In [14]:
print('X_train shape: \t',X_train.shape)
print('X_test shape: \t',X_test.shape)

X_train shape: 	 (2198,)
X_test shape: 	 (550,)


In [15]:
count_vec = CountVectorizer()   # CountVectorizer object

X_train = count_vec.fit_transform(X_train)
X_test = count_vec.transform(X_test)

In [16]:
model  = MultinomialNB()    # Multinomial Model
model.fit(X_train,y_train)

MultinomialNB()

In [17]:
y_pred = model.predict(X_test)

print('accuracy: \t', accuracy_score(y_test,y_pred))
print('f1_score: \t', f1_score(y_test,y_pred))

accuracy: 	 0.8345454545454546
f1_score: 	 0.8239845261121858


In [18]:
print(confusion_matrix(y_test,y_pred)) # Confusion Matrix

[[246  47]
 [ 44 213]]


<h2>
Display the POS tagging on the first 4 rows of ‘text’ 
</h2>

In [19]:
def get_pos_tag(reviews):
    for review in reviews:
        print('REVIEW TEXT: ',review, end='\n\n')
        punc_tokenizer = nltk.RegexpTokenizer(r"\w+")
        review = punc_tokenizer.tokenize(review)
        print('POS TAG: ',nltk.pos_tag(review),sep='\t',end='\n\n\n\n')


In [20]:
get_pos_tag(df.reviews[0:4])

REVIEW TEXT:  Took an hour to get our food only 4 tables in restaurant my food was Luke warm, Our sever was running around like he was totally overwhelmed.

POS TAG: 	[('Took', 'NNP'), ('an', 'DT'), ('hour', 'NN'), ('to', 'TO'), ('get', 'VB'), ('our', 'PRP$'), ('food', 'NN'), ('only', 'RB'), ('4', 'CD'), ('tables', 'NNS'), ('in', 'IN'), ('restaurant', 'NN'), ('my', 'PRP$'), ('food', 'NN'), ('was', 'VBD'), ('Luke', 'NNP'), ('warm', 'JJ'), ('Our', 'PRP$'), ('sever', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('around', 'RB'), ('like', 'IN'), ('he', 'PRP'), ('was', 'VBD'), ('totally', 'RB'), ('overwhelmed', 'JJ')]



REVIEW TEXT:  Not a weekly haunt, but definitely a place to come back to every once in a while.

POS TAG: 	[('Not', 'RB'), ('a', 'DT'), ('weekly', 'JJ'), ('haunt', 'NN'), ('but', 'CC'), ('definitely', 'RB'), ('a', 'DT'), ('place', 'NN'), ('to', 'TO'), ('come', 'VB'), ('back', 'RB'), ('to', 'TO'), ('every', 'DT'), ('once', 'RB'), ('in', 'IN'), ('a', 'DT'), ('while', 'NN')]





<h2>
Build and display a dependency parser tree
</h2>

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
text = "He sends down flaming oil barrels and ultimately delivers the blow that sends the wildlings retreating by swinging a huge scythe across the Wall, causing the ice to break and sending many falling to their deaths."

displacy.render(nlp(text),jupyter=True)