<a href="https://colab.research.google.com/github/miacarroll1207/NLP-Practice/blob/main/Movie_and_Amazon_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#load data with pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')


In [None]:
data_yelp = pd.read_csv('yelp_labelled.txt', sep='\t',header=None)

In [None]:
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
#split into columns with names
column_names = ['Review', 'Sentiment']
data_yelp.columns = column_names

In [None]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
data_yelp.shape

(1000, 2)

In [None]:
#load amazon data
data_amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)

In [None]:
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
data_amazon.columns = column_names

In [None]:
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
data_amazon.shape

(1000, 2)

In [None]:
#now load imdb
data_imdb = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)

In [None]:
data_imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
data_imdb.columns = column_names

In [None]:
data_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
data_imdb.shape

(748, 2)

In [None]:
# now merge all of the data
data = pd.concat([data_amazon, data_imdb, data_yelp], ignore_index=True)

In [None]:
data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [None]:
data.shape

(2748, 2)

In [None]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [None]:
x = data['Review']
y = data['Sentiment']

In [None]:
#clean data
#remove stop words and punctuation
#do lemmatization
import string
punc = string.punctuation


In [None]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
stopwords = list(STOP_WORDS)

In [None]:
#data cleaning function
def clean_data(sentence):
  doc = nlp(sentence)

  tokens = []
  for token in doc:
    if token.lemma != "-PRON-":
      temp = token.lemma_.lower().strip()
    else:
      temp = token.lower_
    tokens.append(temp)
    cleaned_tokens = []
  for token in tokens:
    if token not in stopwords and token not in punc:
      cleaned_tokens.append(token)
  return cleaned_tokens

In [None]:
clean_data("Hello all, It's a beautiful day outside there!")


['hello', 'beautiful', 'day', 'outside']

In [None]:
#TF IDF
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [None]:
tokenizer = TfidfVectorizer(tokenizer=clean_data)

In [None]:
classifier = LinearSVC()

In [None]:
classifier

In [None]:
#TRAIN THE MODEL
#first split into test and train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
x_train.shape, x_test.shape

((2198,), (550,))

In [None]:
x_train.head()

2572                                       Awful service.
526                      horrible, had to switch 3 times.
1509    The fat computer geek was unbelievable, the bi...
144     The phone takes FOREVER to charge like 2 to 5 ...
2483    Probably never coming back, and wouldn't recom...
Name: Review, dtype: object

In [None]:
x_test.head()

1801    They have horrible attitudes towards customers...
1590                                              10/10  
2382              Ordered burger rare came in we'll done.
2447    Anyways, The food was definitely not filling a...
1147               This is actually a very smart movie.  
Name: Review, dtype: object

In [None]:
###fit the data
clf = Pipeline([('tfidf',tokenizer), ('clf',classifier)])

In [None]:
clf.fit(x_train, y_train)



In [None]:
### NOW PREDICT
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
y_pred = clf.predict(x_test)

In [None]:
# confusion_matrix
confusion_matrix(y_test, y_pred)

array([[221,  60],
       [ 58, 211]])

In [None]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.79      0.79       281
           1       0.78      0.78      0.78       269

    accuracy                           0.79       550
   macro avg       0.79      0.79      0.79       550
weighted avg       0.79      0.79      0.79       550



In [None]:
accuracy_score(y_test, y_pred)


0.7854545454545454

In [None]:
clf.predict(["Wow, I am learning Natural Language Processing in fun fashion!"])


array([1])

In [None]:
clf.predict(["this is boring"])

array([0])