In [26]:
import pandas as pd
import numpy as np
import re # regular expression
import nltk # natural language tool kit

In [27]:
# Fresh restaurant reviews that will be scored by the sentiment model
fresh_reviews_csv = 'a2_RestaurantReviews_FreshDump.csv'
df = pd.read_csv(fresh_reviews_csv)

# Preview the first rows to confirm the file loaded as expected
df.head()

Unnamed: 0,Review
0,Spend your money elsewhere.
1,Their regular toasted bread was equally satisf...
2,The Buffet at Bellagio was far from what I ant...
3,"And the drinks are WEAK, people!"
4,#NAME?


### Data Cleaning 

In [28]:
from nltk.corpus import stopwords           # lists of common words to remove
from nltk.stem.porter import PorterStemmer  # reduces words to their stems

# Make sure the NLTK stop-word corpus is available locally
nltk.download('stopwords')

# English stop words and the stemmer used by the cleaning step
all_stopwords = stopwords.words('english')
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ext.mark.mariscotes\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Build the stop-word lookup once rather than once per word
stopword_set = set(all_stopwords)

corpus = []

# Clean every row so that `corpus` stays aligned one-to-one with `df`.
# Missing reviews (NaN after read_csv) are turned into empty strings instead
# of crashing re.sub, and iterating over the column values avoids depending
# on the DataFrame having a default 0..n-1 index.
for raw_review in df['Review'].fillna('').astype(str):
    # Keep letters only; every other character becomes a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Normalise case and tokenise on whitespace
    review = review.lower().split()
    # Drop stop words, then stem what remains
    review = [ps.stem(word) for word in review if word not in stopword_set]
    # Store the cleaned review as a single space-separated string
    corpus.append(' '.join(review))

In [30]:
# Display the cleaned review strings collected in `corpus`
corpus

['spend money elsewher',
 'regular toast bread equal satisfi occasion pat butter mmmm',
 'buffet bellagio far anticip',
 'drink weak peopl',
 'name',
 'also feel like chip bought made hous',
 'disappoint dinner went elsewher dessert',
 'chip sal amaz',
 'return',
 'new fav vega buffet spot',
 'serious cannot believ owner mani unexperienc employe run around like chicken head cut',
 'sad',
 'felt insult disrespect could talk judg anoth human like',
 'call steakhous properli cook steak understand',
 'impress concept food',
 'thing crazi guacamol like pur ed',
 'realli noth postino hope experi better',
 'got food poison buffet',
 'brought fresh batch fri think yay someth warm',
 'hilari yummi christma eve dinner rememb biggest fail entir trip us',
 'needless say go back anytim soon',
 'place disgust',
 'everi time eat see care teamwork profession degre',
 'ri style calamari joke',
 'howev much garlic fondu bare edibl',
 'could bare stomach meal complain busi lunch',
 'bad lost heart finish

### Data Transformation

In [37]:
# Loading BoW dictionary
# CountVectorizer is imported alongside the load; the pickled object is
# presumably a fitted CountVectorizer -- confirm against the training notebook.
from sklearn.feature_extraction.text import CountVectorizer
import pickle

cvFile = 'c1_BoW_Sentiment_Model.pkl'

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle once the object is loaded.
with open(cvFile, "rb") as cv_handle:
    cv = pickle.load(cv_handle)

In [38]:
# Encode the cleaned reviews with the loaded vectorizer, then convert the
# result to an array via toarray()
bow_matrix = cv.transform(corpus)
X_fresh = bow_matrix.toarray()

# (rows, columns) of the resulting feature matrix
X_fresh.shape

(100, 1420)

### Prediction (via Sentiment Classifier = c2_Classifier_Sentiment_Model)

In [39]:
import joblib

# Path of the serialized sentiment classifier
clfFile = 'c2_Classifier_Sentiment_Model'
classifier = joblib.load(clfFile)

In [40]:
# Run the loaded classifier on the bag-of-words features of the fresh reviews
y_pred = classifier.predict(X_fresh)
# Print the raw predicted labels
# (presumably 0 = negative, 1 = positive -- confirm against the training notebook)
print(y_pred)

[0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0
 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0]


In [45]:
# Attach the predictions to the reviews as a new `predicted_label` column
df = df.assign(predicted_label=y_pred.tolist())

# Preview the reviews together with their predicted labels
df.head()

Unnamed: 0,Review,predicted_label
0,Spend your money elsewhere.,0
1,Their regular toasted bread was equally satisf...,1
2,The Buffet at Bellagio was far from what I ant...,1
3,"And the drinks are WEAK, people!",0
4,#NAME?,1


In [46]:
# Persist the reviews and their predicted labels, without the DataFrame index
predictions_csv = 'predicted_labels.csv'
df.to_csv(predictions_csv, index=False)