## Natural Language Processing


### Importing the libraries


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


### Importing the dataset


In [None]:
# The reviews file is tab-separated (TSV), so fields are split on '\t'.
# quoting=3 tells the parser to ignore quote characters (treat them as text).
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

### Cleaning the texts


In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import nltk
# Run nltk.download('stopwords') once beforehand to fetch the stop-word lists (the, a, an, I, this, ...).

row_count = len(dataset)  # number of reviews (1000 per the original note)

# The stemmer and the stop-word collection do not depend on the review being
# processed, so they are built once here instead of on every loop iteration.
ps = PorterStemmer()  # reduces words to their stem, e.g. loved -> love
all_stopwords = stopwords.words('english')
# Drop these entries from the stop-word list so they are not filtered out.
all_stopwords.remove('not')
all_stopwords.remove("isn't")
# NOTE(review): the regex below replaces apostrophes with spaces, so the token
# "isn't" can never reach the filter; removing it from the list has no effect
# on the output. Confirm whether contractions were meant to be preserved.
stopword_set = set(all_stopwords)  # set membership is O(1) per word

corpus = []
# Iterate over the column values directly rather than looking rows up by
# label, which only works when the frame has a default 0..n-1 index.
for raw_review in dataset['Review']:
    # Keep letters only, normalise case, and tokenise on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Stem every word that is not a stop word.
    stems = [ps.stem(word) for word in words if word not in stopword_set]
    corpus.append(' '.join(stems))


In [None]:
# Show only the first few cleaned reviews; printing the entire corpus floods
# the cell output without adding information.
print(corpus[:10])

### Creating the Bag of Words model


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model with the vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
# Dense feature matrix: one row per review in `corpus`.
X = term_counts.toarray()
# Target vector taken from the last column of the dataset.
y = dataset.iloc[:, -1].values

### Splitting the dataset into the Training set and Test set


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)


### Training the Logistic Regression model on the Training set


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier, seeded so repeated runs give the same model.
classifier = LogisticRegression(random_state=0)
# Learn the weights from the training split only.
classifier.fit(X=X_train, y=y_train)


### Predicting the Test set results


In [None]:
# Predict labels for the held-out reviews.
y_pred = classifier.predict(X_test)
# Print predictions next to the true labels, one row per test sample:
# column 0 = prediction, column 1 = reality.
print(np.column_stack((y_pred, y_test)))

### Making the Confusion Matrix


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of test reviews classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)
# accuracy = 0.77