In [1]:
import numpy as np
import pandas as pd

Description of the dataset to be used:
 

- Columns separated by \t (tab space)
- First column is about reviews of people
- In second column, 0 is for negative review and 1 is for positive review

In [2]:
df = pd.read_csv("Restaurant_Reviews.tsv",delimiter ="\t", quoting =3)


In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [15]:
df.shape

(1000, 2)

Hedefimiz, bu veri setinden bir model oluşturmak ve yeni inceleme metninin olumlu veya olumsuz olduğunu tahmin etmektir. 

### Preprocessing the data

In [4]:
import re # library to clean data

In [6]:
review = re.sub("[^a-zA-Z]"," ",df["Review"][0])

# The first parameter [^a-zA-Z] is the characters that are not removed.

In [7]:
review

'Wow    Loved this place '

In [14]:
review = re.sub("[^a-zA-Z]"," ",df["Review"][4])
print(review.lower())
print(review.split())

the selection on the menu was great and so were the prices 
['The', 'selection', 'on', 'the', 'menu', 'was', 'great', 'and', 'so', 'were', 'the', 'prices']


In [17]:
# Natural Language Tool Kit
import nltk
 
nltk.download('stopwords')
 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/neslihanyetik/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [18]:
# for Stemming propose
from nltk.stem.porter import PorterStemmer

In [19]:
ps = PorterStemmer()

In [20]:
 from nltk.corpus import stopwords # corpus: değerler 

In [21]:
# loop for stemming each word
# in string array at ith row   
# kelimenin gövdesini bulmuş oluyoruz.
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['t',
 'h',
 'e',
 ' ',
 'e',
 'l',
 'e',
 'c',
 'n',
 ' ',
 'n',
 ' ',
 'h',
 'e',
 ' ',
 'e',
 'n',
 'u',
 ' ',
 'w',
 ' ',
 'g',
 'r',
 'e',
 ' ',
 'n',
 ' ',
 ' ',
 'w',
 'e',
 'r',
 'e',
 ' ',
 'h',
 'e',
 ' ',
 'p',
 'r',
 'c',
 'e',
 ' ']

In [22]:
# rejoin all string array elements
# to create back into a string
review = ' '.join(review) 
review

't h e   e l e c n   n   h e   e n u   w   g r e   n     w e r e   h e   p r c e  '

In [24]:
# Initialize empty array
# to append clean text
corpus = []
 
# 1000 (reviews) rows to clean
for i in range(0, 1000):
     
    # column : "Review", row ith
    review = re.sub('[^a-zA-Z]', ' ', df['Review'][i])
     
    # convert all cases to lower cases
    review = review.lower()
     
    # split to array(default delimiter is " ")
    review = review.split()
     
    # creating PorterStemmer object to
    # take main stem of each word
    ps = PorterStemmer()
     
    # loop for stemming each word
    # in string array at ith row   
    review = [ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))]
                 
    # rejoin all string array elements
    # to create back into a string
    review = ' '.join(review) 
     
    # append each string to create
    # array of clean text
    corpus.append(review)

### Feature Engineering and Modeling

In [25]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer


In [26]:
# To extract max 1500 feature.
# "max_features" is attribute to
# experiment with to get better results
# "max_features" ile kelime frekansını ayarlıyoruz
cv = CountVectorizer(max_features = 1500)

In [29]:
# X contains corpus (dependent variable)
X = cv.fit_transform(corpus).toarray()
 
# y contains answers if review
# is positive or negative
y = df.iloc[:, 1].values

In [31]:
# Splitting the dataset into
# the Training set and Test set
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

 
# experiment with "test_size"
# to get better results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [32]:
# Fitting Random Forest Classification
# to the Training set
from sklearn.ensemble import RandomForestClassifier
 
# n_estimators can be said as number of
# trees, experiment with n_estimators
# to get better results
model = RandomForestClassifier(n_estimators = 501,
                            criterion = 'entropy')
                             
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=501)

In [33]:
# Predicting the Test set results
y_pred = model.predict(X_test)
 
y_pred

array([0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0])

In [34]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
 
cm = confusion_matrix(y_test, y_pred)
 
cm

array([[100,  18],
       [ 48,  84]])