## Import libraries

In [4]:
import pandas as pd
import numpy as np

## Get the data

In [5]:
train_data = pd.read_csv("../Data/train.csv")[:100000]
test_data = pd.read_csv("../Data/test.csv")[:20000]
test_data.head()

Unnamed: 0,SUGGEST,TITLE,SENTENCE
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [6]:
X_train = train_data["SENTENCE"]
y_train = train_data["SUGGEST"]
X_test = test_data["SENTENCE"]
y_test = test_data["SUGGEST"]

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
label_encoder = LabelEncoder()

In [9]:
y_train_labeled = label_encoder.fit_transform(y_train)
y_test_labeled = label_encoder.fit_transform(y_test)

## Cleaning the data

In [10]:
from nltk.corpus import stopwords
import nltk # Natural Language Tool Kit
from bs4 import BeautifulSoup
import re

In [11]:
nltk.download("stopwords")
new_list = stopwords.words("english") # Download stopwords in English

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
stop_words = set(stopwords.words("english"))

In [13]:
def clean_sentence(sentence):
    if sentence != "":
        sentence = BeautifulSoup(sentence, "lxml").get_text()
        sentence = re.sub("[^a-zA-Z]", " ", sentence)
        sentence = sentence.lower()
        words_list = sentence.split()
        words_list_cleaned = [word for word in words_list if word not in stop_words]
        return (" ".join(words_list_cleaned))

## Clear all data

In [14]:
X_train_all = []
X_test_all = []

for i in range(len(X_train)):
    if (i + 1) % 1000 == 0:
        print(f"{i} sentence cleaned and loaded.")
    
    X_train_all.append(clean_sentence(X_train[i]))

for i in range(len(X_test)):
    if (i + 1) % 1000 == 0:
        print(f"{i} sentence cleaned and loaded.")
    
    X_test_all.append(clean_sentence(X_test[i]))

  sentence = BeautifulSoup(sentence, "lxml").get_text()


999 sentence cleaned and loaded.
1999 sentence cleaned and loaded.
2999 sentence cleaned and loaded.
3999 sentence cleaned and loaded.
4999 sentence cleaned and loaded.
5999 sentence cleaned and loaded.
6999 sentence cleaned and loaded.
7999 sentence cleaned and loaded.
8999 sentence cleaned and loaded.
9999 sentence cleaned and loaded.
10999 sentence cleaned and loaded.
11999 sentence cleaned and loaded.
12999 sentence cleaned and loaded.
13999 sentence cleaned and loaded.
14999 sentence cleaned and loaded.
15999 sentence cleaned and loaded.
16999 sentence cleaned and loaded.
17999 sentence cleaned and loaded.
18999 sentence cleaned and loaded.
19999 sentence cleaned and loaded.
20999 sentence cleaned and loaded.
21999 sentence cleaned and loaded.
22999 sentence cleaned and loaded.
23999 sentence cleaned and loaded.
24999 sentence cleaned and loaded.
25999 sentence cleaned and loaded.
26999 sentence cleaned and loaded.
27999 sentence cleaned and loaded.
28999 sentence cleaned and load

## Tokenize the words

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
vectorizer = CountVectorizer(max_features=2000)

In [17]:
X_train_vectorized = vectorizer.fit_transform(X_train_all)
X_test_vectorized = vectorizer.transform(X_test_all)

X_train_vectorized = X_train_vectorized.toarray()
X_test_vectorized = X_test_vectorized.toarray()

X_train_vectorized

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Get and train the models

### Logistic regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
logistic_regression_model = LogisticRegression(solver="lbfgs", max_iter=3000)
logistic_regression_model.fit(X_train_vectorized, y_train_labeled)

In [20]:
y_preds_log = logistic_regression_model.predict(X_test_vectorized)
y_preds_log

array([1, 1, 0, ..., 1, 1, 0])

In [21]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [22]:
acc_log = accuracy_score(y_test_labeled, y_preds_log)
acc_log_percent = acc_log * 100
print(f"Logistic Regression Model's accuracy is % {acc_log_percent:.2f}")

roc_log = roc_auc_score(y_test_labeled, y_preds_log)
roc_log_percent = roc_log * 100
print(f"Logistic Regression Model's roc_auc_score is % {roc_log_percent:.2f}")

Logistic Regression Model's accuracy is % 84.69
Logistic Regression Model's roc_auc_score is % 84.65


### Decision Tree Model

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
decision_tree_classifier = DecisionTreeClassifier()
decision_tree_classifier.fit(X_train_vectorized, y_train_labeled)

In [25]:
y_preds_dtc = decision_tree_classifier.predict(X_test_vectorized)
y_preds_dtc

array([1, 0, 1, ..., 1, 0, 1])

In [26]:
acc_dtc = accuracy_score(y_test_labeled, y_preds_dtc)
acc_dtc_percent = acc_dtc * 100
print(f"Decision Tree Classifier Model's accuracy is % {acc_dtc_percent:.2f}")

Decision Tree Classifier Model's accuracy is % 72.17


### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
random_forest_classifier = RandomForestClassifier(n_estimators=100)
random_forest_classifier.fit(X_train_vectorized, y_train_labeled)

In [29]:
y_preds_rf = random_forest_classifier.predict(X_test_vectorized)
y_preds_rf

array([1, 1, 0, ..., 1, 1, 1])

In [30]:
acc_rf = accuracy_score(y_test_labeled, y_preds_rf)
acc_rf_percent = acc_rf * 100
print(f"Random Forest Model's accuracy is % {acc_rf_percent:.2f}")

Random Forest Model's accuracy is % 82.14


### Gaussian Naive Bayes

In [31]:
from sklearn.naive_bayes import GaussianNB

In [32]:
gaussianNB = GaussianNB()
gaussianNB.fit(X_train_vectorized, y_train_labeled)

In [33]:
y_preds_gnb = gaussianNB.predict(X_test_vectorized)
y_preds_gnb

array([1, 1, 0, ..., 1, 0, 1])

In [34]:
acc_gnb = accuracy_score(y_test_labeled, y_preds_gnb)
acc_gnb_percent = acc_gnb * 100
print(f"Gaussian Naive Bayes Model's accuracy is % {acc_gnb_percent:.2f}")

Gaussian Naive Bayes Model's accuracy is % 77.28


### Bernoulli Naive Bayes

In [35]:
from sklearn.naive_bayes import BernoulliNB

In [36]:
bernoulliNB = BernoulliNB()
bernoulliNB.fit(X_train_vectorized, y_train_labeled)

In [37]:
y_preds_bnb = bernoulliNB.predict(X_test_vectorized)
y_preds_bnb

array([1, 1, 0, ..., 1, 1, 1])

In [38]:
acc_bnb = accuracy_score(y_test_labeled, y_preds_bnb)
acc_bnb_percent = acc_bnb * 100
print(f"Bernoulli Naive Bayes Model's accuracy is % {acc_bnb_percent:.2f}")

Bernoulli Naive Bayes Model's accuracy is % 81.64


## Best model to choose is Logistic Regression Model with % 84.7 accuracy

## Give manual data

In [47]:
reviews = ["It is the best thing I have ever seen.", "It made me disappointed.", "Thank you so much for everything", "I will not buy anything again from this company.", "It works really well", "Do not buy it", "I do not suggest it", "It is worthless"]
reviews_cleaned = []

for review in reviews:
    reviews_cleaned.append(clean_sentence(review))

reviews_cleaned

['best thing ever seen',
 'made disappointed',
 'thank much everything',
 'buy anything company',
 'works really well',
 'buy',
 'suggest',
 'worthless']

In [48]:
reviews_cleaned_vectorized = vectorizer.transform(reviews_cleaned)
reviews_cleaned_vectorized

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 18 stored elements and shape (8, 2000)>

In [49]:
y_pred_manual = logistic_regression_model.predict(reviews_cleaned_vectorized)
y_pred_manual

array([1, 0, 1, 0, 1, 0, 0, 0])

In [50]:
suggestion_dict = {0: "Not Suggest", 1: "Suggest"}

for idx, review in enumerate(reviews):
    print(f"The review '{review}' expresses '{suggestion_dict[y_pred_manual[idx]]}'.")

The review 'It is the best thing I have ever seen.' expresses 'Suggest'.
The review 'It made me disappointed.' expresses 'Not Suggest'.
The review 'Thank you so much for everything' expresses 'Suggest'.
The review 'I will not buy anything again from this company.' expresses 'Not Suggest'.
The review 'It works really well' expresses 'Suggest'.
The review 'Do not buy it' expresses 'Not Suggest'.
The review 'I do not suggest it' expresses 'Not Suggest'.
The review 'It is worthless' expresses 'Not Suggest'.
