**1. Load all Libraries**

In [129]:

#loading the unprocessed data
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
import re
import string
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn import model_selection, naive_bayes, svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import os
     

**2. Load dataset and generate train and test dataset**

In [130]:
df = pd.read_excel('Assignment 3.xlsx')
df['label'] = df['label'].map({'restaurant': 1, 'movie': 0 })
df_test=pd.concat([df.iloc[400:501],df.iloc[901:1000]])
df_train=pd.concat([df.iloc[0:400],df.iloc[500:900]])
     

**3. Create corpus, Lemmatize, remove stopwords and punctuations for train dataset**

In [131]:
corpus=[] #Create an empty list
for i in df_train['review']: #Use a for loop to tokenize each review in the collection
    corpus.append(nltk.word_tokenize(i)) #append each tokenized review as a list to the empty list

lemma=[] #create an empty list
lemmatizer = nltk.stem.WordNetLemmatizer() #intialize the WordLemmatizer
for token in corpus: #For loop to access the outer list
    lemmatized_token = [lemmatizer.lemmatize(item.lower()) for item in token]
#lemmatize each word in the inner list after converting them to lower case
    lemma.append(lemmatized_token) #Append each lemmatized list of words to the empty list
  
from nltk.corpus import stopwords #import stopwords from nltk.corpus
final=[] #create an empty list
for word in lemma:
    stop_words_removed = [i for i in word if i not in stopwords.words('english') if i.isalnum()] # filter out stopwords and retain only alphanumeric characters
    final.append(stop_words_removed) #append the final list of lemmatized, stopwords removed , alpha numeric words of a given review to the empty list

tfidf_list=[] 
for i in final:
    aud=" ".join(i)
    tfidf_list.append(aud) 



**4. Generate TFIDF Text representation with mindf=5 and including bigrams for the train dataset**

In [223]:
df_train['cleaned_review']=tfidf_list
labels=list(df_train['label'])

In [224]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5) #include one and two grams and specify minimum document frequency =3
vectorizer.fit(tfidf_list) # apply fit method on the list of reviews
vectorizer_output = vectorizer.transform(tfidf_list)#apply transform method to generate TD-IDF vales for each of the unique words for each document
print(vectorizer_output.toarray()) # convert the output to a 2D array
print(vectorizer.vocabulary_) # prints the vocabulary considered for computing tf-idf values
print(vectorizer_output.toarray().shape)# prints shape of the array

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.16308083 0.07648077 0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
{'shop': 4157, 'restaurant': 3861, 'soi': 4279, 'road': 3917, 'inside': 2338, 'hotel': 2226, 'decorated': 1128, 'style': 4468, 'white': 5116, 'wall': 5039, 'glass': 1930, 'table': 4555, 'red': 3763, 'chair': 725, 'floor': 1759, 'dim': 1224, 'light': 2593, 'open': 3128, 'night': 3046, 'romantic': 3931, 'atmosphere': 362, 'duck': 1328, '39': 46, 'orange': 3140, 'pork': 3382, 'french': 1830, 'onion': 3126, 'soup': 4302, 'average': 394, 'price': 3440, 'baht': 417, 'food': 1773, 'review': 3888, 'breast': 598, 'sliced': 4237, 'piece': 3318, 'tender': 4627,

**5. Modeling**

**a. Naive Bayes Classifier**

In [225]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
classifier_NB = MultinomialNB()
classifier_NB.fit(vectorizer_output, labels)
X_test=list(df_test['review'])
new_X = vectorizer.transform(X_test)
predictions_NB = classifier_NB.predict(new_X)
print("Accuracy of Naive Bayes model is: ",accuracy_score(list(df_test['label']),predictions_NB))
print("ROC of Naive Bayes model is: ",roc_auc_score(list(df_test['label']),predictions_NB))

Accuracy of Naive Bayes model is:  0.985
ROC of Naive Bayes model is:  0.985


**b. Logit classification**

In [226]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
classifier_logit = LogisticRegression()
classifier_logit.fit(vectorizer_output, labels)
X_test=list(df_test['review'])
new_X = vectorizer.transform(X_test)
predictions_logit = classifier_logit.predict(new_X)
print("Accuracy of logit model is: ",accuracy_score(list(df_test['label']),predictions_logit))
print("ROC of logit model is: ",roc_auc_score(list(df_test['label']),predictions_logit))

Accuracy of logit model is:  0.99
ROC of logit model is:  0.99


**c. Random Forest Model**

In [227]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
classifier_RF = RandomForestClassifier(n_estimators=50)
classifier_RF.fit(vectorizer_output, labels)
X_test=list(df_test['review'])
new_X = vectorizer.transform(X_test)
predictions_RF = classifier_RF.predict(new_X)
print("Accuracy of Random forest is: ",accuracy_score(list(df_test['label']),predictions_RF))
print("ROC of Random Forest is: ",roc_auc_score(list(df_test['label']),predictions_RF))


Accuracy of Random forest is:  0.995
ROC of Random Forest is:  0.995


**d. Support vector Machine**

In [228]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
classifier_SVC = SVC()
classifier_SVC.fit(vectorizer_output, labels)
X_test=list(df_test['review'])
new_X = vectorizer.transform(X_test)
predictions_SVC = classifier_SVC.predict(new_X)
print("Accuracy of SVC is: ",accuracy_score(list(df_test['label']),predictions_SVC))
print("ROC of SVC is: ",roc_auc_score(list(df_test['label']),predictions_SVC))

Accuracy of SVC is:  0.99
ROC of SVC is:  0.99


**e. ANN model wiht one hidden layer**

In [231]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Create the ANN model
model = Sequential()
model.add(Dense(4, input_dim=vectorizer_output.shape[1], activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile and train the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
df3=pd.DataFrame(vectorizer_output.toarray())
model.fit(df3, df_train['label'], epochs=10, batch_size=32)

X_test=list(df_test['review'])
new_X = vectorizer.transform(X_test)
df4=pd.DataFrame(new_X.toarray())

print(model.evaluate(df4,df_test['label']))

predictions=model.predict(df4)
y_pred=[]
for i in predictions:
  if i>0.5:
    y_pred.append(1)
  else:
    y_pred.append(0)

print("Accuracy of ANN is: ",accuracy_score(list(df_test['label']),y_pred))
print("ROC of ANN is: ",roc_auc_score(list(df_test['label']),y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0.3595365285873413, 0.9049999713897705]
Accuracy of ANN is:  0.905
ROC of ANN is:  0.905


**6. Index encoding with max_length of 100 padding**



In [235]:
# Import necessary modules
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

# Define the maximum length for the input sequences
MAX_LENGTH = 100

# Create a tokenizer to map words to integers
tokenizer = Tokenizer()

# Fit the tokenizer on the documents
tokenizer.fit_on_texts(tfidf_list)

# Encode the documents using the tokenizer
encoded_docs = tokenizer.texts_to_sequences(tfidf_list)

# Pad the encoded documents to the maxmimum length
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_LENGTH, padding='post')


**7. Deep learning model with sequential structure**

In [261]:
# Import required libraries
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense

padded=pd.DataFrame(padded_docs)

# Define model architecture
model = tf.keras.Sequential([
    Embedding(input_dim=np.amax(padded_docs)+1,
              output_dim=20),
    LSTM(40, dropout=0.2, recurrent_dropout=0.2),
    Dropout(0.1),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(padded, df_train['label'], batch_size=100, epochs=10)

print(model.evaluate(df4,df_test['label']))
predictions_LSTM=model.predict(df4)
y_pred=[]
for i in predictions_LSTM:
  if i>0.5:
    y_pred.append(1)
  else:
    y_pred.append(0)

print("Accuracy of Sequential model is: ",accuracy_score(list(df_test['label']),y_pred))
print("ROC of Sequential model is: ",roc_auc_score(list(df_test['label']),y_pred))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[7.624619007110596, 0.5]
Accuracy of Sequential model is:  0.5
ROC of Sequential model is:  0.5
