In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,classification_report

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import nltk

# Fetch the 'omw-1.4' NLTK data package into the local nltk_data directory;
# the call reports True once the package is present and up to date.
# NOTE(review): presumably needed by the WordNetLemmatizer imported in the
# first cell -- no lemmatization happens in the cells shown here, confirm.
nltk.download(info_or_id='omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/macbook/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Read the review dataset from the notebook's working directory and preview
# its first ten rows.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look
# like index columns written out by earlier to_csv calls -- confirm.
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,source,review_description,rating,review_date
0,0,Google Play,thread meta fantastic app staying connected cl...,5,2024-01-03 10:16:17
1,1,Google Play,nothing scammer trust,1,2024-01-01 13:36:11
2,2,Google Play,awesome,5,2023-12-17 22:40:19
3,3,Google Play,nice app,3,2023-12-14 01:49:26
4,4,Google Play,great alternative twitter x,5,2023-12-08 19:32:42
5,5,Google Play,soild,5,2023-12-08 19:14:14
6,6,Google Play,really good,5,2023-12-08 18:56:58
7,7,Google Play,excellent,5,2023-12-08 18:28:56
8,8,Google Play,social medium platform rife bot cat-fish type ...,1,2023-12-08 17:56:30
9,9,Google Play,freedom,1,2023-12-08 17:54:12


In [6]:
# Discard every row that has a missing value in any column; the surviving
# rows are re-bound to `data`.
data = data.dropna(axis=0, how='any')

In [7]:
# Collapse the 1-5 star ratings into three sentiment classes.
RATING_TO_SENTIMENT = {
    1: 'Negative', 2: 'Negative',
    3: 'Neutral',
    4: 'Positive', 5: 'Positive',
}

# Series.map turns every value that is not a key of the dict into NaN, so
# running this cell a second time would wipe the already-mapped labels.
# Only map while the column still holds at least one raw star rating.
if data['rating'].isin(list(RATING_TO_SENTIMENT)).any():
    # assign() builds a new frame instead of writing into `data` in place,
    # which avoids a chained-assignment write on the result of dropna().
    data = data.assign(rating=data['rating'].map(RATING_TO_SENTIMENT))
data

Unnamed: 0.1,Unnamed: 0,source,review_description,rating,review_date
0,0,Google Play,thread meta fantastic app staying connected cl...,Positive,2024-01-03 10:16:17
1,1,Google Play,nothing scammer trust,Negative,2024-01-01 13:36:11
2,2,Google Play,awesome,Positive,2023-12-17 22:40:19
3,3,Google Play,nice app,Neutral,2023-12-14 01:49:26
4,4,Google Play,great alternative twitter x,Positive,2023-12-08 19:32:42
...,...,...,...,...,...
45280,2195,App Store,many false information cant ignore cant hide r...,Negative,2023-12-13 19:25:04
45281,2196,App Store,app malfunction constantly wont let post anyth...,Negative,2023-12-13 03:01:16
45282,2197,App Store,thread messaging thread weaving fabric life,Positive,2023-11-29 13:49:52
45283,2198,App Store,white house need account,Positive,2023-11-27 18:51:49


In [8]:
# Features are the review texts; targets are the sentiment labels.
X, y = data['review_description'], data['rating']

# Hold out 25% of the rows for evaluation, with a fixed seed so the partition
# is repeatable.
# NOTE(review): the split is not stratified -- consider stratify=y if the
# sentiment classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=40,
)

# Show the shape of each of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((33963,), (11321,), (33963,), (11321,))

In [9]:
# Learn the label -> integer mapping from the training targets only, then
# apply that same mapping to the held-out targets.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [10]:
# Bag-of-words counts: the vocabulary is learned from the training texts
# only and then reused to encode the test texts.
cv = CountVectorizer()
X_train_count = cv.fit_transform(X_train)
X_test_count = cv.transform(X_test)

In [11]:
# TF-IDF over character 2- and 3-grams, fitted on the training texts only.
# NOTE(review): these are character n-grams, whereas the CountVectorizer cell
# uses its default analyzer, so the later "Count vs TF-IDF" comparison also
# changes the tokenisation, not just the weighting.
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
# Random forest on the count features.
# The forest is randomised (bootstrap samples, feature sub-sampling), so it is
# seeded here -- with the same value the train/test split uses -- to make the
# reported accuracy reproducible under Restart & Run All.
rf1 = RandomForestClassifier(random_state=40)
rf1.fit(X_train_count, y_train)

# score() returns the mean accuracy on the held-out set.
rf_cv = rf1.score(X_test_count, y_test)
print(f'Random Forest Classifier on Count Vectors: {rf_cv}')

Random Forest Classifier on Count Vectors: 0.7825280452256868


In [13]:
# Random forest on the TF-IDF features.
# The forest is randomised (bootstrap samples, feature sub-sampling), so it is
# seeded here -- with the same value the train/test split uses -- to make the
# reported accuracy reproducible under Restart & Run All.
rf2 = RandomForestClassifier(random_state=40)
rf2.fit(X_train_tfidf, y_train)

# score() returns the mean accuracy on the held-out set.
rf_tfidf = rf2.score(X_test_tfidf, y_test)
print(f'Random Forest Classifier on TF-IDF Vectors: {rf_tfidf}')

Random Forest Classifier on TF-IDF Vectors: 0.7836763536790036


In [14]:
# Multinomial naive Bayes on the count features; fit() hands back the fitted
# estimator, so construction and training are chained.
mnb1 = MultinomialNB().fit(X_train_count, y_train)

# Mean accuracy on the held-out set.
mnb_cv = mnb1.score(X_test_count, y_test)
print(f'Multinomial Naive Bayes Classifier on Count Vectors: {mnb_cv}')

Multinomial Naive Bayes Classifier on Count Vectors: 0.7923328327886229


In [15]:
# Multinomial naive Bayes on the TF-IDF features; fit() hands back the fitted
# estimator, so construction and training are chained.
mnb2 = MultinomialNB().fit(X_train_tfidf, y_train)

# Mean accuracy on the held-out set.
mnb_tfidf = mnb2.score(X_test_tfidf, y_test)
print(f'Multinomial Naive Bayes Classifier on TF-IDF Vectors: {mnb_tfidf}')

Multinomial Naive Bayes Classifier on TF-IDF Vectors: 0.7546153166681389


In [16]:
# Pair each experiment label with its held-out accuracy, then split the pairs
# back into the column-oriented dict the summary frame is built from.
labelled_scores = [
    ('RandomForestClassifier-CountVectors', rf_cv),
    ('RandomForestClassifier-TFIDFVectors', rf_tfidf),
    ('MultinomialNBClassifier-CountVectors', mnb_cv),
    ('MultinomialNBClassifier-TFIDFVectors', mnb_tfidf),
]
model = {
    'Model': [label for label, _ in labelled_scores],
    'Score': [score for _, score in labelled_scores],
}

# One row per model/feature combination.
model_df = pd.DataFrame(model)
model_df

Unnamed: 0,Model,Score
0,RandomForestClassifier-CountVectors,0.782528
1,RandomForestClassifier-TFIDFVectors,0.783676
2,MultinomialNBClassifier-CountVectors,0.792333
3,MultinomialNBClassifier-TFIDFVectors,0.754615
