## Import Libraries

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

import pickle
from sklearn.model_selection import RandomizedSearchCV

  from pandas.core import (


## Data Loading

In [2]:
data = pd.read_csv('data_final.csv')
data.head()

Unnamed: 0,Review,Rating
0,Beli Online memang paling aman di official store,bintang 5
1,"Barang masih di segel, tepat waktu, dan packin...",bintang 5
2,"mantab greget, penguriman aman,sampai tujuan s...",bintang 5
3,"pertama kali beli online, aman. packing rapi",bintang 5
4,"Barang bagus, berfungsi dengan baik, untuk pen...",bintang 4


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19540 entries, 0 to 19539
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  19540 non-null  object
 1   Rating  19540 non-null  object
dtypes: object(2)
memory usage: 305.4+ KB


## Feature Engineering

In [4]:
data_fe = data.copy()
data_fe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19540 entries, 0 to 19539
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  19540 non-null  object
 1   Rating  19540 non-null  object
dtypes: object(2)
memory usage: 305.4+ KB


### Data Grouping

In [10]:
data_fe['Rating'] = data_fe['Rating'].str.replace('bintang ', '').astype(int)

In [11]:
def categorize(val):
    if val >= 4:
        return 'Positive'
    else:
        return 'Negative'

data_fe['Rating'] = data_fe['Rating'].apply(categorize)

In [12]:
data_fe['Rating'].value_counts()

Rating
Positive    18850
Negative      690
Name: count, dtype: int64

### Data Splitting

In [13]:
X = data_fe['Review']
y = data_fe['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                   random_state=104,  
                                   test_size=0.25,  
                                   shuffle=True) 

print('Total Data   : ', X.shape)
print('Total Labels : ', y.shape)
print('Train Size   : ', X_train.shape)
print('Test Size    : ', X_test.shape)

Total Data   :  (19540,)
Total Labels :  (19540,)
Train Size   :  (14655,)
Test Size    :  (4885,)


In [14]:
X_train.head()

7721     produk sesuai deskripsi, pengiriman aman, teri...
10100            Mantap..sesuai pesanan & pengiriman cepat
5765     barang bagus+new+ori, packing aman, warna bagu...
17131    hp poco x6 pro 5g sudah diterima sesuai dgn pe...
9778     Barang diterima dalam keadaan baik, segel masi...
Name: Review, dtype: object

In [15]:
y_train.head()

7721     Positive
10100    Positive
5765     Positive
17131    Positive
9778     Positive
Name: Rating, dtype: object

### Text PreProcessing

In [None]:
nltk.download('stopwords')

try:
    stop_words = stopwords.words('indonesian')
except:
    stop_words = [
        'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'untuk', 'dengan', 
        'pada', 'adalah', 'atau', 'seperti', 'oleh', 'akan', 'dalam', 'itu'
    ]

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    words = text.split()
    cleaned_text = ' '.join([word for word in words if word not in stop_words])
    return cleaned_text

X_train_cleaned = X_train.apply(clean_text)
X_test_cleaned = X_test.apply(clean_text)

vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_cleaned)
X_test_vectorized = vectorizer.transform(X_test_cleaned)

X_train_dense = X_train_vectorized.toarray()
X_test_dense = X_test_vectorized.toarray()

print('Train Data Shape:', X_train_dense.shape)
print('Test Data Shape:', X_test_dense.shape)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train Data Shape: (14655, 9797)
Test Data Shape: (4885, 9797)


## Model Definition

In [17]:
lg = LogisticRegression(random_state=12)
mnb = MultinomialNB()
svm = SVC(random_state=12)
dt = DecisionTreeClassifier(random_state=12)
rf = RandomForestClassifier(random_state=12)

### Model Training

In [18]:
lg.fit(X_train_dense, y_train)

In [19]:
mnb.fit(X_train_dense, y_train)

In [20]:
svm.fit(X_train_dense, y_train)

In [21]:
dt.fit(X_train_dense, y_train)

In [22]:
rf.fit(X_train_dense, y_train)

### Model Evaluation

In [23]:
# Logistic Regression
pres_train_cross_val_lg = cross_val_score(lg, 
                                          X_train_dense, 
                                          y_train, 
                                          cv=5, 
                                          scoring="f1_macro")

# Multinomial Naive Bayes
pres_train_cross_val_mnb = cross_val_score(mnb, 
                                           X_train_dense,
                                           y_train, 
                                           cv=5, 
                                           scoring="f1_macro")

# Support Vector Machine
pres_train_cross_val_svm = cross_val_score(svm, 
                                           X_train_dense,
                                           y_train, 
                                           cv=5, 
                                           scoring="f1_macro")

# Decision Tree Classifier
pres_train_cross_val_dt = cross_val_score(dt, 
                                          X_train_dense, 
                                          y_train, 
                                          cv=5, 
                                          scoring="f1_macro")

# Random Forest Classifier
pres_train_cross_val_rf = cross_val_score(rf, 
                                          X_train_dense, 
                                          y_train, 
                                          cv=5, 
                                          scoring="f1_macro")

In [24]:
y_pred_train_lg = lg.predict(X_train_dense)
y_pred_test_lg = lg.predict(X_test_dense)
y_pred_train_mnb = mnb.predict(X_train_dense)
y_pred_test_mnb = mnb.predict(X_test_dense)
y_pred_train_svm = svm.predict(X_train_dense)
y_pred_test_svm = svm.predict(X_test_dense)
y_pred_train_dt = dt.predict(X_train_dense)
y_pred_test_dt = dt.predict(X_test_dense)
y_pred_train_rf = rf.predict(X_train_dense)
y_pred_test_rf = rf.predict(X_test_dense)

In [25]:
all_reports = {}

def performance_report(all_reports, pres_train_cross_val, y_train_resampled, y_pred_train, y_test, y_pred_test, name):
    score_reports = {
        'train - F1': f1_score(y_train_resampled, y_pred_train, average='macro'),  # or 'micro' or 'weighted'
        'train - crossval_F1': pres_train_cross_val.mean(),
        'test - F1': f1_score(y_test, y_pred_test, average='macro'),  # or 'micro' or 'weighted'
    }
    all_reports[name] = score_reports
    return all_reports

# Logistic Regression
all_reports = performance_report(all_reports, pres_train_cross_val_lg, y_train, y_pred_train_lg, y_test, y_pred_test_lg, 'Logistic Regression - Baseline (Default Hyperparameter)')

# MultinomialNB
all_reports = performance_report(all_reports, pres_train_cross_val_mnb, y_train, y_pred_train_mnb, y_test, y_pred_test_mnb, 'MultinomialNB - Baseline (Default Hyperparameter)')

# SVM
all_reports = performance_report(all_reports, pres_train_cross_val_svm, y_train, y_pred_train_svm, y_test, y_pred_test_svm, 'SVM - Baseline (Default Hyperparameter)')

# Decision Tree
all_reports = performance_report(all_reports, pres_train_cross_val_dt, y_train, y_pred_train_dt, y_test, y_pred_test_dt, 'D Tree - Baseline (Default Hyperparameter)')

# Random Forest
all_reports = performance_report(all_reports, pres_train_cross_val_rf, y_train, y_pred_train_rf, y_test, y_pred_test_rf, 'R Forest - Baseline (Default Hyperparameter)')

# Display the reports
pd.DataFrame(all_reports)


Unnamed: 0,Logistic Regression - Baseline (Default Hyperparameter),MultinomialNB - Baseline (Default Hyperparameter),SVM - Baseline (Default Hyperparameter),D Tree - Baseline (Default Hyperparameter),R Forest - Baseline (Default Hyperparameter)
train - F1,0.570839,0.50649,0.864944,0.99494,0.994892
train - crossval_F1,0.515811,0.491022,0.517539,0.630456,0.542924
test - F1,0.522955,0.490721,0.507162,0.621345,0.517114


### Model Saving

In [None]:
with open('dt_model.pkl', 'wb') as f:
    pickle.dump(dt, f)