In [None]:
# Standard library
import ast
import re
import warnings
from collections import Counter

# Data handling / general third-party
import joblib
import nltk
import numpy as np
import pandas as pd
from googletrans import Translator
from langdetect import detect
from wordcloud import WordCloud

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text Processing
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Imbalanced-learn (aliased so it does not shadow sklearn's make_pipeline)
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# Set warnings and plotting
warnings.filterwarnings("ignore")
%matplotlib inline


Dataset Initialization

In [None]:
# Load the raw CSV. `df` is bound to the same DataFrame object as `dataset`
# (no copy is made here).
dataset = pd.read_csv('rm_dataset.csv')
df = dataset

In [None]:
# Display the freshly loaded frame.
df

In [None]:
# Number of non-null entries in each column.
df.notnull().sum()

In [None]:
# How many rows carry each distinct rating value.
df['rating'].value_counts()

Data Preprocessing

In [None]:
# Manual feature selection: discard the 'name' column and rebind `df`
# to the resulting new frame.
df = df.drop(columns='name')
df

In [None]:
#Custom Binning
def custom_binning(rating):
    """Map a numeric rating onto one of three sentiment labels.

    Returns 'bad' below 2.9, 'neutral' from 2.9 up to (not including) 3.9,
    and 'positive' from 3.9 up to and including 5.0. Anything that matches
    none of these ranges (a value above 5.0, or NaN, for which every
    comparison is False) yields None.
    """
    if rating < 2.9:
        return 'bad'
    if rating < 3.9:
        return 'neutral'
    if rating <= 5.0:
        return 'positive'
    return None

# Derive the sentiment label of every row from its numeric rating.
df['category'] = df['rating'].apply(custom_binning)
df

In [None]:
#Data Cleaning
def process_text(text):
    """Normalize one raw review string for tokenization.

    Steps, in order: lowercase, drop @mentions, blank out URLs and digit
    runs, blank out the literal ``&quot;`` entity, drop stand-alone ASCII
    letters, replace punctuation with spaces, shorten every run of a
    repeated character to exactly two, and collapse whitespace runs to a
    single space. The result is not stripped, so it may start or end with
    one space.

    Parameters
    ----------
    text : str or missing value
        None and float NaN (how pandas represents an empty cell) are
        treated as "no text". Any other non-string is converted with
        ``str()`` first.

    Returns
    -------
    str
        The cleaned text, or "" for a missing value.
    """
    # A missing cell has nothing to clean; calling .lower() on it would raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # lowercase everything
    # Raw strings throughout: '\s' and '\.' in a normal literal are invalid
    # escape sequences and trigger warnings on recent Python versions.
    text = re.sub(r'@[^\s]+', '', text)  # remove @mentions
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # remove URLs
    text = re.sub(r"\d+", " ", text)  # remove digits
    text = re.sub('&quot;', " ", text)  # remove the &quot; HTML entity
    text = re.sub(r"\b[a-zA-Z]\b", "", text)  # remove single-letter words
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    text = re.sub(r'(.)\1+', r'\1\1', text)  # cap repeated characters at two
    text = re.sub(r"\s+", " ", text)  # collapse whitespace
    return text

# Overwrite the 'desc' column with its cleaned form.
df['desc'] = df['desc'].apply(process_text)
df

Data Transformation

In [None]:
#Tokenization
def tokenize_text(kalimat):
    """Split one sentence into word tokens using NLTK's word tokenizer."""
    return word_tokenize(kalimat)

# One token list per cleaned description.
df['token'] = df['desc'].apply(tokenize_text)
df

In [None]:
# Inspect the token lists.
df['token']

In [None]:
#Token Translation
# Single googletrans Translator instance, used by translate_tokens below.
translator = Translator()
# Memo of token -> translated token, shared across rows so that each distinct
# token goes through language detection and translation at most once.
_translation_cache = {}


def translate_tokens(tokens):
    """Translate every token not detected as Indonesian into Indonesian.

    Each token is language-detected on its own. Tokens detected as 'id' are
    kept as they are; all others are sent to the module-level ``translator``.
    This is best effort: if detection or translation raises, the original
    token is kept.

    Parameters
    ----------
    tokens : iterable of str

    Returns
    -------
    list of str
        One output token per input token, in the same order.
    """
    # NOTE(review): langdetect is reported to be non-deterministic on very
    # short inputs unless DetectorFactory.seed is set - confirm, and seed it
    # if reproducible output matters.
    translated_tokens = []
    for token in tokens:
        if token in _translation_cache:
            translated_tokens.append(_translation_cache[token])
            continue
        try:
            lang = detect(token)
            if lang != 'id':  # not Indonesian -> translate
                translated_token = translator.translate(token, src=lang, dest='id').text
            else:
                translated_token = token
        except Exception:
            # Keep the original token on any failure. `Exception` (instead of
            # a bare except) lets Ctrl-C still interrupt a long run. Failures
            # are not cached, so a transient error can succeed on a later row.
            translated_token = token
        else:
            _translation_cache[token] = translated_token
        translated_tokens.append(translated_token)
    return translated_tokens

# Translate row by row. Each token is handled individually by translate_tokens.
df['translated_token'] = df['token'].apply(translate_tokens)
df.head()



In [None]:
# Replace the in-memory frame with the contents of 'DM_2.csv'; `df` is again
# the same object as `dataset`.
# NOTE(review): no cell in this notebook writes DM_2.csv - presumably it is an
# export of the translated frame from an earlier run; confirm and add the
# export step so the notebook runs from a fresh kernel.
dataset = pd.read_csv('DM_2.csv')
df = dataset

In [None]:
# Peek at the reloaded frame.
df.head()

In [None]:
def process_and_tokenize_text(text):
    """Clean one text value and split it into word tokens.

    Cleaning steps, in order: lowercase, drop @mentions, blank out URLs and
    digit runs, blank out the literal ``&quot;`` entity, drop stand-alone
    ASCII letters, replace punctuation with spaces, shorten every run of a
    repeated character to exactly two, and collapse whitespace. The cleaned
    text is then tokenized with NLTK's word tokenizer.

    Parameters
    ----------
    text : str or missing value
        None and float NaN (an empty CSV cell as read by pandas) are treated
        as "no text". Any other non-string is converted with ``str()`` first.

    Returns
    -------
    list of str
        The tokens, or an empty list for a missing value.
    """
    # A missing cell has nothing to tokenize; calling .lower() on it would raise.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Data cleaning. Raw strings throughout: '\s' and '\.' in a normal
    # literal are invalid escape sequences.
    text = str(text).lower()  # lowercase everything
    text = re.sub(r'@[^\s]+', '', text)  # remove @mentions
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # remove URLs
    text = re.sub(r"\d+", " ", text)  # remove digits
    text = re.sub('&quot;', " ", text)  # remove the &quot; HTML entity
    text = re.sub(r"\b[a-zA-Z]\b", "", text)  # remove single-letter words
    text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    text = re.sub(r'(.)\1+', r'\1\1', text)  # cap repeated characters at two
    text = re.sub(r"\s+", " ", text)  # collapse whitespace

    # Tokenization
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens

# Clean and re-tokenize 'translated_token', overwriting the column. After this
# cell the column holds token lists, so re-running it would feed lists back
# into the text cleaner.
# NOTE(review): the column comes from a CSV, so each cell is presumably the
# string form of a token list - confirm; if so, ast.literal_eval would
# recover the list directly.
df['translated_token'] = df['translated_token'].apply(process_and_tokenize_text)
df

In [None]:
# Stopword removal: start from the list supplied by Sastrawi's
# StopWordRemoverFactory and append a few extra tokens.
factory = StopWordRemoverFactory()
additional = ['yg', 'mo', 'woi', 'nih', 'sih']
stopwords = [*factory.get_stop_words(), *additional]
print(stopwords)

In [None]:
def stopword_removal(tokens):
    """Return the tokens absent from the module-level ``stopwords`` list, order kept."""
    return [token for token in tokens if token not in stopwords]

# Filter each row's translated tokens.
df['stop'] = df['translated_token'].apply(stopword_removal)
df

In [None]:
# Stemming with a stemmer built by Sastrawi's StemmerFactory.
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()

def stemming_text(tokens):
    """Stem each token with the module-level ``stemmer``; returns a new list."""
    return list(map(stemmer.stem, tokens))

# Stem each row's stopword-filtered tokens.
df['stem'] = df['stop'].apply(stemming_text)
df

In [None]:
#Remove duplicate tokens
def remove_duplicates(tokens):
    """Return the tokens with repeats dropped, keeping each first occurrence in order."""
    seen = set()
    unique_tokens = []
    for token in tokens:
        if token not in seen:
            seen.add(token)
            unique_tokens.append(token)
    return unique_tokens

# Keep each stem once per row. This discards within-document term counts
# before the vectorizers below see the text.
df['clean_token'] = df['stem'].apply(remove_duplicates)
df.head()

In [None]:
#Mengubah token jadi teks
def convert(token_str):
    """Join a token list into one space-separated string.

    Parameters
    ----------
    token_str : str or iterable of str
        Either the tokens themselves, or the Python-literal string form of a
        token list (e.g. ``"['makan', 'enak']"``), which is parsed with
        ``ast.literal_eval``.

    Returns
    -------
    str
        The joined text. On any parsing or joining error the problem is
        printed and "" is returned.
    """
    try:
        if isinstance(token_str, str):
            # literal_eval understands both quote styles. Rewriting ' as "
            # first would corrupt any token that contains an apostrophe or a
            # double quote, turning the whole row into "".
            token_list = ast.literal_eval(token_str)
            return ' '.join(token_list)
        return ' '.join(token_str)
    except Exception as e:
        print(f"Error converting: {token_str}\nException: {e}")
        return ""


# Turn each row's token list back into one string for the vectorizers.
df['clean_text'] = df['clean_token'].apply(convert)

In [None]:
# Check the preprocessed columns.
df.head()

Data Mining

In [None]:
# The three classifiers compared below. random_state is fixed on the two
# estimators constructed with one (SVC and LogisticRegression).
svm = SVC(kernel='linear', C=1, random_state=42)
logistic = LogisticRegression(random_state=42, max_iter=1000)
naive_bayes = MultinomialNB()
def evaluate_model(clf, X, y, splits, le):
    """Cross-validate ``clf`` over pre-computed folds and print averaged metrics.

    For every ``(train_index, test_index)`` pair the classifier is fitted on
    the training rows and scored on the test rows. Accuracy, weighted
    precision/recall/F1 and per-class precision/recall/F1 are collected per
    fold. Their plain means across folds are printed.

    Parameters
    ----------
    clf : object with ``fit(X, y)`` and ``predict(X)``
        Fitted in place on each fold.
    X, y : objects indexable by the index arrays in ``splits``
    splits : iterable of (train_index, test_index) pairs
    le : object with a ``classes_`` sequence
        Supplies the class names. Position ``i`` in ``classes_`` is treated
        as encoded label ``i``.

    Returns
    -------
    None
        All results are printed.
    """
    class_names = le.classes_
    encoded_labels = range(len(class_names))

    # Per-class score lists, one entry appended per fold.
    label_metrics = {
        name: {'precision': [], 'recall': [], 'f1_score': []}
        for name in class_names
    }
    accuracies, weighted_precisions, weighted_recalls, weighted_f1_scores = [], [], [], []

    for train_index, test_index in splits:
        # Fit on this fold's training rows, predict its held-out rows.
        clf.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_pred = clf.predict(X[test_index])

        accuracies.append(accuracy_score(y_test, y_pred))

        # Support-weighted scores for the fold (4th element, support, unused).
        fold_weighted = precision_recall_fscore_support(y_test, y_pred, average='weighted')
        weighted_precisions.append(fold_weighted[0])
        weighted_recalls.append(fold_weighted[1])
        weighted_f1_scores.append(fold_weighted[2])

        # One score per class; `labels` fixes the order to match class_names.
        fold_per_label = precision_recall_fscore_support(
            y_test, y_pred, labels=encoded_labels, average=None
        )
        for position, name in enumerate(class_names):
            bucket = label_metrics[name]
            bucket['precision'].append(fold_per_label[0][position])
            bucket['recall'].append(fold_per_label[1][position])
            bucket['f1_score'].append(fold_per_label[2][position])

    # Per-class averages across folds.
    for label in class_names:
        avg_precision, avg_recall, avg_f1 = (
            np.mean(label_metrics[label][key])
            for key in ('precision', 'recall', 'f1_score')
        )
        print(f"{label} - Average Precision: {avg_precision:.3f}")
        print(f"{label} - Average Recall: {avg_recall:.3f}")
        print(f"{label} - Average F1-Score: {avg_f1:.3f}")
        print()  # Blank line for readability

    # Overall averages across folds.
    overall_weighted_precision = np.mean(weighted_precisions)
    overall_weighted_recall = np.mean(weighted_recalls)
    overall_weighted_f1_score = np.mean(weighted_f1_scores)
    overall_average_accuracy = np.mean(accuracies)

    print(f"Overall Weighted Precision: {overall_weighted_precision:.3f}")
    print(f"Overall Weighted Recall: {overall_weighted_recall:.3f}")
    print(f"Overall Weighted F1-Score: {overall_weighted_f1_score:.3f}")
    print(f"Overall Average Accuracy: {overall_average_accuracy:.3f}")

TF-IDF

In [None]:
# Separate features and target variable
X = df['clean_text']
y = df['category']

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

# Encode labels for stratification
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified K-Fold on the ORIGINAL rows. Folds must be drawn before any
# oversampling: RandomOverSampler duplicates minority rows, and splitting
# after duplication puts copies of one review in both the training and the
# test fold, which inflates every score.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
splits = list(skf.split(X_tfidf, y_encoded))

# Oversampled copy of the full data set. Kept for the class-distribution plot
# and for any later cell that still reads these names. Do NOT score on it.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_tfidf, y)
y_encoded_ros = le.transform(y_ros)
splits_ros = list(skf.split(X_ros, y_encoded_ros))

# Evaluate models with RandomOverSampler applied inside each training fold:
# the imbalanced-learn pipeline resamples during fit() only, so test folds
# contain original, un-duplicated rows.
print("\nEvaluating models with TF-IDF...")

# Evaluate SVM
print("Evaluating SVM...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), svm),
               X_tfidf, y_encoded, splits, le)

# Evaluate Logistic Regression
print("\nEvaluating Logistic Regression...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), logistic),
               X_tfidf, y_encoded, splits, le)

# Evaluate Naive Bayes
print("\nEvaluating Naive Bayes...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), naive_bayes),
               X_tfidf, y_encoded, splits, le)

In [None]:
# Count how often each encoded category occurs.
y_encoded_distribution = np.bincount(y_encoded)
y_encoded_distribution_df = pd.DataFrame(
    {'Category': le.classes_, 'Frequency': y_encoded_distribution}
)

# Show the distribution as a table.
print("Persebaran y_encoded:")
print(y_encoded_distribution_df)

# Bar chart of the same counts.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    y_encoded_distribution_df['Category'],
    y_encoded_distribution_df['Frequency'],
    color='skyblue',
)
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Categories in y_encoded')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Count how often each encoded category occurs after RandomOverSampler.
y_encoded_ros_distribution = np.bincount(y_encoded_ros)
y_encoded_ros_distribution_df = pd.DataFrame(
    {'Category': le.classes_, 'Frequency': y_encoded_ros_distribution}
)

# Show the distribution as a table.
print("\nPersebaran y_encoded_ros:")
print(y_encoded_ros_distribution_df)

# Bar chart of the same counts.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    y_encoded_ros_distribution_df['Category'],
    y_encoded_ros_distribution_df['Frequency'],
    color='lightgreen',
)
ax.set_xlabel('Category')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Categories in y_encoded_ros (After RandomOverSampler)')
plt.xticks(rotation=45, ha='right')
plt.show()

Bag-of-Words (BoW)

In [None]:
# Separate features and target variable
X = df['clean_text']
y = df['category']

# Convert text data to BoW features
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)

# Encode labels for stratification
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified K-Fold on the ORIGINAL rows. Folds must be drawn before any
# oversampling: RandomOverSampler duplicates minority rows, and splitting
# after duplication puts copies of one review in both the training and the
# test fold, which inflates every score.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
splits = list(skf.split(X_bow, y_encoded))

# Oversampled copy of the full data set. Kept for any later cell that still
# reads these names. Do NOT score on it.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_bow, y)
y_encoded_ros = le.transform(y_ros)
splits_ros = list(skf.split(X_ros, y_encoded_ros))

# Evaluate models with RandomOverSampler applied inside each training fold:
# the imbalanced-learn pipeline resamples during fit() only, so test folds
# contain original, un-duplicated rows.
print("\nEvaluating models with BoW...")

# Evaluate SVM
print("Evaluating SVM...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), svm),
               X_bow, y_encoded, splits, le)

# Evaluate Logistic Regression
print("\nEvaluating Logistic Regression...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), logistic),
               X_bow, y_encoded, splits, le)

# Evaluate Naive Bayes
print("\nEvaluating Naive Bayes...\n")
evaluate_model(make_imb_pipeline(RandomOverSampler(random_state=42), naive_bayes),
               X_bow, y_encoded, splits, le)