# SVM, NB, LR vs.

In [2]:
import pandas as pd
import re
import emoji

Setting dataset

In [None]:
# For Classic ML Models
df_processed = pd.read_csv("../data/processed/restaurant_reviews_processed.csv")
df_ml = df_processed[['text', 'sentiment']].copy()

# Turkish stopwords list
stop_words_tr = [
    'acaba', 'ama', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu',
    'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi',
    'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki',
    'kim', 'mı', 'mi', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede',
    'nereye', 'niçin', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya',
    'ya', 'yani', 'bir', 'içinde', 'altı', 'önünde', 'arasında', 'üzerinde',
    'göre', 'kadar', 'sonra', 'önce', 'olarak', 'kendi', 'ancak', 'madem',
    'halbuki', 'zira', 'yine', 'falan', 'filan', 'herkes', 'hiçkimse', 'bütün',
    'çoğu', 'fazla', 'bile', 'dahi', 'üstelik', 'yalnız', 'değil',
    'onlar', 'ben', 'sen', 'kendisi', 'birkaçı', 'bazılarını', 'bazılarının',
    'kendisini', 'kendisiyle', 'kendisinin', 'karşı', 'karşısında', 'dışında',
    'altında', 'arkasında', 'yanında', 'olan', 'olanlar', 'olanların', 'olanlarla',
    'şunlar', 'şunun', 'şuna', 'şunu', 'şeyler', 'şeylerin', 'şeylerle', 'şeylerdir',
    'kitap', 'sayfa', 'bölüm', 'yazar', 'yazarı', 'roman', 'öykü', 'hikaye',
    'eser', 'konu', 'anlatım', 'tür', 'seri', 'yeni', 'ilk', 'son', 'bölümü',
    'karakter', 'okuyucu', 'okuma', 'büyük', 'küçük', 'dünya', 'yaşanmış',
    'özel', 'üzerine', 'den', 'tarafından'
]

def clean_text_ml(text):
    text = str(text).lower()                           # lowercase
    text = emoji.replace_emoji(text, replace='')       # remove emoji
    text = re.sub(r'[^\w\s]', '', text)               # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()         # remove extra spaces
    text = ' '.join([w for w in text.split() if w not in stop_words_tr])  # stopwords removal
    return text

df_ml['text'] = df_ml['text'].apply(clean_text_ml)

df_ml.to_csv("../data/final/restaurant_reviews_ml.csv", index=False, encoding="utf-8-sig")

# LSTM ve embedding tabanlı modeller için 
df_lstm = df_processed[['text', 'sentiment']].copy()

def clean_text_lstm(text):
    text = str(text).lower()                           # lowercase  
    text = emoji.replace_emoji(text, replace='')       # remove emoji
    text = re.sub(r'\s+', ' ', text).strip()          # remove extra spaces
    return text

df_lstm['text'] = df_lstm['text'].apply(clean_text_lstm)

df_lstm.to_csv("../data/final/restaurant_reviews_lstm.csv", index=False, encoding="utf-8-sig")

print("Cleaned data for ML and LSTM saved!")

Cleaned data for ML and LSTM saved!


# Logistic Regression Notebook (TF-IDF)

In [7]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

df = pd.read_csv("../data/final/restaurant_reviews_ml.csv")  

df['text'] = df['text'].fillna("").astype(str)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,3)
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000, C=2.0)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)

# Performance Evaluation
classes = sorted(df['sentiment'].unique())
report = classification_report(
    y_test, y_pred,
    target_names=[str(c) for c in classes],
    output_dict=True
)

df_report = pd.DataFrame(report).iloc[:-1,:].T 
print(df_report)

# Confusion Matrix + Accuracy Calculation
cm = confusion_matrix(y_test, y_pred, labels=classes)
acc = np.trace(cm) / np.sum(cm)

disp = ConfusionMatrixDisplay(cm, display_labels=classes)
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(cmap=plt.cm.Blues, ax=ax)
plt.title(f"Logistic Regression Confusion Matrix\nOverall Accuracy: {acc:.2f}")
plt.savefig("../outputs/lr_confusion_matrix.png")
plt.close()

# Precision / Recall / F1 Bar Graph
df_report[['precision','recall','f1-score']].plot(kind='bar', figsize=(8,5))
plt.title("Logistic Regression - Precision / Recall / F1-score")
plt.ylim(0,1)
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.savefig("../outputs/lr_prf.png")
plt.close()

# Save FULL Classification Report (including accuracy row)
full_report = pd.DataFrame(report).T
full_report.to_csv("../outputs/lr_classification_report.csv",
                   index=True, encoding="utf-8-sig")

print("Full LR classification report saved to '../outputs/lr_classification_report.csv'")

              precision    recall  f1-score
0              0.911937  0.923372  0.917619
1              0.926162  0.915105  0.920601
accuracy       0.919137  0.919137  0.919137
macro avg      0.919049  0.919238  0.919110
weighted avg   0.919224  0.919137  0.919146
Full LR classification report saved to '../outputs/lr_classification_report.csv'


# SVM Notebook (TF-IDF + LinearSVC)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os

df = pd.read_csv("../data/final/restaurant_reviews_ml.csv") 
df['text'] = df['text'].fillna("").astype(str)
df = df[df['text'].str.strip() != ""].reset_index(drop=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,3)
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# SVM Model
svm_model = LinearSVC(C=1.0, max_iter=10000)
svm_model.fit(X_train_tfidf, y_train)
y_pred = svm_model.predict(X_test_tfidf)

# Full classification report 
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Accuracy
accuracy = report_dict["accuracy"]

# Convert to DataFrame
df_report_full = pd.DataFrame(report_dict).T

# Save classification report
os.makedirs("../outputs", exist_ok=True)
df_report_full.to_csv("../outputs/svm_classification_report.csv", index=True, encoding="utf-8-sig")
print("Full classification report saved to '../outputs/svm_classification_report.csv'")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["negative", "positive"])
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(cmap=plt.cm.Blues, ax=ax)
plt.title(f"SVM Confusion Matrix\nAccuracy: {accuracy:.2f}")
plt.savefig("../outputs/svm_confusion_matrix.png")
plt.close()

# Precision/Recall/F1 Bar Graph 
df_classes_only = df_report_full.iloc[:2][['precision','recall','f1-score']]
df_classes_only.plot(kind='bar', figsize=(8,5))
plt.title("SVM - Precision / Recall / F1-score")
plt.ylim(0,1)
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.savefig("../outputs/svm_prf.png")
plt.close()

Full classification report saved to '../outputs/svm_classification_report.csv'


# Random Forest Notebook

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import os

# Load dataset
df = pd.read_csv("../data/final/restaurant_reviews_ml.csv")
df['text'] = df['text'].fillna("").astype(str)
df = df[df['text'].str.strip() != ""].reset_index(drop=True)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment']
)

# TF-IDF vectorization (stopwords removed)
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,3)
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# RandomForest model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train_tfidf, y_train)
y_pred = rf_model.predict(X_test_tfidf)

# Full classification report
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Accuracy
accuracy = report_dict["accuracy"]

# Convert to DataFrame
df_report_full = pd.DataFrame(report_dict).T

# Save classification report
os.makedirs("../outputs", exist_ok=True)
df_report_full.to_csv("../outputs/rf_classification_report.csv", index=True, encoding="utf-8-sig")
print(f"Random Forest classification report saved with accuracy: {accuracy:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=["negative", "positive"])
fig, ax = plt.subplots(figsize=(6,6))
disp.plot(cmap=plt.cm.Blues, ax=ax)
plt.title(f"Random Forest - Confusion Matrix\nAccuracy: {accuracy:.2f}")
plt.savefig("../outputs/rf_confusion_matrix.png")
plt.close()

# Precision/Recall/F1 Bar Graph 
df_classes_only = df_report_full.iloc[:2][['precision','recall','f1-score']]
df_classes_only.plot(kind='bar', figsize=(8,5))
plt.title("Random Forest - Precision / Recall / F1-score")
plt.ylim(0,1)
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.savefig("../outputs/rf_prf.png")
plt.close()

Random Forest classification report saved with accuracy: 0.8931


# LSTM Notebook

In [14]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os

df = pd.read_csv("../data/final/restaurant_reviews_lstm.csv")
df['text'] = df['text'].fillna("").astype(str)

# Encode Labels
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])
y = np.array(y)

# Tokenize & Pad
max_words = 15000
max_len = 120

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=max_len, padding='post', truncating='post')

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# LSTM Model 
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=256))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.1)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

# Train
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_data=(X_test, y_test),
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluate
loss, accuracy = model.evaluate(X_test, y_test, batch_size=64)
print("Test accuracy:", accuracy)

# Predictions
y_pred_prob = model.predict(X_test, batch_size=64)
y_pred = (y_pred_prob.squeeze() > 0.5).astype(int)

# Classification Report
classes = le.classes_
report_dict = classification_report(y_test, y_pred, target_names=classes, output_dict=True)
df_report = pd.DataFrame(report_dict).T

os.makedirs("../outputs", exist_ok=True)
df_report.to_csv("../outputs/lstm_classification_report.csv", index=True, encoding="utf-8-sig")
print("✅ Full classification report saved to '../outputs/lstm_classification_report.csv'")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.title(f"LSTM - Confusion Matrix\nOverall Accuracy: {accuracy:.2f}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.savefig("../outputs/lstm_confusion_matrix.png")
plt.close()

# Precision/Recall/F1 Bar Plot
df_report[['precision','recall','f1-score']].iloc[:-3].plot(kind='bar', figsize=(8,5))
plt.title("LSTM - Precision / Recall / F1-score")
plt.ylim(0,1)
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.savefig("../outputs/lstm_prf.png")
plt.close()

Epoch 1/10
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 547ms/step - accuracy: 0.8411 - loss: 0.3393 - val_accuracy: 0.9214 - val_loss: 0.2045 - learning_rate: 0.0010
Epoch 2/10
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 550ms/step - accuracy: 0.9432 - loss: 0.1543 - val_accuracy: 0.9219 - val_loss: 0.2135 - learning_rate: 0.0010
Epoch 3/10
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 531ms/step - accuracy: 0.9596 - loss: 0.1082 - val_accuracy: 0.9169 - val_loss: 0.2350 - learning_rate: 0.0010
Epoch 4/10
[1m736/736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 537ms/step - accuracy: 0.9683 - loss: 0.0811 - val_accuracy: 0.9172 - val_loss: 0.2972 - learning_rate: 5.0000e-04
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 120ms/step - accuracy: 0.9226 - loss: 0.2031
Test accuracy: 0.9214304089546204
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 139ms/step
✅ Full class