In [1]:
!pip install -q numpy pandas scikit-learn matplotlib
!pip install -q sentence-transformers
!pip install -q umap-learn

In [2]:
import pandas as pd

# Path of the CSV file holding the raw dataset, relative to the working
# directory. The recorded run of this cell failed because the file was not
# found there.
DATA_PATH = 'data.csv'

# Read the dataset and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of rows per distinct 'Suggestion' value, ordered by the value itself.
df['Suggestion'].value_counts().sort_index()

In [None]:
def score_to_label(score):
    """Map a numeric score to a coarse sentiment label.

    Args:
        score: Numeric score; it is compared against the thresholds 40 and 60.

    Returns:
        "negative" when ``score < 40``, "positive" when ``score >= 60`` and
        "neutral" otherwise. A NaN score fails both comparisons and therefore
        also yields "neutral".
    """
    if score < 40:
        # Spelling corrected: this label used to be emitted as "negetive".
        return "negative"
    if score >= 60:
        return "positive"
    return "neutral"
# Attach a sentiment label to every row based on its 'Score'.
df['label'] = df['Score'].apply(score_to_label)

# Drop the neutral rows and renumber the remaining rows from 0.
df = df.loc[df['label'] != 'neutral'].reset_index(drop=True)

# Class balance after filtering.
df['label'].value_counts()

In [None]:
import re

# Simple emoji pattern: one or more consecutive characters taken from the
# Unicode ranges listed below. The pieces are adjacent string literals, which
# Python joins into a single character class.
# (They used to sit inside one triple-quoted string; that made the quote marks,
# the "u" prefixes and the text of the trailing comments part of the class, so
# ordinary letters such as "e", "m" or "s" were matched as if they were emoji.)
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

def normalize_arabic_chars(text: str) -> str:
    """Replace Arabic letter forms with their Persian counterparts.

    Arabic yeh (ي) and alef maksura (ى) both become Persian yeh (ی), and
    Arabic kaf (ك) becomes Persian kaf (ک). Every other character is kept.
    """
    arabic_to_persian = str.maketrans({'ي': 'ی', 'ى': 'ی', 'ك': 'ک'})
    return text.translate(arabic_to_persian)

def convert_persian_digits(text: str) -> str:
    """Convert Persian and Arabic-Indic digits to ASCII digits.

    Args:
        text: Input string.

    Returns:
        ``text`` with every Persian digit (U+06F0..U+06F9) and every
        Arabic-Indic digit (U+0660..U+0669) replaced by the matching ASCII
        digit. All other characters are returned unchanged.
    """
    # Both digit blocks are contiguous, so the table is built from the code
    # point of each block's zero. Arabic-Indic digits are included so that
    # the same number is not represented by two different digit sets after
    # normalization.
    trans_table = {
        zero + value: str(value)
        for zero in (0x06F0, 0x0660)
        for value in range(10)
    }
    return text.translate(trans_table)

def clean_persian_text(text: str) -> str:
    """Normalize a raw text and strip everything that is not a kept character.

    Steps, in order: letter normalization (normalize_arabic_chars), digit
    conversion (convert_persian_digits), removal of URLs, e-mail addresses,
    @mentions / #hashtags and emoji_pattern matches, replacement of every
    character outside the allowed set (digits, the listed letter ranges and
    whitespace) by a space, whitespace collapsing, and lower-casing.

    Args:
        text: Raw input; anything that is not a ``str`` yields "".

    Returns:
        The cleaned string with single spaces and no leading/trailing space.
    """
    if not isinstance(text, str):
        return ""

    cleaned = convert_persian_digits(normalize_arabic_chars(text))

    # URLs first, then e-mail addresses, then @mentions and #hashtags; each
    # match is replaced by a single space.
    for noise in (r'http\S+|www\.\S+', r'\S+@\S+', r'[@#]\S+'):
        cleaned = re.sub(noise, ' ', cleaned)

    # Emoji, as defined by the module-level emoji_pattern.
    cleaned = emoji_pattern.sub(' ', cleaned)

    # Anything that is not a digit, a letter from the listed ranges or whitespace.
    cleaned = re.sub(r'[^۰-۹0-9آ-یئءچژگۀ۱۲۳۴۵۶۷۸۹\s]', ' ', cleaned)

    # Collapse whitespace runs, trim the ends, then lower-case.
    return re.sub(r'\s+', ' ', cleaned).strip().lower()

# Clean every entry of the 'Text' column; astype(str) stringifies each cell
# before it reaches clean_persian_text.
df['clean_text'] = df['Text'].astype(str).apply(clean_persian_text)
# Show raw and cleaned text side by side for the first 10 rows.
df[['Text', 'clean_text']].head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

# TF-IDF over unigrams and bigrams, with document-frequency cut-offs
# min_df=5 and max_df=0.9.
tfidf_vectorixer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_df=0.9)

y = df['label']

# Stratified split of the cleaned texts with a 20% test share.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the vectorizer on the training texts only and reuse it for the test
# texts. From here on X_train / X_test hold the TF-IDF matrices, not the texts.
X_train = tfidf_vectorixer.fit_transform(X_train)
X_test = tfidf_vectorixer.transform(X_test)
X_train.shape


In [None]:
# Show which container type holds the TF-IDF training features.
type(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def accuracy(str, cl, X_test, y_test):
  """Print a full evaluation report for a fitted classifier.

  Args:
    str: Display name used in the report header. NOTE(review): the parameter
      name shadows the built-in ``str`` inside this function; it is kept
      because it is part of the existing signature.
    cl: Fitted estimator exposing ``predict``.
    X_test: Test features passed to ``cl.predict``.
    y_test: True labels the predictions are compared with.

  Returns:
    None. Accuracy, weighted precision / recall / F1, the confusion matrix
    and the classification report are printed.
  """
  predicted = cl.predict(X_test)

  # Scalar scores; precision, recall and F1 use the 'weighted' average.
  acc_value = accuracy_score(y_test, predicted)
  prec_value = precision_score(y_test, predicted, average='weighted')
  rec_value = recall_score(y_test, predicted, average='weighted')
  f1_value = f1_score(y_test, predicted, average='weighted')

  # Per-class breakdowns.
  matrix = confusion_matrix(y_test, predicted)
  full_report = classification_report(y_test, predicted)

  print(f"--------{str} scores-----")
  print("-------------------------")
  scalar_rows = (
      ("Accuracy :", acc_value),
      ("Precision:", prec_value),
      ("Recall   :", rec_value),
      ("F1-score :", f1_value),
  )
  for caption, value in scalar_rows:
    print(caption, value)
  print("\nConfusion Matrix:\n", matrix)
  print("\nClassification Report:\n", full_report)


In [None]:
#TF-IDF
# Fit three baseline classifiers on the TF-IDF features and print their
# evaluation reports.
cl = LogisticRegression(max_iter=3000)
cl.fit(X_train, y_train)
accuracy("LogisticRegression", cl, X_test, y_test)

# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook give the same scores.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)
accuracy("LinearSVC", svm, X_test, y_test)

rf = RandomForestClassifier(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
accuracy("RandomForestClassifier", rf, X_test, y_test)

In [None]:
from sentence_transformers import SentenceTransformer
print("Loading BGE-M3 model...")
emb_model = SentenceTransformer("BAAI/bge-m3")

# Encode every cleaned text in batches of 32, showing a progress bar.
embeddings = emb_model.encode(
    df['clean_text'].tolist(),
    batch_size=32,
    show_progress_bar=True,
)
embeddings.shape


In [None]:
# Split the BGE-M3 embeddings with the same parameters as the TF-IDF split
# (test_size=0.2, random_state=42, stratified on y).
X_train_bge, X_test_bge, y_train_bge, y_test_bge = train_test_split(
    embeddings,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(X_train_bge.shape)
print(X_test_bge.shape)

In [None]:
#BGE-M3
# Fit the same three baseline classifiers on the BGE-M3 embeddings and print
# their evaluation reports.
clbge = LogisticRegression(max_iter=3000)
clbge.fit(X_train_bge, y_train_bge)
accuracy("LogisticRegression", clbge, X_test_bge, y_test_bge)

# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook give the same scores.
svmbge = LinearSVC(random_state=42)
svmbge.fit(X_train_bge, y_train_bge)
accuracy("LinearSVC", svmbge, X_test_bge, y_test_bge)

rfbge = RandomForestClassifier(n_estimators=300, random_state=42)
rfbge.fit(X_train_bge, y_train_bge)
accuracy("RandomForestClassifier", rfbge, X_test_bge, y_test_bge)

In [None]:
from openai import OpenAI

import os

# The API key is read from the OPENROUTER_API_KEY environment variable so that
# it never lives in the notebook; a missing variable raises KeyError.
# SECURITY: a key was previously hard-coded in this cell. Treat that key as
# compromised and revoke it.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)
batch_size = 100
embeddings_list = []

# Request embeddings for the cleaned texts, batch_size rows per API call, and
# collect the vectors in row order.
for i in range(0, len(df), batch_size):
    batch_texts = df['clean_text'].iloc[i:i+batch_size].tolist()
    resp = client.embeddings.create(
        model="openai/text-embedding-3-small",
        input=batch_texts
    )
    embeddings_list.extend([d.embedding for d in resp.data])

In [None]:
# Sanity check: print the first returned embedding (the whole vector is printed).
print(embeddings_list[0])

In [None]:
# Stack the collected embeddings into one array and split it with the same
# parameters as the other feature sets (test_size=0.2, random_state=42,
# stratified on y).
X_train_openai, X_test_openai, y_train_openai, y_test_openai = train_test_split(
    np.array(embeddings_list),
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(X_train_openai.shape)
print(X_test_openai.shape)

In [None]:
# Display the first training row of the OpenAI embedding matrix.
X_train_openai[0]

In [None]:
#openAI
# Fit the three baseline classifiers on the OpenAI embeddings and print their
# evaluation reports. Only this logistic regression uses
# class_weight='balanced'.
clopenai = LogisticRegression(max_iter=3000, class_weight='balanced')
clopenai.fit(X_train_openai, y_train_openai)
accuracy("LogisticRegression", clopenai, X_test_openai, y_test_openai)

# random_state is fixed (same seed as the train/test split) so that repeated
# runs of the notebook give the same scores.
svmopenai = LinearSVC(random_state=42)
svmopenai.fit(X_train_openai, y_train_openai)
accuracy("LinearSVC", svmopenai, X_test_openai, y_test_openai)

rfopenai = RandomForestClassifier(n_estimators=300, random_state=42)
rfopenai.fit(X_train_openai, y_train_openai)
accuracy("RandomForestClassifier", rfopenai, X_test_openai, y_test_openai)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import lightgbm  as lgb


In [None]:
#BGE-M3
# Additional classifiers on the BGE-M3 embeddings. They are trained and scored
# on integer-encoded labels produced by a LabelEncoder fitted on the training
# labels.
le = LabelEncoder()
y_train_bge_enc = le.fit_transform(y_train_bge)
y_test_bge_enc = le.transform(y_test_bge)

knnbge = KNeighborsClassifier(n_neighbors=5)
knnbge.fit(X_train_bge, y_train_bge_enc)
accuracy("KNeighborsClassifier", knnbge, X_test_bge, y_test_bge_enc)

mlpbge = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=500,
    random_state=42,
)
mlpbge.fit(X_train_bge, y_train_bge_enc)
accuracy("MLPClassifier", mlpbge, X_test_bge, y_test_bge_enc)

# NOTE(review): use_label_encoder is reported as deprecated in newer xgboost
# releases - confirm against the installed version.
xgbbge = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgbbge.fit(X_train_bge, y_train_bge_enc)
accuracy("XGBClassifier", xgbbge, X_test_bge, y_test_bge_enc)

lgbmbge = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
)
lgbmbge.fit(X_train_bge, y_train_bge_enc)
accuracy("LGBMClassifier", lgbmbge, X_test_bge, y_test_bge_enc)

In [None]:
#OpenAI
# Additional classifiers on the OpenAI embeddings. They are trained and scored
# on integer-encoded labels produced by a LabelEncoder fitted on the training
# labels.
le = LabelEncoder()
y_train_openai_enc = le.fit_transform(y_train_openai)
y_test_openai_enc = le.transform(y_test_openai)

knnopenai = KNeighborsClassifier(n_neighbors=5)
knnopenai.fit(X_train_openai, y_train_openai_enc)
accuracy("KNeighborsClassifier", knnopenai, X_test_openai, y_test_openai_enc)

mlpopenai = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=500,
    random_state=42,
)
mlpopenai.fit(X_train_openai, y_train_openai_enc)
accuracy("MLPClassifier", mlpopenai, X_test_openai, y_test_openai_enc)

# NOTE(review): use_label_encoder is reported as deprecated in newer xgboost
# releases - confirm against the installed version.
xgbopenai = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
xgbopenai.fit(X_train_openai, y_train_openai_enc)
accuracy("XGBClassifier", xgbopenai, X_test_openai, y_test_openai_enc)

lgbmopenai = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    random_state=42,
)
lgbmopenai.fit(X_train_openai, y_train_openai_enc)
accuracy("LGBMClassifier", lgbmopenai, X_test_openai, y_test_openai_enc)

In [None]:
def accuracy_alltogether(names,models,X_test,y_test):
  """Print one summary row of test metrics per model.

  Args:
    names: Display names, one per model, in the same order as ``models``.
    models: Fitted estimators exposing ``predict``.
    X_test: Test features passed to each model's ``predict``.
    y_test: True labels, in the same label encoding the models were fitted
      with.

  Returns:
    None. For every (name, model) pair a row with accuracy and weighted
    precision / recall / F1 (4 decimals) is printed.
  """
  for name, model in zip(names, models):
    # Predict with the model of the current row. The loop previously called
    # the global `cl` here, so every row reported the same classifier.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Weighted averages; zero_division=0 scores a class without predictions
    # as 0 instead of emitting a warning.
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"{name:30} {accuracy:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f}")


In [None]:
# Summary table: one row per (classifier, feature set).
# A model returns predictions in the label encoding it was fitted with, so
# each group below is scored against test labels in that same encoding: the
# LogisticRegression / LinearSVC / RandomForest models were fitted on the
# string labels, the remaining models on the integer-encoded labels.
print(f"{'Model':30} {'Accuracy':10} {'Precision':10} {'Recall':10} {'F1-score':10}")
print("-" * 80)
names = ["LogisticRegression-TFIDF", "LinearSVC-TFIDF", "RandomForestClassifier-TFIDF"]
models = [cl, svm, rf]

accuracy_alltogether(names,models,X_test,y_test)

print("-" * 80)

# BGE-M3, models fitted on string labels.
names = ["LogisticRegression-BGE", "LinearSVC-BGE", "RandomForestClassifier-BGE"]
models = [clbge, svmbge, rfbge]

accuracy_alltogether(names,models, X_test_bge, y_test_bge)

# BGE-M3, models fitted on integer-encoded labels.
names = ["KNeighborsClassifier-BGE", "MLPClassifier-BGE", "XGBClassifier-BGE",  "LGBMClassifier-BGE"]
models = [knnbge, mlpbge, xgbbge, lgbmbge]

accuracy_alltogether(names,models, X_test_bge, y_test_bge_enc)

print("-" * 80)

# OpenAI, models fitted on string labels.
names = ["LogisticRegression-OpenAI", "LinearSVC-OpenAI", "RandomForestClassifier-OpenAI"]
models = [clopenai, svmopenai, rfopenai]

accuracy_alltogether(names,models, X_test_openai, y_test_openai)

# OpenAI, models fitted on integer-encoded labels.
names = ["KNeighborsClassifier-OpenAI", "MLPClassifier-OpenAI", "XGBClassifier-OpenAI",  "LGBMClassifier-OpenAI"]
models = [knnopenai, mlpopenai, xgbopenai, lgbmopenai]

accuracy_alltogether(names,models, X_test_openai, y_test_openai_enc)
