## Turkish Product Reviews

### Dataset Preprocessing

In [None]:
import os
import pandas as pd
import numpy as np
from datasets import load_dataset

from src.utils import get_project_config
from src.utils import missing_values
import time
from datetime import datetime
import warnings

# import matplotlib.pyplot as plt
# import seaborn

In [None]:
# Show every column and never truncate cell text when DataFrames are displayed.
for option_name in ('display.max_columns', 'max_colwidth'):
    pd.set_option(option_name, None)

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

#### Set Path and Constant Values

In [None]:
# Project directory layout, resolved relative to the current working directory.
ROOT_PATH = os.getcwd()
CFG_PATH = os.path.join(ROOT_PATH, 'cfg')
DATA_PATH = os.path.join(ROOT_PATH, 'data')
RAW_DATA_PATH, EMBEDDING_DATA_PATH = (
    os.path.join(DATA_PATH, subdir) for subdir in ('raw', 'embedding')
)

# Create the output folders up front; folders that already exist are left alone.
for output_dir in (RAW_DATA_PATH, EMBEDDING_DATA_PATH):
    os.makedirs(output_dir, exist_ok=True)

### Import Dataset: HepsiBurada Product Reviews

In [None]:
# Load the HepsiBurada review set from its parquet file (hf:// URI) and keep a
# gzip-compressed CSV copy in the raw-data folder.
hb_parquet_uri = "hf://datasets/fthbrmnby/turkish_product_reviews/data/train-00000-of-00001.parquet"
hb_raw_csv_path = os.path.join(RAW_DATA_PATH, "hb_tr_product_reviews.csv.gz")

df_hb_data_all = pd.read_parquet(hb_parquet_uri)
df_hb_data_all.to_csv(hb_raw_csv_path, compression='gzip', index=False)

# Alternative loader via the `datasets` package (kept for reference):
# dataset = load_dataset("fthbrmnby/turkish_product_reviews", split="train")
# df_hb_data_all = dataset.to_pandas()

df_hb_data_all.head()

### Show Metadata, Shape, Statistics and Missing Table

In [None]:
# Print column dtypes, non-null counts and memory usage of the raw frame.
df_hb_data_all.info()

In [None]:
# (rows, columns) of the raw frame.
df_hb_data_all.shape

In [None]:
# Relative frequency of each sentiment label (fractions, not raw counts).
df_hb_data_all['sentiment'].value_counts(normalize=True)

In [None]:
# Project helper imported from src.utils; presumably builds a per-column
# missing-value table — TODO confirm what threshold=0 and asc_sorting control.
missing_values(df=df_hb_data_all, threshold=0, asc_sorting=False)

In [None]:
# Preview the first five rows of the raw frame.
df_hb_data_all.head(5)

### Split Data

In [None]:
# Why we subsample:
# 1- Balanced dataset
# 2- Limited computing power

In [None]:
# Build a balanced subset: draw the same number of rows from each sentiment
# class with a fixed seed, stack them, then shuffle the combined frame.
rows_per_class = 3750
sampling_seed = 34

is_class_1 = df_hb_data_all['sentiment'] == 1
is_class_0 = df_hb_data_all['sentiment'] == 0

df_hb_data_1 = df_hb_data_all.loc[is_class_1].sample(n=rows_per_class, random_state=sampling_seed)
df_hb_data_0 = df_hb_data_all.loc[is_class_0].sample(n=rows_per_class, random_state=sampling_seed)

df_hb_data_balanced = pd.concat([df_hb_data_1, df_hb_data_0])

# frac=1 returns every row in random order; the old index is discarded.
shuffled = df_hb_data_balanced.sample(frac=1, random_state=sampling_seed)
df_hb_data = shuffled.reset_index(drop=True)

# Check the class distribution
print(df_hb_data['sentiment'].value_counts())

In [None]:
# Per-review size features: character count and whitespace-separated word count.
# Both go through the vectorised .str accessor, so a missing sentence yields
# NaN instead of raising TypeError from len().
df_hb_data["sentenceLength"] = df_hb_data["sentence"].str.len()
df_hb_data["sentenceWordCount"] = df_hb_data["sentence"].str.split().str.len()

In [None]:
# Preview the first ten rows of the balanced, shuffled sample.
df_hb_data.head(10)

In [None]:
# Word-count summary with extra upper-tail percentiles; transposed so each
# statistic becomes a column.
df_hb_data[["sentenceWordCount"]].describe(percentiles=[0.80,0.85,0.90,0.95,0.99]).T

In [None]:
# Mean word count and mean character length per sentiment class.
length_aggregations = {
    'SentimentWordCount_Mean': ('sentenceWordCount', 'mean'),
    'SentimentLength_Mean': ('sentenceLength', 'mean'),
}
df_hb_data.groupby(by=['sentiment'], as_index=False).agg(**length_aggregations)

### Embedding

In [None]:
# Keep only the review text and its label for the embedding step.
df = df_hb_data.loc[:, ['sentence', 'sentiment']]
df.iloc[:5]

In [None]:
from sentence_transformers import SentenceTransformer

# Model identifiers handed to SentenceTransformer, one embedding run per entry.
embedding_model = [
    "sentence-transformers/all-MiniLM-L12-v2",
    "jinaai/jina-embeddings-v3",
    "intfloat/multilingual-e5-large-instruct",
    "BAAI/bge-m3",
    "thenlper/gte-large",
]

In [None]:
# Encode a small sample of reviews with every candidate model and write one
# gzip-compressed CSV per model: the sample's columns plus one column per
# embedding dimension.
# NOTE(review): only the first 5 rows are embedded — presumably a smoke-test
# size; confirm before relying on these files for the full balanced set.
df_sample = df[0:5].reset_index(drop=True)  # fresh 0..n-1 index so concat aligns rows
sample_sentences = df_sample['sentence'].tolist()

for model_name in embedding_model:
    # Best effort: a model that fails to load or encode is reported and
    # skipped so the remaining models still run.
    try:
        embedding_start = datetime.now()
        print(f"Embedding Model Name: {model_name} - {embedding_start} ")
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run locally — keep the model list to trusted ids.
        model = SentenceTransformer(model_name, trust_remote_code=True)
        # '/' in the model id would create sub-directories, so it is replaced.
        # Single quotes inside the f-string keep this valid before Python 3.12.
        model_file_stem = f"hb_{model_name.replace('/', '__')}"
        model_output_file = os.path.join(EMBEDDING_DATA_PATH, f"{model_file_stem}_embeddings.csv.gz")
        embeddings = model.encode(sample_sentences)
        embedding_dim = len(embeddings[0])
        print(f"Embedding Dimension: {embedding_dim}")
        embedding_columns = [f'embedding_{i}' for i in range(embedding_dim)]
        df_embedding = pd.DataFrame(embeddings, columns=embedding_columns)
        df_data = pd.concat([df_sample, df_embedding], axis=1)
        df_data.to_csv(model_output_file, compression='gzip', index=False)
        print(f"Embedding Duration: {datetime.now() - embedding_start}")
        print("- " * 15)
    except Exception as exc:
        print("Exception: ", exc)

In [None]:
# Read back two of the stored embedding files to inspect them.
minilm_embedding_file = os.path.join(EMBEDDING_DATA_PATH, "hb_sentence-transformers__all-MiniLM-L12-v2_embeddings.csv.gz")
jina_embedding_file = os.path.join(EMBEDDING_DATA_PATH, "hb_jinaai__jina-embeddings-v3_embeddings.csv.gz")

df_emb_data_1 = pd.read_csv(minilm_embedding_file, compression='gzip')
# NOTE(review): with default notebook settings only a cell's final expression
# is rendered, so this bare name produces no output here.
df_emb_data_1

df_emb_data_2 = pd.read_csv(jina_embedding_file, compression='gzip')
df_emb_data_2