## Extract data (split columns)

In [4]:
import pandas as pd
import numpy as np
import re


# Load the raw tab-separated Snappfood file.
initial_data = pd.read_csv("Snappfood.csv", sep="\t", encoding="utf-8-sig")

# Keep only the rows in which all three columns of interest are present.
required_columns = ["comment", "label", "label_id"]
filtered_data = initial_data.dropna(subset=required_columns)

# Rebuild the frame from the bare column values so that it gets a fresh
# 0..n-1 index instead of the gaps left behind by the dropped rows.
# Labels are lower-cased; label ids are stored as 8-bit integers.
new_data = pd.DataFrame(
    {
        "comment": filtered_data["comment"].dropna().values,
        "label": filtered_data["label"].str.lower().dropna().values,
        "label_id": filtered_data["label_id"].dropna().values.astype(np.int8),
    }
)

# Persist the cleaned columns for the following steps.
new_data.to_csv("new_data.csv", encoding="utf-8-sig", index=False)


## Data cleaning (remove words with one character and other signs like: ! ?)

In [5]:
# Characters accepted inside a Persian word: U+0621-U+065F and U+066E-U+06CC
# (hamza through Farsi yeh, which covers the Persian letters, the Arabic letter
# variants, tatweel and the combining vowel marks).
# The gap U+0660-U+066D is excluded on purpose: it holds the Arabic-Indic
# digits and the percent / decimal / thousands / star signs, which a single
# contiguous range from alef-madda to Farsi yeh would otherwise accept.
_PERSIAN_WORD_RE = re.compile(r'[\u0621-\u065F\u066E-\u06CC]+')


def is_persian(word):
    """Return True when ``word`` is made up only of Persian/Arabic-script letters.

    The whole token must match, so a token that contains a Latin letter, a
    digit (Arabic-Indic or Persian), punctuation such as ``!`` or ``?``, or a
    zero-width non-joiner is rejected. The empty string is rejected as well.
    No minimum length is enforced: a single letter counts as a word.

    Parameters
    ----------
    word : str
        A single whitespace-delimited token.

    Returns
    -------
    bool
        True if every character of ``word`` is in the accepted letter set.
    """
    return bool(_PERSIAN_WORD_RE.fullmatch(word))


# Reload the cleaned reviews written by the extraction step.
temp_data = pd.read_csv("new_data.csv", encoding="utf-8-sig")

# Rows with label_id 1 form the "sad" set, rows with label_id 0 the "happy" set.
sad_data = temp_data[temp_data["label_id"] == 1]
happy_data = temp_data[temp_data["label_id"] == 0]

# Whitespace-tokenise every comment; each entry becomes a list of tokens.
sad_temp = sad_data["comment"].str.split()
happy_temp = happy_data["comment"].str.split()

# Flatten the per-comment token lists into one list per class.
# A plain Python flatten is used instead of np.hstack: hstack converts the
# ragged lists into a fixed-width unicode array (every token padded to the
# width of the longest one) and raises ValueError when a class has no rows.
happy_flat_words = [token for tokens in happy_temp.dropna() for token in tokens]
sad_flat_words = [token for tokens in sad_temp.dropna() for token in tokens]

# Keep only the tokens that is_persian() accepts.
# NOTE(review): the section heading mentions dropping one-character words,
# but no length filter is applied here - confirm which one is intended.
happy_persian = [w for w in happy_flat_words if is_persian(w)]
sad_persian = [w for w in sad_flat_words if is_persian(w)]


## Count the occurrences of each word and save them in separate CSV files

In [6]:
def _word_counts(words, count_column):
    """Build a frame of the distinct entries of ``words`` and their frequencies.

    The result has a "word" column holding the distinct values as returned by
    ``np.unique`` and a column named ``count_column`` holding how many times
    each of them occurs in ``words``.
    """
    distinct, occurrences = np.unique(words, return_counts=True)
    return pd.DataFrame({"word": distinct, count_column: occurrences})


happy_df = _word_counts(happy_persian, "count_happy")
sad_df = _word_counts(sad_persian, "count_sad")

# Write one frequency table per class.
for frame, path in ((happy_df, "happy_words.csv"), (sad_df, "sad_words.csv")):
    frame.to_csv(path, encoding="utf-8-sig", index=False)


## Calculate the occurrence percentage of each word

In [7]:
# Reload the per-class frequency tables.
# NOTE(review): unlike the other reads in this notebook, these do not pass
# encoding="utf-8-sig" although the files are written with it - confirm the
# "word" header is still parsed cleanly.
happy_words = pd.read_csv("happy_words.csv")
sad_words = pd.read_csv("sad_words.csv")

# Class totals are taken from the unmerged tables.
total_happy_words = happy_words["count_happy"].sum()
total_sad_words = sad_words["count_sad"].sum()

# Outer join keeps words that occur in only one class; their missing count
# in the other class is filled with 0.
result_df = happy_words.merge(sad_words, on="word", how="outer").fillna(0)

# Share of each word within its class, in percent, rounded to 3 decimals.
result_df["percent_happy"] = (
    result_df["count_happy"].div(total_happy_words).mul(100).round(3)
)
result_df["percent_sad"] = (
    result_df["count_sad"].div(total_sad_words).mul(100).round(3)
)

# Absolute gap between the two (already rounded) percentages.
result_df["diff"] = (
    result_df["percent_happy"].sub(result_df["percent_sad"]).abs().round(3)
)

## Set a threshold to select the distinguishing words and remove some exceptions

In [8]:
happy_counts = result_df["count_happy"]
sad_counts = result_df["count_sad"]

# Number of decimal digits in each count (length of its integer string form).
digits_happy = happy_counts.astype(int).astype(str).str.len()
digits_sad = sad_counts.astype(int).astype(str).str.len()

# Absolute gap between the raw counts of the two classes.
value_diff = (happy_counts - sad_counts).abs()

happy_is_4, happy_is_5 = digits_happy.eq(4), digits_happy.eq(5)
sad_is_4, sad_is_5 = digits_sad.eq(4), digits_sad.eq(5)

# Words excluded from the result:
#   * sad count has 5 digits and happy count has 4 or 5 digits, or
#   * both counts have 4 digits and differ by less than 500.
# NOTE(review): happy 5-digit / sad 4-digit is not excluded - confirm that
# this asymmetry is intended.
remove_condition = (
    ((happy_is_4 | happy_is_5) & sad_is_5)
    | (happy_is_4 & sad_is_4 & value_diff.lt(500))
)

# Minimum gap between the class percentages for a word to be kept.
min_diff = 0.08

condition = ~remove_condition & result_df["diff"].gt(min_diff)

filtered = result_df.loc[condition]

## Save the output and display it in the console

In [9]:
# Persist the selected words together with their counts and percentages.
filtered.to_csv("final_result.csv", encoding="utf-8-sig", index=False)

# Print the 50 words with the highest share, first for the happy class and
# then for the sad class.
for percent_column in ("percent_happy", "percent_sad"):
    ranked = filtered.sort_values(percent_column, ascending=False)
    print(ranked.head(50)["word"].tolist())

['خیلی', 'عالی', 'خوب', 'غذا', 'ولی', 'خوشمزه', 'ممنون', 'فقط', 'سفارش', 'این', 'شده', 'رو', 'همیشه', 'رسید', 'من', 'تازه', 'پیتزا', 'مثل', 'همه', 'پیک', 'گرم', 'سریع', 'موقع', 'نبود', 'یک', 'سرد', 'تا', 'ممنونم', 'مرغ', 'تحویل', 'خوش', 'داغ', 'دستم', 'مزه', 'تشکر', 'کل', 'فود', 'چیز', 'بهتر', 'زمان', 'کمی', 'کاملا', 'فوق', 'زود', 'اصلا', 'چی', 'مرسی', 'یکم', 'العاده', 'کیک']
['غذا', 'خیلی', 'سفارش', 'شده', 'این', 'رو', 'اصلا', 'من', 'سرد', 'پیتزا', 'ولی', 'نبود', 'بد', 'خوب', 'یک', 'مرغ', 'کاملا', 'بی', 'تا', 'تحویل', 'مزه', 'متاسفانه', 'فقط', 'پایین', 'ساعت', 'دیگه', 'پیک', 'نداشت', 'داده', 'دادم', 'رسید', 'مونده', 'بعد', 'کیک', 'گوشت', 'بار', 'پنیر', 'دو', 'کرده', 'تازه', 'خشک', 'قابل', 'نه', 'همیشه', 'تاخیر', 'هیچ', 'ریخته', 'سوخته', 'همه', 'خمیر']
