In [2]:
import numpy as np
import pandas as pd
from hazm import stopwords_list

In [3]:
import re
import pandas as pd
from hazm import stopwords_list
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

# Load dataset
df = pd.read_csv('../data/taghche.csv')

# Create 'category' column
df['category'] = df['rate'].apply(lambda x: 'BAD' if x < 3 else ('NEUTRAL' if x < 5 else 'GOOD'))

# Load stop words
stop_words = stopwords_list()

def undersample(x, y):
    rus = RandomUnderSampler(random_state=0)
    x_under, y_under = rus.fit_resample(x, y)
    return pd.concat([x_under, y_under], axis=1)

def preprocess(text):
    text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
    words = text.split()
    return words

def calculate_margin(row, category_counts):
    max_val = row[['GOOD', 'BAD', 'NEUTRAL']].max()
    min_val = row[['GOOD', 'BAD', 'NEUTRAL']].min()
    max_cat = row[['GOOD', 'BAD', 'NEUTRAL']].idxmax()
    min_cat = row[['GOOD', 'BAD', 'NEUTRAL']].idxmin()
    
    max_freq = (max_val / category_counts[max_cat]) * 100
    min_freq = (min_val / category_counts[min_cat]) * 100
    
    margin = max_freq - min_freq
    return margin

# Initialize counters for each category
category_counters = {'GOOD': Counter(), 'BAD': Counter(), 'NEUTRAL': Counter()}

# Apply undersampling
df_sampled = undersample(x=df[['comment']], y=df[['category']])

# Drop any rows with NaN values in 'category'
df_sampled = df_sampled.dropna(subset=['category'])

# Count the number of comments in each category
category_counts = df_sampled['category'].value_counts().to_dict()

# Process each comment
for index, row in df_sampled.iterrows():
    comment = row['comment']
    category = row['category']
    if isinstance(comment, str):
        words = preprocess(comment)
        stop_words_in_comment = [word for word in words if word in stop_words]
        category_counters[category].update(stop_words_in_comment)

# Create a DataFrame to hold the results
all_stop_words = list(set(stop_words))
result_df = pd.DataFrame(index=all_stop_words, columns=['BAD', 'NEUTRAL', 'GOOD'])
result_df = result_df.fillna(0)  # Fill with zeroes

# Fill the DataFrame with the counts
for category, counter in category_counters.items():
    for word, freq in counter.items():
        result_df.at[word, category] = freq

# Reset index to have 'Stop Word' as a column
result_df = result_df.reset_index().rename(columns={'index': 'Stop Word'})

# Calculate the margin for each stop word using the new formula
result_df['Margin'] = result_df.apply(calculate_margin, axis=1, category_counts=category_counts)


result_df.to_csv('../data/stop_words_frequency_with_margin.csv', index=False, encoding='utf-8')
result_df

  result_df = result_df.fillna(0)  # Fill with zeroes


Unnamed: 0,Stop Word,BAD,NEUTRAL,GOOD,Margin
0,گرفته,176,167,113,0.454283
1,یافته,16,12,20,0.057687
2,خوبی,644,1546,773,6.504182
3,دو,501,615,385,1.658494
4,لذا,12,15,5,0.072108
...,...,...,...,...,...
384,نخست,7,7,8,0.007211
385,شمار,9,9,4,0.036054
386,کنید,533,530,544,0.100952
387,هستیم,44,52,53,0.064898


In [7]:
import pandas as pd
from collections import Counter
import re

df = pd.read_csv('../data/taghche.csv')
df['category'] = df['rate'].apply(lambda x: 'BAD' if x < 3 else ('NEUTRAL' if x < 5 else 'GOOD'))
stop_words = stopwords_list()

def preprocess(text):
    words = text.split()
    return words

def calculate_margin(row, comment_counts):
    avg_freq = row[['GOOD', 'BAD', 'NEUTRAL']].mean()
    max_margin = (row[['GOOD', 'BAD', 'NEUTRAL']] - avg_freq).abs().max()
    stop_word = row['Stop Word']
    return max_margin / comment_counts[stop_word]*100 if comment_counts[stop_word] > 0 else 0


category_counters = {'GOOD': Counter(), 'BAD': Counter(), 'NEUTRAL': Counter()}
comment_counts = Counter()

for index, row in df.iterrows():
    comment = row['comment']
    category = row['category']
    if isinstance(comment, str):
        words = preprocess(comment)
        stop_words_in_comment = [word for word in words if word in stop_words]
        category_counters[category].update(stop_words_in_comment)
        unique_stop_words_in_comment = set(stop_words_in_comment)
        comment_counts.update(unique_stop_words_in_comment)

all_stop_words = list(set(stop_words)) 
result_df = pd.DataFrame(index=all_stop_words, columns=['BAD', 'NEUTRAL', 'GOOD'])
result_df = result_df.fillna(0)  # Fill with zeroes

for category, counter in category_counters.items():
    for word, freq in counter.items():
        result_df.at[word, category] = freq

result_df = result_df.reset_index().rename(columns={'index': 'Stop Word'})

result_df['Margin'] = result_df.apply(lambda row: calculate_margin(row, comment_counts), axis=1)
result_df.to_csv('../data/stop_words_frequency_with_margin.csv', index=False, encoding='utf-8')
result_df


  result_df = result_df.fillna(0)  # Fill with zeroes


Unnamed: 0,Stop Word,BAD,NEUTRAL,GOOD,Margin
0,گرفته,155,170,277,13.778580
1,یافته,15,13,48,31.050228
2,خوبی,640,1812,2312,21.545455
3,دو,489,713,1097,17.027120
4,لذا,12,16,14,5.128205
...,...,...,...,...,...
384,نخست,6,10,23,27.027027
385,شمار,9,10,17,14.285714
386,کنید,382,402,1038,26.292226
387,هستیم,26,41,117,30.755064
