In [1]:

import pandas as pd


# Parse the raw export without a header row, skipping lines pandas rejects.
# Only column 0 of the result is used below.
# NOTE(review): the fields are split on tabs afterwards, yet the file is read
# with the default comma separator — confirm whether sep='\t' was intended.
df = pd.read_csv(
    'Snappfood.csv',
    header=None,
    encoding='utf-8',
    on_bad_lines='skip',
)

# Cut column 0 at its first two tabs into three string pieces and name them.
# The piece named 'label_id' here is overwritten further down.
split_columns = df[0].str.split('\t', n=2, expand=True)
split_columns.columns = ['label_id', 'comment', 'label']

# Remove any tab still inside a piece (the last piece keeps everything after
# the second tab), drop the row labelled 0 (presumably the header line —
# verify), and renumber the remaining rows from zero.
split_columns = (
    split_columns
    .apply(lambda column: column.str.replace('\t', '', regex=False))
    .drop(index=0)
    .reset_index(drop=True)
)
clean_df = split_columns

# Write the cleaned frame to disk and load it back.
clean_df.to_csv('Snappfood_clean.csv', index=False)
saved_df = pd.read_csv('Snappfood_clean.csv')

# Split 'label' into its run of capital letters and the digit that follows,
# storing them as 'label' and 'label_id' respectively.
label_parts = saved_df['label'].str.extract(r'([A-Z]+)(\d)')
saved_df[['label', 'label_id']] = label_parts

# Persist the final table; the analysis below works on this re-read copy.
saved_df.to_csv('Snappfood_final.csv', index=False)
snap_comments = pd.read_csv('Snappfood_final.csv')

def split_words(comments):
    """Split one comment into whitespace-separated tokens wrapped in single quotes.

    Parameters
    ----------
    comments : str
        Text of a single comment. Any non-string value (for example the NaN
        that ``pd.read_csv`` yields for an empty cell, or ``None``) is treated
        as a comment without words.

    Returns
    -------
    list of str
        One ``'token'`` string (quotes included) per token, in original order;
        an empty list for empty or non-string input.
    """
    # A missing comment has no .split(); return no tokens instead of raising.
    if not isinstance(comments, str):
        return []
    return [f"'{word}'" for word in comments.split()]
# Store the quoted-token list of every comment, then flatten those lists into
# one token per row (the row index of the source comment is repeated).
saved_df['words'] = saved_df['comment'].apply(split_words)
all_words = saved_df['words'].explode()

# Tokens left out of the per-label word-frequency tables. Membership is by
# exact token, so variants with attached punctuation are listed separately.
# Entries that were repeated in the original list appear once here; the
# resulting set is the same.
stop_words = {
    '!', 'و', 'بود', '"اصلا""', 'به', 'از', 'بود.', 'خیلی', 'هم', 'که',
    'با', 'ولی', 'در', 'فقط', 'این', 'شده', 'رو', 'همیشه', 'رسید', 'من',
    'واقعا', 'همه', 'مثل', 'بسیار', 'بود،', 'اما', 'تا', 'بودم', 'اینکه',
    'داشت', 'بودن', 'برای', 'یک', 'تو', 'دیگه', 'داده', 'دادم', 'بعد',
    'یه', 'را',
}

def create_word_label_df(comments, label, stopwords=None):
    """Build a long-format (word, label) table from a collection of comments.

    Parameters
    ----------
    comments : iterable
        Comment texts. Entries that are not strings (for example NaN from an
        empty CSV cell) are skipped rather than raising.
    label : object
        Value placed in the ``label`` column of every returned row.
    stopwords : collection of str, optional
        Tokens to exclude. When omitted, the module-level ``stop_words`` set
        is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``label``, one row per kept token, in the order
        the tokens occur in ``comments``.
    """
    if stopwords is None:
        stopwords = stop_words

    words = [
        word
        for comment in comments
        if isinstance(comment, str)  # missing comments have no .split()
        for word in comment.split()
        # The lower-cased form is compared, the token itself is kept.
        if word.lower() not in stopwords
    ]
    return pd.DataFrame({'word': words, 'label': label})





# Character total over every token of every comment.
# NOTE(review): tokens come from split_words, so each one includes its two
# surrounding quote characters — confirm that is the intended measure.
total_length_all = all_words.str.len().sum()
print(f"Total length of all words in all comments: {total_length_all}")

# Row selectors for the two sentiment labels.
is_happy = snap_comments['label'] == 'HAPPY'
is_sad = snap_comments['label'] == 'SAD'

# HAPPY comments: tokenise, flatten, and add up token lengths.
happy_comments = snap_comments[is_happy]['comment'].apply(split_words)
all_happy_words = []
for words_list in happy_comments:
    all_happy_words.extend(words_list)
lengths_happy = list(map(len, all_happy_words))
total_length_happy = sum(lengths_happy)
print(f"Total length of all words in HAPPY comments: {total_length_happy}")

# SAD comments: same steps.
sad_comments = snap_comments[is_sad]['comment'].apply(split_words)
all_sad_words = []
for words_list in sad_comments:
    all_sad_words.extend(words_list)
lengths_sad = list(map(len, all_sad_words))
total_length_sad = sum(lengths_sad)
print(f"Total length of all words in SAD comments: {total_length_sad}")

# One (word, label) row per token that is not a stop word, for both labels,
# cross-tabulated into a word-by-label count table.
happy_df = create_word_label_df(snap_comments[is_happy]['comment'], 'HAPPY')
sad_df = create_word_label_df(snap_comments[is_sad]['comment'], 'SAD')
combined_df = pd.concat([happy_df, sad_df])
word_counts = pd.crosstab(index=combined_df['word'], columns=combined_df['label'])

# Rank words by count within each label, most frequent first.
happy_words_sorted = word_counts['HAPPY'].sort_values(ascending=False)
sad_words_sorted = word_counts['SAD'].sort_values(ascending=False)

# Report the top 50 words that actually occur under each label.
happy = happy_words_sorted[happy_words_sorted > 0].head(50)
sad = sad_words_sorted[sad_words_sorted > 0].head(50)

print('*' * 50)
print("Most common words in HAPPY comments:")
print(happy)
print('*' * 50)
print("\nMost common words in SAD comments:")
print(sad)



Total length of all words in all comments: 7558737
Total length of all words in HAPPY comments: 3268537
Total length of all words in SAD comments: 4287420
**************************************************
Most common words in HAPPY comments:
word
عالی       10487
خوب         9712
غذا         6158
کیفیت       5965
خوشمزه      4770
ممنون       4215
سفارش       3485
تازه        3023
پیتزا       2766
طعم         2716
پیک         2526
ارسال       2380
گرم         2332
سریع        2286
کم          2089
موقع        1836
بسته        1818
نبود        1788
اسنپ        1637
سیب         1612
سس          1573
سرد         1571
بندی        1482
نان         1434
ممنونم      1411
مرغ         1410
راضی        1386
تحویل       1349
خوش         1328
داغ         1311
قیمت        1306
حجم         1294
دستم        1254
مزه         1222
تشکر        1217
کل          1204
نسبت        1183
فود         1171
زیاد        1171
…           1161
چیز         1130
بهتر        1125
رستوران     1105
زمان        1096
کمی 

In [83]:
# Keep only the first occurrence of each distinct token; the surviving rows
# retain the index of the comment in which the token first appeared.
is_repeat = all_words.duplicated(keep='first')
drop_duplicated = all_words[~is_repeat]
print(drop_duplicated)


0            'واقعا'
0              'حیف'
0              'وقت'
0               'که'
0           'بنویسم'
            ...     
69732     'دارم!!!؟'
69735     'عااااشقش'
69743    'تعجبه!!!!'
69743     'دورریختن'
69748      'چربی‌اش'
Name: words, Length: 46433, dtype: object
46433
