In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Fetch the SnappFood comments dataset archive into the current working directory.
!kaggle datasets download -d moeinkpr/snappfood-comments

Dataset URL: https://www.kaggle.com/datasets/moeinkpr/snappfood-comments
License(s): MIT
Downloading snappfood-comments.zip to /content
  0% 0.00/98.1M [00:00<?, ?B/s]
100% 98.1M/98.1M [00:00<00:00, 1.71GB/s]


In [None]:
!unzip snappfood-comments.zip

Archive:  snappfood-comments.zip
  inflating: comments.csv            
  inflating: vendors.csv             


In [None]:
# Install RAPIDS (cuDF and related libraries) using the rapidsai-csp-utils Colab helper.
# NOTE(review): the clone is unpinned, so the installed versions follow whatever the
# helper script currently targets. On a re-run in the same session `git clone` reports
# that the directory already exists, and the install script on the next line still runs.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 603, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 603 (delta 131), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (603/603), 199.38 KiB | 1.59 MiB/s, done.
Resolving deltas: 100% (305/305), done.
Installing RAPIDS remaining 25.08 libraries
Using Python 3.12.11 environment at: /usr
Resolved 180 packages in 1.60s
Prepared 41 packages in 1m 01s
Uninstalled 31 packages in 950ms
Installed 41 packages in 453ms
 - bokeh==3.7.3
 + bokeh==3.6.3
 + cucim-cu12==25.8.0
 + cuda-bindings==12.9.2
 + cuda-pathfinder==1.3.0
 - cuda-python==12.6.2.post1
 + cuda-python==12.9.2
 - cudf-cu12==25.6.0 (from https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl)
 + cudf-cu12==25.8.0
 + cugraph-cu12==25.8.0
 - cuml-cu12==25.6.0
 + cuml-cu12==25.8.0
 - cuvs-cu12==25.6.1
 + cuvs-cu12==

In [None]:
# Reinstall hazm without letting pip resolve its declared dependencies.
# NOTE(review): presumably `--no-deps` is used so hazm's own pins do not replace the
# packages installed for RAPIDS above - confirm. numpy is pinned explicitly first.
!pip uninstall -y hazm
!pip install numpy==1.26.4
!pip install hazm --no-deps
# Runtime dependencies of hazm, installed by hand because of `--no-deps` above.
# NOTE(review): these are unpinned; consider pinning versions for reproducibility.
!pip install python-crfsuite fasttext-wheel gensim nltk flashtext

Found existing installation: hazm 0.10.0
Uninstalling hazm-0.10.0:
  Successfully uninstalled hazm-0.10.0
Collecting hazm
  Using cached hazm-0.10.0-py3-none-any.whl.metadata (11 kB)
Using cached hazm-0.10.0-py3-none-any.whl (892 kB)
Installing collected packages: hazm
Successfully installed hazm-0.10.0
Collecting python-crfsuite
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting fasttext-wheel
  Downloading fasttext_wheel-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext-wheel)
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-

In [None]:
import cudf
from hazm import Normalizer, word_tokenize, Stemmer, Lemmatizer, stopwords_list
import re
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd


In [None]:
# Read both CSV exports onto the GPU.
comments_table = cudf.read_csv("comments.csv")
vendors_table = cudf.read_csv("vendors.csv")

# Trimmed copy of the comments table: drop the columns that are not used further on.
comments_table_copy = comments_table.drop(
    columns=[
        "commentId",
        "date",
        "sender",
        "rating",
        "customerId",
        "feeling",
        "status",
        "foods",
        "replies",
    ]
)

# Trimmed copy of the vendors table; this is the value displayed by the cell.
vendors_table_copy = vendors_table.drop(
    columns=[
        "id",
        "highlight",
        "description",
        "address",
        "rating",
        "title",
    ]
)
vendors_table_copy

Unnamed: 0,code,commentCount,vendorType
0,pzlkj3,24112,RESTAURANT
1,0lvonp,10988,RESTAURANT
2,3drlyx,5815,RESTAURANT
3,klw59g,12,RESTAURANT
4,32m1rl,13542,RESTAURANT
...,...,...,...
2250,3x25xw,7391,RESTAURANT
2251,0rel4o,46,RESTAURANT
2252,0y16yg,22278,RESTAURANT
2253,znv94e,66,RESTAURANT


In [None]:
# Attach vendor metadata to every comment via the shared vendor `code`, then keep only
# comments whose vendor is a restaurant. Rows without a matching vendor get a null
# vendorType from the left join and are removed by the equality filter.
merged_df = comments_table_copy.merge(vendors_table_copy, on="code", how="left")
merged_df = merged_df[merged_df["vendorType"] == "RESTAURANT"]

# vendorType is constant after the filter, so it carries no information any more.
merged_df = merged_df.drop(columns=["vendorType"])

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on a
# column selected from the frame is not guaranteed to write through to the frame itself.
merged_df["deliveryComment"] = merged_df["deliveryComment"].fillna("")


In [None]:
# Combine the food comment and the delivery comment into one free-text field
# (missing parts count as empty strings), then discard the two source columns.
merged_df = merged_df.assign(
    text_raw=merged_df['commentText'].fillna('') + ' ' + merged_df['deliveryComment'].fillna('')
).drop(columns=['commentText', 'deliveryComment'])


In [None]:
# Parse the timestamp column, then derive the ISO-8601 week and its matching ISO year.
merged_df['createdDate'] = cudf.to_datetime(merged_df['createdDate'])

# Take week AND year from the same ISO calendar. Pairing the ISO week with the plain
# calendar year mislabels days around New Year: 2024-12-30 belongs to ISO week 1 of
# 2025, so (calendar year 2024, week 1) would be merged with the first week of
# January 2024.
iso_calendar = merged_df['createdDate'].dt.isocalendar()
merged_df['week'] = iso_calendar['week']
merged_df['year'] = iso_calendar['year']

In [None]:
# Check column dtypes and non-null counts after adding the date-derived columns.
merged_df.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 1078372 entries, 0 to 1290547
Data columns (total 8 columns):
 #   Column          Non-Null Count    Dtype
---  ------          --------------    -----
 0   createdDate     1078372 non-null  datetime64[ns]
 1   rate            1078372 non-null  float64
 2   expeditionType  1078084 non-null  object
 3   code            1078372 non-null  object
 4   commentCount    1078372 non-null  object
 5   text_raw        1078372 non-null  object
 6   week            1078372 non-null  uint32
 7   year            1078372 non-null  int16
dtypes: datetime64[ns](1), float64(1), int16(1), object(4), uint32(1)
memory usage: 174.1+ MB


In [None]:
# Integer-encode expeditionType with scikit-learn. LabelEncoder works on host data,
# so the column is copied from the GPU to pandas first.
le = LabelEncoder()
expedition_col = merged_df['expeditionType'].to_pandas()

# astype(str) turns missing values into a literal string, so they get a class of their own.
encoded = le.fit_transform(expedition_col.astype(str))

# `encoded` is positional (one code per row, in row order), but merged_df keeps the
# gapped index left by the earlier restaurant filter. Column assignment aligns on index
# labels, so the series must carry merged_df's own index. With a default 0..N-1 index
# the codes land on the wrong rows and rows whose label is >= N are left null.
merged_df['expeditionType_encoded'] = cudf.Series(encoded, index=merged_df.index)

# Rows without a rating are dropped before the cast, then the rating is stored as uint8.
merged_df = merged_df.dropna(subset=['rate'])
merged_df['rate'] = merged_df['rate'].astype('uint8')
merged_df = merged_df.drop(columns=['expeditionType'])

In [None]:
# Check column dtypes and non-null counts after encoding expeditionType and casting rate.
merged_df.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 1078372 entries, 0 to 1290547
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   createdDate             1078372 non-null  datetime64[ns]
 1   rate                    1078372 non-null  uint8
 2   code                    1078372 non-null  object
 3   commentCount            1078372 non-null  object
 4   text_raw                1078372 non-null  object
 5   week                    1078372 non-null  uint32
 6   year                    1078372 non-null  int16
 7   expeditionType_encoded  895700 non-null   int64
dtypes: datetime64[ns](1), int16(1), int64(1), object(3), uint32(1), uint8(1)
memory usage: 161.9+ MB


In [None]:
def auto_data_type(df):
    """Downcast every numeric column of ``df`` to a narrower dtype.

    Float columns are always converted to ``float32``. Integer columns are
    converted to the narrowest unsigned type that holds their maximum when the
    minimum is non-negative, otherwise to the narrowest signed type that holds
    both their minimum and maximum.

    The frame is modified in place; the same object is returned for convenience.
    """
    unsigned_ladder = (np.uint8, np.uint16, np.uint32)
    signed_ladder = (np.int8, np.int16, np.int32)

    for name in df.select_dtypes(include=[np.number]).columns:
        column = df[name]
        lowest, highest = column.min(), column.max()

        if pd.api.types.is_float_dtype(column):
            target = np.float32
        elif lowest >= 0:
            # First unsigned type whose upper bound covers the data, else the widest one.
            target = next(
                (kind for kind in unsigned_ladder if highest <= np.iinfo(kind).max),
                np.uint64,
            )
        else:
            # First signed type whose range covers both extremes, else the widest one.
            target = next(
                (
                    kind
                    for kind in signed_ladder
                    if np.iinfo(kind).min <= lowest and highest <= np.iinfo(kind).max
                ),
                np.int64,
            )

        df[name] = column.astype(target)

    return df


# Shrink the numeric columns of the merged frame to narrower dtypes.
merged_df=auto_data_type(merged_df)

In [None]:
# Check the dtypes and memory usage after downcasting.
merged_df.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 1078372 entries, 0 to 1290547
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   createdDate             1078372 non-null  datetime64[ns]
 1   rate                    1078372 non-null  uint8
 2   code                    1078372 non-null  object
 3   commentCount            1078372 non-null  object
 4   text_raw                1078372 non-null  object
 5   week                    1078372 non-null  uint8
 6   year                    1078372 non-null  uint16
 7   expeditionType_encoded  895700 non-null   uint8
dtypes: datetime64[ns](1), object(3), uint16(1), uint8(3)
memory usage: 151.6+ MB


In [None]:
!kaggle datasets download -d alioraji/persian-stop-words
!unzip persian-stop-words.zip
with open("Persian_Stop_Words.txt", encoding='utf-8') as f:
    custom_stopwords = {line.strip() for line in f if line.strip()}

Dataset URL: https://www.kaggle.com/datasets/alioraji/persian-stop-words
License(s): other
Downloading persian-stop-words.zip to /content
  0% 0.00/13.3k [00:00<?, ?B/s]
100% 13.3k/13.3k [00:00<00:00, 58.0MB/s]
Archive:  persian-stop-words.zip
  inflating: Persian_Stop_Words.txt  


In [None]:
# Shared hazm instances, created once and reused by preprocess_text for every comment.
normalizer = Normalizer()
lemmatizer = Lemmatizer()

def preprocess_text(text, stopwords):
    """Normalize, filter and lemmatize one Persian comment.

    Steps: hazm normalization, removal of every character outside the Arabic
    Unicode block (whitespace and the zero-width non-joiner are kept), removal
    of digits, whitespace collapsing, tokenization, stop-word and short-token
    filtering, and lemmatization.

    Args:
        text: The raw comment string.
        stopwords: Container of tokens to discard (membership is tested with ``in``).

    Returns:
        The surviving lemmas joined by single spaces; may be an empty string.
    """
    text = normalizer.normalize(text)
    # Keep U+200C (zero-width non-joiner) alongside Arabic-block characters. It joins
    # the parts of a single Persian word (e.g. the verb prefix in "می‌کنم"); replacing
    # it with a space splits such words into fragments that the length filter below
    # then throws away.
    text = re.sub(r'[^\u0600-\u06FF\u200c\s]', ' ', text)
    # \d is Unicode-aware, so this also removes the Persian/Arabic-Indic digits that
    # survive the character filter above.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Drop joiners left dangling at a token edge (for instance after digit removal).
    tokens = [t.strip('\u200c') for t in word_tokenize(text)]
    # Discard stop words and tokens of one or two characters.
    tokens = [t for t in tokens if t not in stopwords and len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)
# preprocess_text is plain Python, so the frame is moved to pandas before it is
# applied to each comment.
df = merged_df.to_pandas()

df['text_clean'] = (
    df['text_raw']
    .fillna('')
    .apply(preprocess_text, args=(custom_stopwords,))
)


In [None]:
# Move the cleaned frame back onto the GPU and display it.
# NOTE(review): the saved output of this cell shows flag_* columns that no cell above
# creates - the cells were likely run out of order; verify with Restart & Run All.
merged_df = cudf.from_pandas(df)
merged_df

Unnamed: 0,createdDate,rate,code,commentCount,text_raw,week,year,expeditionType_encoded,text_clean,flag_quality,flag_delivery,flag_price,flag_packaging,flag_amount,flag_order_error
0,2024-10-17 19:13:44,10,095gkq,1051,پیک مودب وساندویچ خوش طعم بود سپاس ازبودنتون,42,2024,4.0,پیک مودب وساندویچ طعم سپاس ازبودنتون,1,1,0,0,0,0
1,2024-10-14 10:49:25,2,095gkq,1051,کوچکترین پیاز، خیار، لبو و گنده ترین سیب زمینی...,42,2024,4.0,کوچکترین پیاز خیار لبو گنده سیب زمینی هندوانه ...,0,0,0,0,0,0
2,2024-10-08 11:38:58,2,095gkq,1051,هندوانه کاملا خراب بود و دور ریخته شد. بجای دو...,41,2024,4.0,هندوانه خراب ریخت#ریز کیلو اسفناچ فرستاد#فرست,0,0,0,0,0,0
3,2024-10-01 20:03:59,2,095gkq,1051,هندوانه گندیده و دور ریخته شد شلیل ها تمام له ...,40,2024,4.0,هندوانه گندیده ریخت#ریز شلیل,0,0,0,0,0,0
4,2024-08-04 11:55:19,10,095gkq,1051,همه چی عالی,31,2024,4.0,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1290543,2024-12-09 20:03:55,10,zn8ewj,33,سلام من اولین باره خرید می‌کنم هنوز نمی‌تونم ن...,50,2024,,سلام خرید تون نظری عسل نظرمو عسل پوکید#پوک است...,0,0,0,0,0,0
1290544,2024-12-09 15:35:37,10,zn8ewj,33,کیفیت خوب و مرغوب,50,2024,,کیفیت مرغوب,1,0,0,0,0,0
1290545,2024-10-23 12:56:57,10,zn8ewj,33,عالی,43,2024,,,0,0,0,0,0,0
1290546,2024-10-20 16:41:07,10,zn8ewj,33,بسیار عالی,42,2024,,,0,0,0,0,0,0
