In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d moeinkpr/snappfood-comments

!unzip snappfood-comments.zip

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import cudf
from cuml.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
# Load both CSV files into cuDF DataFrames.
comments_table = cudf.read_csv("comments.csv")
vendors_table = cudf.read_csv("vendors.csv")

# Drop columns that are not used below; errors="ignore" tolerates files that lack some of them.
comments_table = comments_table.drop(columns=["commentId","date","sender","rating","customerId","feeling","status","foods","replies"], errors="ignore")
vendors_table = vendors_table.drop(columns=["id","highlight","description","address","rating","title"], errors="ignore")

# Attach vendor attributes to every comment through the shared vendor "code" key.
merged_df = comments_table.merge(vendors_table, on="code", how="left")

# Keep restaurant vendors only; the filter column is constant afterwards, so drop it.
merged_df = merged_df[merged_df["vendorType"] == "RESTAURANT"]
merged_df = merged_df.drop(columns=["vendorType"])

# Missing text becomes an empty string so the concatenation below never yields nulls.
merged_df["deliveryComment"] = merged_df["deliveryComment"].fillna("")
merged_df["commentText"] = merged_df["commentText"].fillna("")

merged_df["text_raw"] = merged_df["commentText"] + " " + merged_df["deliveryComment"]
merged_df = merged_df.drop(columns=["commentText", "deliveryComment"])

merged_df["createdDate"] = cudf.to_datetime(merged_df["createdDate"])
# Take BOTH week and year from the ISO calendar. Pairing the ISO week with the
# calendar year mislabels dates around New Year (e.g. 30 Dec can fall in ISO
# week 1 of the following year), which would merge unrelated weeks when the
# data is later grouped by (year, week).
iso_calendar = merged_df["createdDate"].dt.isocalendar()
merged_df["week"] = iso_calendar["week"].astype("int32")
merged_df["year"] = iso_calendar["year"].astype("int32")

# Encode the expedition type as integer labels and discard the original column.
le = LabelEncoder()
merged_df["expeditionType_encoded"] = le.fit_transform(merged_df["expeditionType"].astype("str"))
merged_df = merged_df.drop(columns=["expeditionType"])

# Rows without a rating cannot contribute to the rating aggregates.
merged_df = merged_df.dropna(subset=["rate"])

merged_df["rate"] = merged_df["rate"].astype("uint8")


def auto_data_type(df):
    """Downcast every numeric column of ``df`` to a narrower dtype.

    Float columns become ``float32``. Integer columns whose minimum is
    non-negative get the narrowest unsigned type that holds their maximum;
    all other integer columns get the narrowest signed type that holds both
    their minimum and maximum.

    The frame is modified in place and the same object is returned.
    """
    unsigned_ladder = (np.uint8, np.uint16, np.uint32)
    signed_ladder = (np.int8, np.int16, np.int32)

    for name in df.select_dtypes(include=[np.number]).columns:
        column = df[name]

        if pd.api.types.is_float_dtype(column):
            df[name] = column.astype(np.float32)
            continue

        lowest = column.min()
        highest = column.max()

        if lowest >= 0:
            # First unsigned width whose upper bound covers the maximum.
            target = next(
                (kind for kind in unsigned_ladder if highest <= np.iinfo(kind).max),
                np.uint64,
            )
        else:
            # First signed width whose range covers both extremes.
            target = next(
                (
                    kind
                    for kind in signed_ladder
                    if np.iinfo(kind).min <= lowest and highest <= np.iinfo(kind).max
                ),
                np.int64,
            )

        df[name] = column.astype(target)

    return df


# Downcast the numeric columns, then print the resulting column dtypes and memory usage.
merged_df = auto_data_type(merged_df)

merged_df.info()

<class 'cudf.core.dataframe.DataFrame'>
Index: 1078372 entries, 0 to 1290548
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   createdDate             1078372 non-null  datetime64[ns]
 1   rate                    1078372 non-null  uint8
 2   code                    1078372 non-null  object
 3   commentCount            1078372 non-null  object
 4   text_raw                1078372 non-null  object
 5   week                    1078372 non-null  uint8
 6   year                    1078372 non-null  uint16
 7   expeditionType_encoded  1078084 non-null  uint8
dtypes: datetime64[ns](1), object(3), uint16(1), uint8(3)
memory usage: 151.6+ MB


In [None]:
import cudf
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from tqdm import tqdm


def build_weekly_data(merged_df):
    """Aggregate ratings per vendor and week and derive forecasting features.

    Parameters
    ----------
    merged_df : DataFrame
        Table with ``code``, ``year``, ``week`` and ``rate`` columns. Either a
        pandas DataFrame or any frame exposing ``to_pandas()``.

    Returns
    -------
    pandas.DataFrame
        One row per (code, year, week) holding the weekly mean rate, its lags
        1-4, first/second differences, rolling means over 2/3/4 rows and
        ``y_next`` (the following row's mean rate for the same vendor). Rows
        with any missing value are dropped.
    """
    # Only these four columns are used; selecting them before the conversion
    # avoids copying the columns that are not needed.
    merged_df = merged_df[["code", "year", "week", "rate"]]
    # Convert only when the input actually offers the conversion, so a plain
    # pandas frame is accepted as well.
    if hasattr(merged_df, "to_pandas"):
        merged_df = merged_df.to_pandas()

    g = merged_df.groupby(["code", "year", "week"])
    weekly = g.agg(mean_rate=("rate", "mean")).reset_index()

    # Chronological order within each vendor is required by shift/rolling below.
    weekly = weekly.sort_values(["code", "year", "week"])

    for lag in (1, 2, 3, 4):
        weekly[f"rate_lag{lag}"] = weekly.groupby("code")["mean_rate"].shift(lag)

    weekly["delta_rate"] = weekly["mean_rate"] - weekly["rate_lag1"]
    weekly["delta2_rate"] = weekly["rate_lag1"] - weekly["rate_lag2"]
    weekly["acceleration_rate"] = weekly["delta_rate"] - weekly["delta2_rate"]

    for window in (2, 3, 4):
        weekly[f"rolling_mean_{window}w"] = weekly.groupby("code")["mean_rate"].transform(
            lambda x, w=window: x.rolling(w, min_periods=1).mean()
        )

    # Target: the same vendor's mean rate in its next available row.
    weekly["y_next"] = weekly.groupby("code")["mean_rate"].shift(-1)

    weekly = weekly.dropna().reset_index(drop=True)

    return weekly


In [None]:
def train_forecast_model(weekly, test_size=0.2):
    """Fit an XGBoost regressor that predicts ``y_next`` and report hold-out error.

    Parameters
    ----------
    weekly : DataFrame
        Table with ``code``, ``year``, ``week``, ``y_next`` and feature columns.
        Either a pandas DataFrame or any frame exposing ``to_pandas()``.
    test_size : float, default 0.2
        Fraction of the distinct (year, week) periods, taken from the most
        recent end, that is held out for evaluation. When the split would
        leave either side empty (for example ``test_size=0``), the model is
        fitted on all rows and no metrics are printed.

    Returns
    -------
    tuple
        ``(model, features)``: the fitted regressor and the list of feature
        column names. When a hold-out set is used, the model is fitted on the
        training portion only, so the printed metrics describe the returned
        model.
    """
    # Convert only when the input actually offers the conversion.
    if hasattr(weekly, "to_pandas"):
        weekly = weekly.to_pandas()

    features = [c for c in weekly.columns if c not in ["code", "year", "week", "y_next"]]
    X = weekly[features]
    y = weekly["y_next"]

    # Sortable period key. Widen first: year/week may arrive as narrow integer
    # dtypes that would overflow when multiplied by 100.
    period = weekly["year"].astype("int64") * 100 + weekly["week"].astype("int64")
    periods = sorted(period.unique())
    n_holdout = int(round(len(periods) * test_size))

    # Hold out the most recent periods so evaluation never sees rows that are
    # older than the training data (scoring on the training rows themselves
    # would make the metrics meaningless).
    if 0 < n_holdout < len(periods):
        is_test = period >= periods[-n_holdout]
        X_train, y_train = X.loc[~is_test], y.loc[~is_test]
    else:
        is_test = None
        X_train, y_train = X, y

    model = XGBRegressor(
        n_estimators=2000,
        max_depth=7,
        learning_rate=0.01,
        subsample=0.8,
        device="cuda",
        random_state=42,
    )

    model.fit(X_train, y_train)

    if is_test is not None:
        X_test, y_test = X.loc[is_test], y.loc[is_test]
        preds = model.predict(X_test)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        print(f"Hold-out MAE: {mae:.4f} | R2: {r2:.4f} ({len(y_test)} rows)")

    return model, features


