In [1]:
import pandas as pd
import logging
import re
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Seed constant. It is not referenced in the visible cells; presumably it is
# passed as random_state to train_test_split for reproducible splits (confirm).
RANDOM_STATE = 42

In [2]:
def read_json_file(data_path: str) -> pd.DataFrame:
    """
    This function combines all ``.log`` files of a directory into one dataframe.

    Every file is parsed as line-delimited JSON (one JSON object per line).
    Files are read in sorted filename order so the row order is reproducible.
    :param data_path: directory that contains the ``.log`` files
    :return: Dataframe with the rows of all ``.log`` files, re-indexed from 0;
     an empty dataframe when the directory contains no ``.log`` file
    """
    frames = [
        pd.read_json(os.path.join(data_path, filename), lines=True)
        for filename in sorted(os.listdir(data_path))
        if filename.endswith(".log")
    ]
    if not frames:
        # pd.concat rejects an empty list, so answer the "nothing to read"
        # case explicitly with an empty frame.
        return pd.DataFrame()
    # Concatenate instead of keeping only the last file that was read.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load the event log as line-delimited JSON (one JSON object per line).
file = pd.read_json("predict_category/events.log", lines=True)

In [None]:
# Preview the first rows of the loaded events.
file.head()

In [None]:
# Total number of loaded rows.
len(file)

In [None]:
# Number of rows whose "category" value is missing.
file.category.isna().sum()

In [None]:
# Rows whose "name" column equals "search".
file.loc[(file.name == "search")]

In [None]:
# "search" rows that have no title.
file.loc[(file["name"] == "search") & (pd.isna(file["title"]))]

In [None]:
# "search" rows that do have a title.
file.loc[(file["name"] == "search") & (pd.notna(file["title"]))]

In [None]:
# Rows without a category; named test_data, presumably the rows whose
# category is to be predicted later (confirm).
test_data = file.loc[(pd.isna(file["category"]))]

In [None]:
# Display the rows without a category.
test_data

In [None]:
# Rows that carry a category value.
data = file.loc[(pd.notna(file["category"]))]

In [None]:
# Rows with a category whose "name" column equals "search".
data.loc[data.name == "search"]

In [None]:
# Display the rows with a category.
data

In [None]:
# Bar chart of the number of rows per category
# (the trailing semicolon suppresses the notebook's text output).
data["category"].value_counts().plot.bar();

In [None]:
# The same per-category counts as a pie chart.
data["category"].value_counts().plot.pie();

In [None]:
# Number of distinct values in the category column.
len(data["category"].unique())

In [None]:
# Ratio of distinct titles (stringified, case-folded, stripped) to the
# number of rows in data.
perc = len(data["title"].apply(lambda x: str(x).casefold().strip()).unique()) / len(
    data
)
print(f"The percentage of the unique title is {round(perc*100, 1)}%")

In [None]:
# Histogram of the number of whitespace-separated tokens per title.
# Values are passed through str() first, so a missing title counts as the
# single token "nan".
nwords = data["title"].apply(lambda x: len(str(x).split()))
nwords.plot.hist();

In [None]:
# Preview the first rows that have a category.
data.head()

In [None]:
def split_test_and_training_data(df: pd.DataFrame):
    """
    This function splits the rows by whether their category is present.
    :param df: dataframe with a "category" column
    :return: a tuple (test_data, data): test_data holds the rows whose
     category is missing, data holds the rows whose category is present.
     Both keep their original index labels and are independent copies.
    """
    unlabeled = df["category"].isna()
    # Copy explicitly so callers can assign columns on the results without
    # affecting df or triggering pandas' SettingWithCopyWarning.
    test_data = df.loc[unlabeled].copy()
    data = df.loc[~unlabeled].copy()
    return test_data, data


def clean_labels(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function cleans the labels for the training.

    Labels are stripped and lower-cased first, and only then are rows without
    a usable label dropped, so whitespace-only labels are removed together
    with empty and missing ones. Label values that are not strings come out
    of the pandas string methods as missing and are dropped as well.
    The input dataframe is not modified.
    :param df: training dataframe
    :return: a new dataframe, re-indexed from 0, whose label column is
     stripped and lower-cased and contains no missing or empty value.
    """
    # Work on a copy so the caller's dataframe keeps its original labels.
    df = df.copy()
    normalised = df[LABEL_COLUMN_NAME].str.strip().str.lower()
    # Turn empty labels into NaN so a single dropna removes both kinds.
    df[LABEL_COLUMN_NAME] = normalised.replace("", np.nan)
    df = df.dropna(subset=[LABEL_COLUMN_NAME]).reset_index(drop=True)
    logging.info(
        "Missing labels dropped successfully: Data has a total of %d rows.",
        len(df),
    )
    return df


def text_preprocessing(text: str) -> str:
    """
    This function normalises an English text into a string of unique tokens.

    The text is trimmed and case-folded, the listed punctuation characters
    are replaced by spaces, digit runs are deleted, whitespace is collapsed,
    and only the first occurrence of every token is kept (order preserved).
    :param text: text to be preprocessed
    :return: the preprocessed text, tokens joined by single spaces
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"[-()?@.,;_#+*'‘{}%$§!<>/]", " "),
        (r"\d+", ""),
        (r"\r\n|\r|\n", " "),
        (r"\s+", " "),
    )
    cleaned = text.strip().casefold()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    tokens = re.sub(r"[\n\t\ ]+", " ", cleaned.rstrip()).split()

    # Keep the first occurrence of each token, preserving order.
    seen = set()
    unique_tokens = []
    for token in tokens:
        if token in seen:
            continue
        seen.add(token)
        unique_tokens.append(token)
    return " ".join(unique_tokens)


def apply_text_preprocessing(
    df: pd.DataFrame,
) -> pd.DataFrame:
    """
    This function cleans the combined text column and stores the result
    in the feature column.

    Missing product names are replaced by empty strings, rows that still
    lack a product name or a menu category are dropped, and
    text_preprocessing is applied to every value of the COMB_P_M column;
    the result is written to the FEATURE_COLUMN_NAME column.
    The input dataframe is not modified.
    NOTE(review): the COMB_P_M column is only read here, never built;
    presumably it is created upstream (confirm). Its values must be strings,
    because text_preprocessing calls string methods on them.
    :param df: training dataframe
    :return: a new dataframe, re-indexed from 0, with the cleaned
     preprocessed text in the feature column
    """
    logging.info("Start combining product_name and menu_category columns... ")
    # Work on a copy so the caller's dataframe is not changed in place and
    # frames that are selections of another frame raise no
    # SettingWithCopyWarning.
    df = df.copy()
    df[TRAINING_PRODUCT_NAME] = df[TRAINING_PRODUCT_NAME].replace(np.nan, "")
    df = df.dropna(subset=[TRAINING_PRODUCT_NAME, TRAINING_MENU_CATEGORY])
    df = df.reset_index(drop=True)
    df[FEATURE_COLUMN_NAME] = df[COMB_P_M].apply(text_preprocessing)
    logging.info("Data preprocessing finished successfully.")
    return df


def apply_preprocessing_training(
    df: pd.DataFrame,
) -> pd.DataFrame:
    """
    This function runs all required steps for
    the preprocessing of the product name and menu category.

    After the text preprocessing, rows whose cleaned text is empty are
    dropped, and of the rows that share the same cleaned text only the first
    one is kept.
    :param df: training dataframe
    :return: a dataframe with preprocessed data, re-indexed from 0.
    """
    cleaned_df = apply_text_preprocessing(df)
    logging.info("Start dropping empty cleaned combined product names values... ")
    # Turn empty cleaned texts into NaN so dropna removes them.
    cleaned_df[FEATURE_COLUMN_NAME] = cleaned_df[FEATURE_COLUMN_NAME].replace(
        "", np.nan
    )
    cleaned_df = cleaned_df.dropna(subset=[FEATURE_COLUMN_NAME])
    cleaned_df = cleaned_df.drop_duplicates(subset=FEATURE_COLUMN_NAME, keep="first")
    # Re-index last: resetting before drop_duplicates would leave gaps in the
    # returned index wherever a duplicate row was removed.
    return cleaned_df.reset_index(drop=True)