In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ipynb.fs.full.constants import RANDOM_STATE

In [1]:
def load_all_data(drop_duplicates=True, data_dir="data"):
    """Load the five YouTube comment CSV files and combine them into one DataFrame.

    Every file is read with its ``DATE`` column parsed as dates and gets an
    extra ``INTERPRET`` column naming the video the comments belong to.

    Parameters
    ----------
    drop_duplicates : bool, default True
        If True, rows that are identical in every column (``INTERPRET``
        included) are removed, keeping the first occurrence.
    data_dir : str or path-like, default "data"
        Directory that contains the ``Youtube0N-*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        The concatenated data with every object-dtype column converted to
        the pandas ``"string"`` dtype. The index is renumbered 0..n-1 right
        after concatenation, i.e. *before* duplicates are dropped, so the
        labels of removed rows are missing from the result.
    """
    # INTERPRET label -> CSV file name; insertion order fixes the row order
    # of the combined frame.
    csv_files = {
        "psy": "Youtube01-Psy.csv",
        "katy": "Youtube02-KatyPerry.csv",
        "lmfao": "Youtube03-LMFAO.csv",
        "eminem": "Youtube04-Eminem.csv",
        "shakira": "Youtube05-Shakira.csv",
    }

    # keep info about which video the comment appeared in
    tagged = [
        pd.read_csv(f"{data_dir}/{file_name}", parse_dates=["DATE"]).assign(
            INTERPRET=interpret
        )
        for interpret, file_name in csv_files.items()
    ]

    # join all datasets
    joined = pd.concat(tagged).reset_index(drop=True)

    # common preprocessing
    if drop_duplicates:
        joined = joined.drop_duplicates()

    # convert object types to strings
    object_cols = joined.select_dtypes("object").columns
    return joined.astype({col: "string" for col in object_cols})

In [2]:
def load_data():
    """Return the development part of the combined comment data.

    The full dataset from ``load_all_data()`` is split with
    ``test_size=0.2`` and ``random_state=RANDOM_STATE``; the first (training)
    part of that split is returned and the held-out part is discarded here.

    NOTE: ``load_final_test_data`` repeats this split with the same arguments
    to obtain the complementary part, so the two must be kept in sync.
    """
    dev_part, _held_out = train_test_split(
        load_all_data(), random_state=RANDOM_STATE, test_size=0.2
    )
    return dev_part

In [58]:
def load_final_test_data():
    """Return the held-out final test part of the combined comment data.

    Performs the split of ``load_all_data()`` with ``test_size=0.2`` and
    ``random_state=RANDOM_STATE`` and returns the second (test) part.

    NOTE: ``load_data`` performs the same split with the same arguments and
    returns the other part, so the two must be kept in sync.
    """
    split_parts = train_test_split(
        load_all_data(), random_state=RANDOM_STATE, test_size=0.2
    )
    # train_test_split yields [train, test] for a single input; the test
    # part is the last element.
    return split_parts[-1]

In [3]:
def load_train_test_all_cols_data(test_size=0.2):
    """Split the development data into train/test sets, keeping every feature column.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    list
        The result of ``train_test_split`` for (features, target), i.e.
        ``[X_train, X_test, y_train, y_test]``, where the features are all
        columns of ``load_data()`` except ``CLASS`` and the target is the
        ``CLASS`` column.
    """
    dev_df = load_data()
    target = dev_df["CLASS"]
    features = dev_df.drop(columns=["CLASS"])

    return train_test_split(
        features, target, random_state=RANDOM_STATE, test_size=test_size
    )

In [60]:
def load_train_test_data(test_size=0.2):
    """Split the development data into train/test sets using only the comment text.

    Parameters
    ----------
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``. Exposed for consistency
        with ``load_train_test_all_cols_data``; the default reproduces the
        previous hard-coded split.

    Returns
    -------
    list
        The result of ``train_test_split`` for (features, target), i.e.
        ``[X_train, X_test, y_train, y_test]``, where the feature is the
        ``CONTENT`` column of ``load_data()`` and the target is its ``CLASS``
        column.
    """
    df = load_data()
    df_X, df_y = df.CONTENT, df.CLASS

    return train_test_split(
        df_X, df_y, test_size=test_size, random_state=RANDOM_STATE
    )

In [61]:
# Sanity check: the combined, de-duplicated dataset has the expected size.
assert load_all_data().shape == (1953, 6), (
    "load_all_data() should return 1953 rows x 6 columns"
)

In [62]:
# Sanity check: the development part of the split has the expected size.
assert load_data().shape == (1562, 6), (
    "load_data() should return 1562 rows x 6 columns"
)

In [63]:
# Sanity check: the held-out final test part of the split has the expected size.
assert load_final_test_data().shape == (391, 6), (
    "load_final_test_data() should return 391 rows x 6 columns"
)