In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Load data from folder
def get_data(folder_path):
    """Load every file in ``folder_path`` as CSV and stack the complete rows.

    Each regular file in the folder is read with ``pd.read_csv`` using the
    ISO-8859-1 encoding, rows containing any missing value are dropped, and
    the per-file frames are concatenated in file-name order.

    Side effect: the process working directory is changed to ``folder_path``.
    Other helpers in this notebook build their output paths from
    ``os.getcwd()``, so that behaviour is deliberately kept.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV files. A relative path is resolved
        against the working directory in effect when the function is called.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows (each file's own row labels are kept), or an
        empty DataFrame when the folder holds no files.
    """
    # Resolve a relative path *before* chdir; afterwards it would be looked up
    # relative to the new working directory and no longer exist.
    if not os.path.isabs(folder_path):
        folder_path = os.path.abspath(folder_path)
    os.chdir(folder_path)

    frames = []
    # Sorted so the row order does not depend on the arbitrary order in which
    # the OS lists directory entries.
    for file in sorted(os.listdir(folder_path)):
        file_path = f"{folder_path}/{file}"
        if not os.path.isfile(file_path):
            # Sub-directories (e.g. .ipynb_checkpoints) cannot be parsed as CSV.
            continue
        print(f'reading {file_path}')
        data = pd.read_csv(file_path, encoding="ISO-8859-1")
        frames.append(data.dropna())

    if not frames:
        return pd.DataFrame()
    # Concatenate once: calling pd.concat inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(frames)

# The raw CSV files live in the "dirty" sub-folder of the current directory.
current_path = os.getcwd()
folder_path = f"{current_path}/dirty"

# NOTE: get_data() calls os.chdir(folder_path), so os.getcwd() returns the
# "dirty" folder from here on; re-running this cell without restarting the
# kernel would therefore look for "dirty/dirty".
df = get_data(folder_path)
df.shape

In [None]:
# Visualise data: bar chart of the number of rows in each category
df.groupby('category').size().plot(kind='bar')

In [None]:
# Preprocessing: helpers that delete stale output files so they can be regenerated
def preprocess_file(file_path):
    """Delete ``file_path`` if it exists.

    Prints what happened and returns ``True`` when the path existed and was
    removed, ``False`` when there was nothing at that path.
    """
    # Guard clause: nothing to clean up.
    if not os.path.exists(file_path):
        print("Given file doesn't exist")
        return False

    print("File exists")
    os.remove(file_path)
    print(f'removed {file_path}')
    return True

def preprocess_folder(folder_path):
    """Remove every entry directly inside ``folder_path``.

    Returns ``True`` when ``folder_path`` is a directory (whether or not it
    held anything) and ``False`` when it is not a directory. Entries are
    deleted with ``os.remove``.
    """
    # Guard clause: the path is missing or is not a directory.
    if not os.path.isdir(folder_path):
        print("Given directory doesn't exist")
        return False

    entries = os.listdir(folder_path)
    if entries:
        print("Directory is not empty")
        for entry in entries:
            os.remove(f"{folder_path}/{entry}")
    else:
        print("Directory is empty")
    return True

In [None]:
# Create answer and test files
def create_evaluation_files(df_val):
    """Write the evaluation rows to ``ans.csv`` and ``test.csv``.

    Both files go to ``<current working directory>/../evaluation`` with
    ISO-8859-1 encoding; existing copies are deleted first via
    ``preprocess_file``. Rows are renumbered 0..n-1 under an index named
    ``index``.

    * ``ans.csv``  - the renumbered rows, otherwise unchanged.
    * ``test.csv`` - the same rows with every ``'REAL'`` value in the
      ``category`` column replaced by ``'FAKE'``.

    The caller's DataFrame is not modified.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Rows to export; must contain a ``category`` column.
    """
    current_path = os.getcwd()
    ans_file_name = "/../evaluation/ans.csv"
    test_file_name = "/../evaluation/test.csv"
    ans_file_path = current_path + ans_file_name
    test_file_path = current_path + test_file_name
    preprocess_file(ans_file_path)
    preprocess_file(test_file_path)

    # assign() returns a new frame, so the caller's DataFrame does not gain an
    # "index" column and no SettingWithCopyWarning is raised when df_val is a
    # slice of a larger frame.
    answers = df_val.assign(index=np.arange(len(df_val))).set_index('index')
    answers.to_csv(ans_file_path, encoding="ISO-8859-1")
    print(f'created {ans_file_path}')

    relabelled = answers.assign(
        category=answers['category'].replace(['REAL'], ['FAKE'])
    )
    relabelled.to_csv(test_file_path, encoding="ISO-8859-1")
    print(f'created {test_file_path}')

In [None]:
# Organise remaining data into files of 5000 rows each
def create_data_files(df_full, size=5000):
    """Split ``df_full`` into CSV files of at most ``size`` rows each.

    Files are written to ``<current working directory>/../data`` as
    ``data_1.csv``, ``data_2.csv``, ... with ISO-8859-1 encoding, after the
    folder's existing contents have been removed via ``preprocess_folder``.
    Rows are renumbered 0..n-1 under an index named ``index`` before being
    split. The caller's DataFrame is not modified.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Rows to export.
    size : int, optional
        Maximum number of rows per file (default 5000).

    Raises
    ------
    ValueError
        If ``size`` is smaller than 1. Checked before anything is deleted.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    current_path = os.getcwd()
    file_name = "/../data"
    folder_path = current_path + file_name
    preprocess_folder(folder_path)

    # assign() returns a new frame, so the caller's DataFrame does not gain an
    # "index" column and no SettingWithCopyWarning is raised on a slice.
    numbered = df_full.assign(index=np.arange(len(df_full))).set_index('index')

    # Positional slices of `size` rows; file numbering starts at 1.
    for file_number, start in enumerate(range(0, len(numbered), size), 1):
        chunk = numbered.iloc[start:start + size]
        filename = "data_{}.csv".format(file_number)
        file_path = os.path.join(folder_path, filename)
        chunk.to_csv(file_path, encoding="ISO-8859-1")
        print(f'{chunk.shape} {file_path}')

In [None]:
# Split data into training and evaluation set with the proportion of 90:10
# Seeds NumPy's global RNG; the shuffle below is pinned separately through
# random_state, so it does not depend on this call.
np.random.seed(112)

# Shuffle all rows reproducibly, then cut at the 90% mark.
shuffled = df.sample(frac=1, random_state=42)
split_at = int(.9*len(df))

# Positional slicing yields the same two pieces as np.split(shuffled, [split_at])
# without routing the DataFrame through NumPy, which relies on the deprecated
# DataFrame.swapaxes. The copies make each split an independent frame.
df_full = shuffled.iloc[:split_at].copy()
df_val = shuffled.iloc[split_at:].copy()
print(f'full {df_full.shape}, eval {df_val.shape}')

create_evaluation_files(df_val)
create_data_files(df_full)