# 1. Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# 2. Project Variables

In [None]:
# Folder the input files are read from.
INPUT_DATA_DIR = '../data/processed'
# Folder the per-task train/test CSV files are written to.
OUTPUT_DATA_DIR = '../train-test-data'
# Both input files live directly inside INPUT_DATA_DIR.
FEATURES_FILE, LABELS_FILE = (
    os.path.join(INPUT_DATA_DIR, file_name)
    for file_name in ('Xs.csv', 'Final_Tags.xlsx')
)

# 3. Split Training and Testing Files

In [None]:
# Load both tables, using the first column of each as the row index.
X_df = pd.read_csv(FEATURES_FILE, index_col=0)
labels_df = pd.read_excel(
    LABELS_FILE,
    sheet_name='FINAL TAGS',
    index_col=0,
)

# Column names: every column of the label sheet is treated as one task.
features = X_df.columns
tasks = labels_df.columns

# Join on the row index (default inner join: only rows present in both
# tables survive).
data_df = X_df.merge(labels_df, left_index=True, right_index=True)
# Make sure the destination exists; DataFrame.to_csv does not create missing
# directories and would otherwise fail on the first task.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

# For every task column: build a table with the features plus that single
# label, split it 50/50 (stratified on the label, fixed seed) and write the
# two halves to '<task>_train.csv' / '<task>_test.csv' in OUTPUT_DATA_DIR.
for task in tasks:
    print(f'Processing task {task}...')

    # Drop the label columns of all other tasks, then discard every row that
    # still has a missing value (in the features or in this task's label).
    other_tasks = [t for t in tasks if t != task]
    task_df = data_df.drop(columns=other_tasks).dropna(axis=0)

    X_train, X_test, y_train, y_test = train_test_split(
        task_df.drop(columns=task),
        task_df[[task]],
        test_size=0.5,
        random_state=42,
        stratify=task_df[task],
    )

    # X_* and y_* come from the same split, so their indexes are identical
    # and in the same order. A column-wise concat re-attaches the label
    # without a join; an index merge would multiply rows whenever the index
    # contains repeated values.
    train_df = pd.concat([X_train, y_train], axis=1)
    test_df = pd.concat([X_test, y_test], axis=1)

    train_df.to_csv(os.path.join(OUTPUT_DATA_DIR, f'{task}_train.csv'))
    test_df.to_csv(os.path.join(OUTPUT_DATA_DIR, f'{task}_test.csv'))

    print(f'Task {task} processed.')
    print()