# Splitter

In order to label the data, we shuffle the comments from different videos and select only 10,000 comments to be labeled. Then, since we have 5 team members, we divide the balanced dataset into 5 parts of 2,000 comments each.


In [1]:
import pandas as pd

## Concatenate the files from the different playlists

In [2]:
# We concatenate the files for each sentiment

def concat_files(file_1, file_2):
    """Read two CSV files and stack their rows into a single dataframe.

    Parameters
    ----------
    file_1, file_2 : path or file-like
        Inputs handed to ``pd.read_csv``; ``file_1`` is read first.

    Returns
    -------
    pandas.DataFrame
        Rows of ``file_1`` followed by rows of ``file_2``, renumbered with a
        fresh 0..n-1 index (``ignore_index=True``).
    """
    # Read both inputs in order, then stack them in one concat call
    frames = [pd.read_csv(source) for source in (file_1, file_2)]
    return pd.concat(frames, ignore_index=True)

# Merge the two playlist exports of each sentiment into one dataframe.
# The pairs are processed in order: positive, negative, neutral.
df_pos, df_neg, df_neu = (
    concat_files(first_export, second_export)
    for first_export, second_export in (
        ('1_POSITIVE_ENGLISH_cleaned_and_filtered_comments_helper.csv',
         '2_POSITIVE_ENGLISH_cleaned_and_filtered_comments_helper.csv'),
        ('1_NEGATIVE_ENGLISH_cleaned_and_filtered_comments_helper.csv',
         '2_NEGATIVE_ENGLISH_cleaned_and_filtered_comments_helper.csv'),
        ('1_NEUTRAL_ENGLISH_cleaned_and_filtered_comments_helper.csv',
         '2_NEUTRAL_ENGLISH_cleaned_and_filtered_comments_helper.csv'),
    )
)


## Create the balanced dataset

In [3]:
# We create a balanced dataset with 10,000 samples (3,333 per category, plus 1 extra row
# taken from the first category that still has unused rows, since 3 x 3,333 = 9,999)

def create_balanced_csv(df_pos, df_neg, df_neu, output_file, total_size):
    """Sample a (near-)balanced, shuffled set of comments and save it for labeling.

    Each category first contributes ``total_size // 3`` rows (or all of its rows
    if it has fewer). Any shortfall is then topped up from the first category,
    in the order positive -> negative -> neutral, that still has unused rows.
    If the three dataframes together hold fewer than ``total_size`` rows, every
    available row is used and a notice is printed instead of failing.

    Parameters
    ----------
    df_pos, df_neg, df_neu : pandas.DataFrame
        Per-sentiment comment frames. Each must contain a 'Comment' column.
        Rows already drawn are recognised by index label, so each frame is
        expected to have a unique index.
    output_file : str or path-like
        Destination CSV; written without the index.
    total_size : int
        Number of rows wanted in the output.

    Returns
    -------
    None
        The result is written to ``output_file`` with the columns 'Comment'
        and an empty 'Label'; the input and output shapes are printed.
    """
    # Print original size of the dataframes
    print('Input Shape')
    print('Pos: ', df_pos.shape)
    print('Neg: ', df_neg.shape)
    print('Neu: ', df_neu.shape)

    # Target number of rows per category
    size = total_size // 3

    # Initial draw: the same quota from every category, capped at what it holds
    sources = [df_pos, df_neg, df_neu]
    samples = [src.sample(n=min(size, len(src)), random_state=1) for src in sources]

    # Rows added per top-up step. Never let it be 0: with total_size < 3 the
    # quota is 0 and a zero-sized step would make the loop spin forever.
    step_cap = max(size, 1)

    # Top up until the requested size is reached or every category is exhausted
    while sum(len(sample) for sample in samples) < total_size:
        remaining = total_size - sum(len(sample) for sample in samples)
        for slot, (src, picked) in enumerate(zip(sources, samples)):
            leftover = src.loc[~src.index.isin(picked.index)]
            if len(leftover) > 0:
                # Never ask for more rows than this category can still provide
                to_add = min(remaining, step_cap, len(leftover))
                samples[slot] = pd.concat([picked, leftover.sample(n=to_add, random_state=1)])
                break
        else:
            # No category has unused rows left: keep what we have
            print('Note: only', total_size - remaining, 'rows available, fewer than the requested', total_size)
            break

    # Concatenate the samples and shuffle
    df_final = pd.concat(samples)
    df_final = df_final.sample(frac=1, random_state=1).reset_index(drop=True)

    # Blank column to label
    df_final["Label"] = pd.NA

    # Keep only the 'Comment' and 'Label' columns
    df_final = df_final[['Comment', 'Label']]

    # Check the size
    print('')
    print('Output Shape')
    print('Bal: ', df_final.shape)  # Expected (total_size, 2) when enough rows exist

    # Save to csv
    df_final.to_csv(output_file, index=False)

# Build the 10,000-row labeling set and write it to 'balanced.csv'
create_balanced_csv(df_pos, df_neg, df_neu, output_file='balanced.csv', total_size=10000)


Input Shape
Pos:  (3943, 2)
Neg:  (8468, 2)
Neu:  (8229, 2)

Output Shape
Bal:  (10000, 2)


## Divide the CSV in equal parts

In [4]:
# Since we have 5 team members we divide the balanced dataset into 5 parts

def divide_csv(input_file, num_parts):
    """Split a CSV into ``num_parts`` consecutive chunks of near-equal size.

    Rows keep their original order. When the row count is not a multiple of
    ``num_parts``, the leftover rows are spread one per chunk over the first
    chunks, so chunk sizes differ by at most one row.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to split; read with ``pd.read_csv``.
    num_parts : int
        Number of chunks to produce; must be at least 1.

    Returns
    -------
    None
        Chunk ``k`` (1-based) is written to 'part_{k}.csv' in the current
        working directory, without the index. Shapes are printed.

    Raises
    ------
    ValueError
        If ``num_parts`` is smaller than 1.
    """
    if num_parts < 1:
        raise ValueError('num_parts must be at least 1')

    # Load the data
    df = pd.read_csv(input_file)

    # Base size of each part, plus the number of parts that get one extra row
    part_size, extra = divmod(len(df), num_parts)

    print('Input Shape')
    print('Df: ', df.shape)
    print('')
    print('Output Shape')

    # Walk through the dataframe, cutting and saving one part at a time
    start = 0
    for i in range(num_parts):
        end = start + part_size + (1 if i < extra else 0)
        df_part = df.iloc[start:end]

        # Check the size
        print(f'Part_{i+1}:', df_part.shape)  # Sizes differ by at most one row

        # Save the part to its own csv
        df_part.to_csv(f'part_{i+1}.csv', index=False)
        start = end

# Split the balanced dataset into one file per team member
divide_csv(input_file='balanced.csv', num_parts=5)


Input Shape
Df:  (10000, 2)

Output Shape
Part_1: (2000, 2)
Part_2: (2000, 2)
Part_3: (2000, 2)
Part_4: (2000, 2)
Part_5: (2000, 2)
