In [1]:
import pandas as pd
import numpy as np
import random

In [43]:
def balance_positive_negative_length(df_positive, df_negative, larger):
    """Down-sample the larger class so its text lengths mirror the smaller class.

    For every row of the smaller class (in row order), the not-yet-selected row
    of the larger class whose ``text_length`` is closest is picked greedily.
    Ties go to the row that comes first in the larger frame, and every row is
    picked at most once.

    Parameters
    ----------
    df_positive, df_negative : pandas.DataFrame
        Frames of the two classes. Both need a numeric ``text_length`` column
        without missing values. Neither frame is modified, and their index
        labels do not matter because rows are addressed by position.
    larger : str
        ``'positive'`` if ``df_positive`` is the class to sample from; any
        other value selects ``df_negative``.

    Returns
    -------
    pandas.DataFrame
        One row of the larger frame per row of the smaller frame, in the order
        of the smaller frame's rows and with the larger frame's index labels.

    Raises
    ------
    ValueError
        If the frame designated as larger has fewer rows than the other one,
        or if a ``text_length`` column contains missing values.
    """
    if larger == 'positive':
        df_larger, df_smaller = df_positive, df_negative
    else:
        df_larger, df_smaller = df_negative, df_positive

    if len(df_smaller) > len(df_larger):
        raise ValueError(
            f"the frame selected by larger={larger!r} has {len(df_larger)} rows, "
            f"fewer than the {len(df_smaller)} rows it has to be matched against"
        )

    # Work on private float copies so matched candidates can be masked out
    # without touching the caller's frames.
    target_lengths = df_smaller['text_length'].to_numpy(dtype=float, copy=True)
    candidate_lengths = df_larger['text_length'].to_numpy(dtype=float, copy=True)

    if np.isnan(target_lengths).any() or np.isnan(candidate_lengths).any():
        raise ValueError("'text_length' must not contain missing values")

    sampled_positions = []
    for length in target_lengths:
        # argmin returns the first minimum, i.e. the earliest remaining row on ties.
        closest_pos = int(np.abs(candidate_lengths - length).argmin())
        sampled_positions.append(closest_pos)
        # An infinite length can never be the closest again, so the row is
        # effectively removed without copying or re-indexing anything.
        candidate_lengths[closest_pos] = np.inf

    # Positional lookup keeps this independent of the frames' index labels.
    return df_larger.iloc[sampled_positions]

In [47]:
# Root directory (relative path) of the original humor datasets; the cells
# below append one sub-directory per dataset to it.
path = '../Data/new_humor_datasets/original/'

### Balance Yelp Reviews (drop negatives)

In [None]:
# The Yelp reviews come as two CSV files, one per class, in their own sub-directory.
yelp_path = f'{path}yelp_reviews/'
df_yelp_funny = pd.read_csv(f'{yelp_path}all_funny_reviews_over_5.csv')
df_yelp_unfunny = pd.read_csv(f'{yelp_path}unfunny_reviews_all.csv')

In [24]:
# Keep funny reviews with a funny score of at least 10 ...
has_min_funny_score = df_yelp_funny['funny'].ge(10)
df_yelp_funny = df_yelp_funny.loc[has_min_funny_score]
# ... and, among those, only texts of at most 1024 characters.
funny_fits_limit = df_yelp_funny['text'].map(len).le(1024)
df_yelp_funny = df_yelp_funny.loc[funny_fits_limit]

In [25]:
# Apply the same 1024-character cap to the unfunny reviews.
unfunny_fits_limit = df_yelp_unfunny['text'].map(len).le(1024)
df_yelp_unfunny = df_yelp_unfunny.loc[unfunny_fits_limit]

In [26]:
# Renumber the rows of both filtered frames from 0 so that row labels
# coincide with row positions again.
df_yelp_funny, df_yelp_unfunny = (
    frame.reset_index(drop=True) for frame in (df_yelp_funny, df_yelp_unfunny)
)

In [27]:
# Store the character count of every review in a `text_length` column,
# which the length balancing below relies on.
for yelp_frame in (df_yelp_funny, df_yelp_unfunny):
    yelp_frame['text_length'] = yelp_frame['text'].map(len)

In [28]:
# Length statistics of the funny reviews, i.e. the distribution the unfunny
# sample is matched against further down.
df_yelp_funny['text_length'].describe()

count    9708.000000
mean      623.351051
std       252.115496
min         1.000000
25%       424.000000
50%       649.000000
75%       840.000000
max      1024.000000
Name: text_length, dtype: float64

In [29]:
# Length statistics of all unfunny reviews before balancing, for comparison
# with the funny reviews.
df_yelp_unfunny['text_length'].describe()

count    230647.000000
mean        389.647873
std         233.568398
min           1.000000
25%         201.000000
50%         334.000000
75%         535.000000
max        1024.000000
Name: text_length, dtype: float64

In [44]:
# For every funny review pick one unfunny review of the closest length, then
# shuffle the picks with a fixed seed and renumber them from 0.
balanced_df_yelp_unfunny = (
    balance_positive_negative_length(df_yelp_funny, df_yelp_unfunny, larger='negative')
    .sample(frac=1, random_state=0, ignore_index=True)
)

In [45]:
# Sanity check: row count and length statistics of the sampled unfunny
# reviews should closely match those of the funny reviews.
balanced_df_yelp_unfunny['text_length'].describe()

count    9708.000000
mean      623.351154
std       252.115242
min         1.000000
25%       424.000000
50%       649.000000
75%       840.000000
max      1024.000000
Name: text_length, dtype: float64

In [41]:
# Stack the two classes: all funny reviews first, then the length-matched
# unfunny ones.
df_yelp = pd.concat([df_yelp_funny, balanced_df_yelp_unfunny], ignore_index=True)
df_yelp['id'] = range(len(df_yelp))
# Label by source frame (1 = funny, 0 = unfunny) instead of re-deriving the
# class from the `funny` score: nothing in this notebook checks the scores of
# the unfunny file, and a sampled unfunny review with funny > 0 would
# otherwise end up labelled as funny. Thanks to ignore_index=True the first
# len(df_yelp_funny) rows are exactly the funny reviews.
df_yelp['label'] = (df_yelp.index < len(df_yelp_funny)).astype('int64')
# Shuffle with a fixed seed so the classes are interleaved reproducibly.
df_yelp = df_yelp.sample(frac=1, random_state=0, ignore_index=True)
df_yelp.to_csv(yelp_path + 'data.csv', index=False)

### Balance Reddit Dad Jokes (drop positives)

In [49]:
# The dad jokes and their unfunny counterparts are stored as two CSV files
# in the dataset's own sub-directory.
dadjokes_path = f'{path}reddit_dadjokes/'
df_dadjokes_funny = pd.read_csv(f'{dadjokes_path}reddit_dadjokes.csv')
df_dadjokes_unfunny = pd.read_csv(f'{dadjokes_path}reddit_dadjokes_not_jokes.csv')

In [50]:
# Keep only the jokes that reached a score of at least 20.
has_min_score = df_dadjokes_funny['score'].ge(20)
df_dadjokes_funny = df_dadjokes_funny.loc[has_min_score]

In [51]:
# Renumber the rows of both frames from 0 so that row labels coincide with
# row positions.
df_dadjokes_funny, df_dadjokes_unfunny = (
    frame.reset_index(drop=True) for frame in (df_dadjokes_funny, df_dadjokes_unfunny)
)

In [52]:
# Character count of each text; the funny frame keeps its text in `joke`,
# the unfunny frame in `edited_joke`.
for jokes_frame, text_column in (
    (df_dadjokes_funny, 'joke'),
    (df_dadjokes_unfunny, 'edited_joke'),
):
    jokes_frame['text_length'] = jokes_frame[text_column].map(len)

In [53]:
# Length statistics of the jokes with score >= 20; this is the larger class
# that gets sampled down below.
df_dadjokes_funny['text_length'].describe()

count    23068.000000
mean       136.991677
std        260.883307
min         12.000000
25%         65.000000
50%         90.000000
75%        142.000000
max      24538.000000
Name: text_length, dtype: float64

In [54]:
# Length statistics of the unfunny texts, i.e. the distribution the funny
# sample is matched against further down.
df_dadjokes_unfunny['text_length'].describe()

count    11024.000000
mean        85.919176
std         25.980298
min          5.000000
25%         69.000000
50%         82.000000
75%         99.000000
max        405.000000
Name: text_length, dtype: float64

In [55]:
# For every unfunny text pick one funny joke of the closest length, then
# shuffle the picks with a fixed seed and renumber them from 0.
balanced_df_dadjokes_funny = (
    balance_positive_negative_length(df_dadjokes_funny, df_dadjokes_unfunny, larger='positive')
    .sample(frac=1, random_state=0, ignore_index=True)
)

In [58]:
# Sanity check: row count and length statistics of the sampled funny jokes
# should closely match those of the unfunny texts.
balanced_df_dadjokes_funny['text_length'].describe()

count    11024.000000
mean        85.919358
std         25.978353
min         12.000000
25%         69.000000
50%         82.000000
75%         99.000000
max        405.000000
Name: text_length, dtype: float64

In [60]:
# Discard the columns of the unfunny frame that are not carried into the
# final dataset. The result is rebound instead of mutating in place, and
# errors='ignore' keeps the cell re-runnable: an in-place drop raises KeyError
# on a second execution because the columns are already gone.
df_dadjokes_unfunny = df_dadjokes_unfunny.drop(
    columns=['joke', 'id', 'num_tokens'], errors='ignore'
)

In [61]:
# Give both classes the same schema: the text column is called `text` in
# both frames, and both keep the same columns in the same order.
output_columns = ['author', 'url', 'date', 'text', 'text_length', 'score']
df_dadjokes_unfunny = df_dadjokes_unfunny.rename(columns={'edited_joke': 'text'})[output_columns]
balanced_df_dadjokes_funny = balanced_df_dadjokes_funny.rename(columns={'joke': 'text'})[output_columns]

In [62]:
# Stack the two classes: the length-matched funny jokes first, then all
# unfunny texts.
df_dadjokes = pd.concat([balanced_df_dadjokes_funny, df_dadjokes_unfunny], ignore_index=True)
df_dadjokes['id'] = range(len(df_dadjokes))
# Label by source frame (1 = funny, 0 = unfunny) instead of thresholding
# `score`: the unfunny frame carries a `score` column of its own, and any
# unfunny row whose score is >= 20 would otherwise be labelled as funny.
# Thanks to ignore_index=True the first len(balanced_df_dadjokes_funny) rows
# are exactly the funny jokes (all of which passed the score >= 20 filter).
df_dadjokes['label'] = (df_dadjokes.index < len(balanced_df_dadjokes_funny)).astype('int64')
# Shuffle with a fixed seed so the classes are interleaved reproducibly.
df_dadjokes = df_dadjokes.sample(frac=1, random_state=0, ignore_index=True)
df_dadjokes.to_csv(dadjokes_path + 'data.csv', index=False)