If you are using Google Colab, first run the cells marked `# Colab required`; otherwise skip them.

In [None]:
# Colab required
# Mount Google Drive at /content/drive so files stored there are reachable from the runtime.
# The import is guarded: when the google.colab package cannot be imported (e.g. a local
# Jupyter kernel), the mount is skipped instead of aborting "Restart & Run All".
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available - skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

In [None]:
# Colab required
# Change the working directory to the Early_Baseline_PatentMatch_Paragraph_Classification_Mateusz repo folder,
# so that the relative ./data/... paths used by the following cells resolve.
# NOTE(review): `path_to_repo` looks like a placeholder - replace it with the actual location of the repo.
%cd path_to_repo

In [None]:
import json

import pandas as pd

# NOTE(review): this disables pandas' chained-assignment (SettingWithCopy) warning for the
# whole notebook, which can hide accidental writes to temporary slices.
pd.options.mode.chained_assignment = None  # default='warn'

# The only columns this notebook works with.
RELEVANT_COLUMNS = ['text', 'text_b', 'label']

# Load the training and testing datasets from parquet files.
# Parquet is a columnar storage format, so passing `columns=` reads just the requested
# columns from disk instead of loading every column and discarding the rest afterwards.
train_df = pd.read_parquet('./data/train.parquet', columns=RELEVANT_COLUMNS)
test_df = pd.read_parquet('./data/test.parquet', columns=RELEVANT_COLUMNS)

In [None]:
def _keep_valid_rows(df):
    """Return the rows of `df` whose values have the expected types.

    A row is kept only when 'text' and 'text_b' are both `str` instances and
    'label' is 0 or 1.

    Args:
        df: DataFrame with 'text', 'text_b' and 'label' columns.

    Returns:
        A new DataFrame holding the valid rows. It is an explicit copy, so adding
        columns to it later is an ordinary write rather than a write to a slice
        of `df`.
    """
    has_valid_types = (
        df['text'].apply(lambda value: isinstance(value, str))
        & df['text_b'].apply(lambda value: isinstance(value, str))
        & df['label'].isin([0, 1])
    )
    return df[has_valid_types].copy()


# Apply the same validity filter to both splits; the unfiltered frames stay available
# as train_df / test_df so the number of dropped rows can still be reported.
train_df_clean = _keep_valid_rows(train_df)
test_df_clean = _keep_valid_rows(test_df)

In [None]:
# Report how many rows the validity filter dropped from each split
# (row count before filtering minus row count after).
train_removed = train_df.shape[0] - train_df_clean.shape[0]
test_removed = test_df.shape[0] - test_df_clean.shape[0]
print(f'From train data removed {train_removed} invalid type records')
print(f'From test data removed {test_removed} invalid type records')

In [None]:
def _with_pair_key(frame, source):
    """Return a copy of `frame` tagged with its origin and an order-independent pair key.

    Args:
        frame: DataFrame with 'text' and 'text_b' columns.
        source: Value written to the 'source' column ('train' or 'test' in this notebook).

    Returns:
        A new DataFrame with two extra columns: 'source', and 'sorted_pair' holding
        tuple(sorted((text, text_b))) so that (a, b) and (b, a) share one key.
        The input frame is not modified.
    """
    # Building the keys with zip avoids a row-wise DataFrame.apply(axis=1), which is slow on
    # large frames and yields a DataFrame rather than a Series when the frame has no rows.
    pair_keys = [tuple(sorted(pair)) for pair in zip(frame['text'], frame['text_b'])]
    return frame.assign(source=source, sorted_pair=pair_keys)


# Tag each split with its origin and its pair key once, then reuse the result for both
# the cross-split statistics and the per-split de-duplication below.
train_keyed = _with_pair_key(train_df_clean, 'train')
test_keyed = _with_pair_key(test_df_clean, 'test')

# Combine the filtered training and testing datasets into one DataFrame.
combined_df = pd.concat([train_keyed, test_keyed])

# Rows whose pair key occurs more than once anywhere in the combined data.
# Plain boolean arrays are used as masks because the concatenated index may repeat labels.
in_duplicate_group = combined_df.duplicated(subset='sorted_pair', keep=False).to_numpy()
duplicate_groups = combined_df[in_duplicate_group]

# Among those, rows whose pair carries more than one distinct label.
# NOTE(review): these rows are only counted; the de-duplication below keeps whichever
# occurrence comes first, so the label that survives depends on row order.
labels_per_pair = duplicate_groups.groupby('sorted_pair')['label'].transform('nunique')
contradictory_labels = duplicate_groups[(labels_per_pair > 1).to_numpy()]
print(f'Found {len(contradictory_labels)} duplicated records with contradictory labels')

# Pairs that appear in both the training and the testing data.
# NOTE(review): these pairs are only counted, not removed - duplicates are dropped within
# each split separately, so such pairs remain in both saved datasets.
sources_per_pair = combined_df.groupby('sorted_pair')['source'].unique()
duplicates_across_sets = sources_per_pair[sources_per_pair.apply(lambda sources: len(sources) > 1)]
print(f'Found {len(duplicates_across_sets)} duplicate records between the training and test sets')

# Remove duplicates within the training and testing datasets separately,
# keeping the first occurrence of every pair.
train_df_clean = train_keyed.drop_duplicates(subset='sorted_pair', keep='first')
print('Removed duplicates from the training dataset.')

test_df_clean = test_keyed.drop_duplicates(subset='sorted_pair', keep='first')
print('Removed duplicates from the testing dataset.')

# Turn each cleaned split into a list of plain (text, text_b, label) tuples,
# dropping the index and any helper columns.
train_dataset, test_dataset = (
    list(frame.loc[:, ['text', 'text_b', 'label']].itertuples(index=False, name=None))
    for frame in (train_df_clean, test_df_clean)
)

train_size = len(train_dataset)
test_size = len(test_dataset)
print(f'Final len of train dataset: {train_size}')
print(f'Final len of test dataset: {test_size}')

# Write each dataset to its own JSON file (train first, then test).
for json_path, records in (
    ('./data/train_dataset.json', train_dataset),
    ('./data/test_dataset.json', test_dataset),
):
    with open(json_path, 'w') as file:
        json.dump(records, file)