# Process datasets for CS 236 Final Project

**Shreyas Lakhtakia** | `shreyasl@stanford.edu`

In [25]:
!pip install torch datasets --quiet

ERROR: Operation cancelled by user

In [26]:
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [27]:
# Shared random seed: passed to np.random.seed and to both train_test_split calls below.
SEED = 236

In [28]:
# Upper bound (exclusive) on the character length of a text field kept for modeling.
# Derivation noted by the author: ~3000 * 3.33 (presumably ~3000 tokens at ~3.33
# characters per token -- TODO confirm), chosen to stay well below Llama's maximum
# context length of 4096.
MAX_CHAR_LENGTH = 10000

In [29]:
# Seed NumPy's global random number generator with the notebook-wide SEED.
np.random.seed(SEED)

### Part (1): PubMedQA

In this part, we:
1. Procure the PubMedQA dataset
2. Format the data
3. Split it into 60/20/20 samples for: training, evaluation, testing and save results

### Get the dataset

In [30]:
# Load the 'pqa_labeled' configuration, i.e. the human-labeled rows of the dataset.
pubmed_qa = load_dataset(path='pubmed_qa', name='pqa_labeled')

KeyboardInterrupt: ignored

In [None]:
# Turn the 'train' split into a DataFrame; the notebook treats it as the full dataset.
pubmed_qa_df = pd.DataFrame(data=pubmed_qa['train'])
pubmed_qa_df.head(n=5)

### Preprocessing and cleaning

1. Drop rows with missing questions or answers (there are none)
2. Drop rows whose answer is "maybe"
3. [optional] Drop questions/answers that are too long and don't fit the context length

#### No empty rows

In [None]:
# Show any rows whose question is missing.
pubmed_qa_df.loc[pubmed_qa_df['question'].isna()]

In [None]:
# Show any rows whose final decision (the answer label) is missing.
pubmed_qa_df.loc[pubmed_qa_df['final_decision'].isna()]

#### Map data columns to the format expected

In [None]:
# Rename the question/answer columns to the instruction/output names used downstream.
pubmed_qa_df = pubmed_qa_df.rename(
    {'question': 'instruction', 'final_decision': 'output'},
    axis='columns',
)
pubmed_qa_df.sample(n=5)

#### Drop the *maybe* responses

In [None]:
# Non-null counts of every column, grouped by answer label.
pubmed_qa_df.groupby(by='output').count()

In [None]:
# Keep only the rows whose answer label is something other than 'maybe'.
pubmed_qa_df = pubmed_qa_df.loc[pubmed_qa_df['output'].ne('maybe')]
pubmed_qa_df.groupby(by='output').count()

### Visualize the data

In [None]:
# Work on an independent copy: a plain assignment would only alias pubmed_qa_df,
# so the helper length columns added below would be written into it as well, and
# because pubmed_qa_df is itself a filtered selection the column assignment would
# raise pandas' SettingWithCopyWarning.
viz_med_quad_df = pubmed_qa_df.copy()

# Character length of every instruction and output (vectorized string length).
viz_med_quad_df['num_characters_instruction'] = viz_med_quad_df['instruction'].str.len()
viz_med_quad_df['num_characters_output'] = viz_med_quad_df['output'].str.len()

# Show the distribution of the two length columns, one histogram each.
viz_med_quad_df.hist(column=['num_characters_instruction'])
viz_med_quad_df.hist(column=['num_characters_output'])

# Mean character length per column.
average_chars_instruction = viz_med_quad_df['num_characters_instruction'].mean()
average_chars_output = viz_med_quad_df['num_characters_output'].mean()

# Token counts are approximated here as (characters / 3).
print(f'Average number of tokens in the instruction column: {(average_chars_instruction / 3):.0f}')
print(f'Average number of tokens in the output column: {(average_chars_output / 3):.0f}', end="\n\n")

#### Drop examples whose instruction or output is `MAX_CHAR_LENGTH` (10,000) characters or longer

In [None]:
# Frame dimensions before the length filter.
print(viz_med_quad_df.shape)

# Keep a row only when both its instruction and its output are strictly shorter
# than MAX_CHAR_LENGTH characters.
viz_med_quad_df = viz_med_quad_df[
    viz_med_quad_df['num_characters_instruction'].lt(MAX_CHAR_LENGTH)
    & viz_med_quad_df['num_characters_output'].lt(MAX_CHAR_LENGTH)
]

# Frame dimensions after the length filter.
print(viz_med_quad_df.shape)

# Re-plot the length distributions on the filtered data.
viz_med_quad_df.hist(column=['num_characters_instruction'])
viz_med_quad_df.hist(column=['num_characters_output'])

### Train, test, validation split

In [None]:
# Alias (not a copy) of the length-filtered frame; this is the data that is split
# into train/validation/test and written to CSV in the following cells.
modeling_med_quad_df = viz_med_quad_df

In [None]:
# First cut: 60% of the rows go to training, the remaining 40% are held back.
med_quad_train, temp_set = train_test_split(
    modeling_med_quad_df,
    test_size=0.4,
    random_state=SEED,
)

# Second cut: the held-back rows are divided evenly into validation and test.
med_quad_valid, med_quad_test = train_test_split(
    temp_set,
    test_size=0.5,
    random_state=SEED,
)

# Report how many rows ended up in each split.
print("Training set size:", med_quad_train.shape[0])
print("Validation set size:", med_quad_valid.shape[0])
print("Test set size:", med_quad_test.shape[0])

In [None]:
# Write the full cleaned frame and each of its three splits to CSV, omitting the index column.
modeling_med_quad_df.to_csv(path_or_buf='cleaned_pubmed_qa_all.csv', index=False)
med_quad_train.to_csv(path_or_buf='cleaned_pubmed_qa_train.csv', index=False)
med_quad_valid.to_csv(path_or_buf='cleaned_pubmed_qa_valid.csv', index=False)
med_quad_test.to_csv(path_or_buf='cleaned_pubmed_qa_test.csv', index=False)