In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
train_filename = '/content/drive/MyDrive/DS301_Final_Project/data/train_raw.tsv'
test_filename = '/content/drive/MyDrive/DS301_Final_Project/data/test_raw.tsv'
print(f"Loading from: {train_filename} and {test_filename}")

Loading from: /content/drive/MyDrive/DS301_Final_Project/data/train_raw.tsv and /content/drive/MyDrive/DS301_Final_Project/data/test_raw.tsv


In [None]:
import pandas as pd

In [12]:
# Adjusted column names (includes numeric index at start of file)
columns = [
    "index", "id", "label", "statement", "subject", "speaker", "speaker_job", "state_info",
    "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts", "context", "justification"
]

# Load and process
train_df = pd.read_csv(train_filename, sep='\t', names=columns)
test_df = pd.read_csv(test_filename, sep='\t', names=columns)


# Map to binary
label_mapping = {
    'pants-fire': 'False', 'FALSE': 'False', 'barely-true': 'False',
    'half-true': 'True', 'mostly-true': 'True', 'TRUE': 'True'
}

train_df['binary_label'] = train_df['label'].map(label_mapping)
test_df['binary_label'] = test_df['label'].map(label_mapping)
train_binary_df = train_df[['id', 'statement', 'binary_label', 'subject', 'speaker']].dropna()
test_binary_df = test_df[['id', 'statement', 'binary_label', 'subject', 'speaker']].dropna()

print(f"Train dataset: {len(train_binary_df)} samples")
print(train_binary_df['binary_label'].value_counts())

print(f"Test dataset: {len(test_binary_df)} samples")
print(test_binary_df['binary_label'].value_counts())

# Save to Google Drive instead of downloading
train_binary_df.to_csv('/content/drive/MyDrive/DS301_Final_Project/data/train_binary.csv', index=False)
test_binary_df.to_csv('/content/drive/MyDrive/DS301_Final_Project/data/test_binary.csv', index=False)

print("Saved train_binary_df and test_binary_df to Google Drive.")

Train dataset: 6569 samples
binary_label
True     4076
False    2493
Name: count, dtype: int64
Test dataset: 810 samples
binary_label
True     506
False    304
Name: count, dtype: int64
Saved train_binary_df and test_binary_df to Google Drive.
