In [4]:
import re
from pathlib import Path
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()



def parse_fasttext(path):
    r"""Parse fastText-like or TSV lines into a DataFrame with `label` and `text`.

    Every non-blank line is interpreted by the first rule that matches:

    * ``__label__LABEL text...`` -- the first whitespace-delimited token is
      the label, with only its leading ``__label__`` prefix removed;
    * ``label\ttext`` -- the line is split once, on the first tab;
    * ``LABEL text...`` -- the first whitespace-delimited token is the label.

    For the two whitespace-delimited forms the remaining tokens are re-joined
    with single spaces, so runs of whitespace inside the text collapse.
    Blank lines are skipped, and bytes that are not valid UTF-8 are dropped
    (the file is opened with ``errors='ignore'``).

    Args:
        path: Location of the input file (anything ``pathlib.Path`` accepts).

    Returns:
        A ``pandas.DataFrame`` with string columns ``label`` and ``text``,
        one row per non-blank input line.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    prefix = '__label__'
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"File not found: {path}")

    labels = []
    texts = []
    with p.open('r', encoding='utf-8', errors='ignore') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(prefix):
                parts = line.split()
                # Strip only the leading marker; str.replace would also
                # delete any later occurrence inside the label itself.
                label = parts[0][len(prefix):]
                text = ' '.join(parts[1:])
            elif '\t' in line:
                label, text = line.split('\t', 1)
            else:
                # `line` is non-empty after strip(), so parts[0] exists.
                parts = line.split()
                label = parts[0]
                text = ' '.join(parts[1:])
            labels.append(label)
            texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})



def clean_text(s: str) -> str:
    """Normalize a raw string for downstream tokenization.

    The text is lower-cased; then URLs starting with ``http``, angle-bracket
    tags, and every character other than ``a-z``, ``0-9``, whitespace and the
    apostrophe are each replaced by a space. Finally whitespace runs are
    collapsed to one space and the ends are trimmed.

    Non-string input yields an empty string.
    """
    if isinstance(s, str):
        cleaned = s.lower()
        # Order matters: URLs and tags must go before the punctuation filter
        # removes the characters that make them recognizable.
        for pattern in (r'http\S+', r'<[^>]+>', r"[^a-z0-9\s']"):
            cleaned = re.sub(pattern, ' ', cleaned)
        return re.sub(r"\s+", ' ', cleaned).strip()
    return ''



def preprocess_df(df, text_col='text', label_col='label', do_lemmatize=False):
    """Return a cleaned, tokenized copy of `df`.

    The copy gains three columns: `text_clean` (the `text_col` values with
    missing entries replaced by '' and passed through `clean_text`), `length`
    (character count of `text_clean`) and `tokens` (`text_clean` split on
    whitespace). When `do_lemmatize` is true a `lemmas` column is added as
    well, built with spaCy's `en_core_web_sm` pipeline; if that step raises,
    the error is printed and `lemmas` falls back to the plain tokens.

    `label_col` is accepted but not used by this function. The input frame
    is not modified.
    """
    result = df.copy()

    cleaned = result[text_col].fillna('').astype(str).map(clean_text)
    result['text_clean'] = cleaned
    result['length'] = cleaned.str.len()
    result['tokens'] = cleaned.str.split()

    if not do_lemmatize:
        return result

    try:
        import spacy
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        result['lemmas'] = result['tokens'].map(
            lambda toks: [tok.lemma_ for tok in nlp(' '.join(toks))]
        )
    except Exception as e:
        # Best-effort: keep the pipeline usable without spaCy or its model.
        print('spaCy lemmatization unavailable:', e)
        result['lemmas'] = result['tokens']

    return result



def save_cleaned(df, path='cleaned_data.csv'):
    """Write `df` to `path` as CSV, omitting the index, and print the destination."""
    df.to_csv(path, index=False)
    confirmation = f'Saved cleaned data to {path}'
    print(confirmation)

def save_splits(df, label_col='label', test_size=0.2, random_state=42,
                out_dir='splits'):
    """Split `df` into train/validation parts and write each to CSV.

    The split is stratified on `label_col` (missing labels are treated as the
    empty-string class) whenever that is possible; otherwise a plain shuffled
    split is used.

    Args:
        df: DataFrame to split; it must contain `label_col`.
        label_col: Name of the column holding the class labels.
        test_size: Forwarded to scikit-learn's `train_test_split`.
        random_state: Forwarded to `train_test_split`.
        out_dir: Directory that receives `train.csv` and `validation.csv`.
            It is created, including missing parent directories, if needed.

    Returns:
        A `(train_path, val_path)` tuple of `pathlib.Path` objects.
    """
    from sklearn.model_selection import train_test_split

    labels = df[label_col].fillna('')
    # Stratifying needs more than one class, and scikit-learn rejects a
    # stratified split when any class has fewer than two rows; fall back to
    # an unstratified split in either case instead of raising.
    class_counts = labels.value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    stratify = labels if can_stratify else None

    train_df, val_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=stratify
    )

    d = Path(out_dir)
    # parents=True so a nested out_dir such as 'data/splits' also works.
    d.mkdir(parents=True, exist_ok=True)
    train_path = d / 'train.csv'
    val_path = d / 'validation.csv'
    train_df.to_csv(train_path, index=False)
    val_df.to_csv(val_path, index=False)
    print(f'Saved train ({len(train_df)}) -> {train_path} and validation ({len(val_df)}) -> {val_path}')
    return train_path, val_path


# Load the raw labelled lines from the fastText-style file into a DataFrame.
df = parse_fasttext('test.ft.txt')
# Quick sanity checks: row/column count and how many rows each label has.
print(df.shape)
print(df['label'].value_counts())
# Clean and tokenize the text; lemmatization is skipped here.
df_clean = preprocess_df(df, do_lemmatize=False)
# NOTE(review): `display` is not defined in this cell -- presumably the
# IPython/Jupyter built-in; confirm before running this as a plain script.
display(df_clean.head())
# Persist the full cleaned frame, then the train/validation split files.
save_cleaned(df_clean, 'cleaned_data.csv')
# Left as a bare final expression, so the returned paths are the cell's value.
save_splits(df_clean, out_dir='splits')


(400000, 2)
label
2    200000
1    200000
Name: count, dtype: int64


Unnamed: 0,label,text,text_clean,length,tokens
0,2,Great CD: My lovely Pat has one of the GREAT v...,great cd my lovely pat has one of the great vo...,513,"[great, cd, my, lovely, pat, has, one, of, the..."
1,2,One of the best game music soundtracks - for a...,one of the best game music soundtracks for a g...,798,"[one, of, the, best, game, music, soundtracks,..."
2,1,Batteries died within a year ...: I bought thi...,batteries died within a year i bought this cha...,323,"[batteries, died, within, a, year, i, bought, ..."
3,2,"works fine, but Maha Energy is better: Check o...",works fine but maha energy is better check out...,221,"[works, fine, but, maha, energy, is, better, c..."
4,2,Great for the non-audiophile: Reviewed quite a...,great for the non audiophile reviewed quite a ...,403,"[great, for, the, non, audiophile, reviewed, q..."


Saved cleaned data to cleaned_data.csv
Saved train (320000) -> splits\train.csv and validation (80000) -> splits\validation.csv


(WindowsPath('splits/train.csv'), WindowsPath('splits/validation.csv'))