## 2A. Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import re
import string
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from collections import Counter

tqdm.pandas()

In [2]:
df = pd.read_csv('processed data\sl_all_text_analyzed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32727 entries, 0 to 32726
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                32727 non-null  object 
 1   post_title        32727 non-null  object 
 2   type              32727 non-null  object 
 3   body              27217 non-null  object 
 4   score             32727 non-null  int64  
 5   url               32727 non-null  object 
 6   created_utc       32727 non-null  float64
 7   text              32727 non-null  object 
 8   word_count        32727 non-null  int64  
 9   char_count        32727 non-null  int64  
 10  created_datetime  32727 non-null  object 
 11  date              32727 non-null  object 
 12  year              32727 non-null  int64  
 13  month             32727 non-null  object 
 14  day               32727 non-null  int64  
 15  dow               32727 non-null  object 
 16  hour              32727 non-null  int64 

In [3]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    
    # Remove emojis
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    
    # Remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    # Convert to lowercase
    text = text.lower()
    
    return text

In [4]:
df["clean_text"] = df["text"].progress_apply(clean_text)

100%|██████████| 32727/32727 [00:01<00:00, 19658.08it/s]


In [5]:
df.drop("word_count", axis=1, inplace=True)
df.drop("char_count", axis=1, inplace=True)

df["word_count"] = df["clean_text"].apply(lambda x: len(str(x).split()))
df["char_count"] = df["clean_text"].apply(lambda x: len(str(x)))

## Remove posts below and above thresholds

In [6]:
LOWER_THRESHOLD = 20
UPPER_THRESHOLD = 5000

df_filtered = df[(df["word_count"] >= LOWER_THRESHOLD) & 
                 (df["word_count"] <= UPPER_THRESHOLD)].copy()

print("Before filter dataset size:", len(df))
print("After filter dataset size:", len(df_filtered))
print("Removed dataset size:", len(df) - len(df_filtered))


all_words = " ".join(df_filtered["clean_text"]).split()
total_words = len(all_words)
unique_words = len(set(all_words))

print("Total words in corpus:", total_words)
print("Unique words in corpus:", unique_words)

Before filter dataset size: 32727
After filter dataset size: 26187
Removed dataset size: 6540
Total words in corpus: 2941831
Unique words in corpus: 67136


## 2B. Traditional tokenization

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

tqdm.pandas()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nisal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nisal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# --- i. Traditional regex-based tokenization ---
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def regex_tokenize(text):
    # Tokenize using regex (word characters)
    tokens = nltk.word_tokenize(text)
    
    # Lowercase and remove stopwords
    tokens = [t.lower() for t in tokens if t.isalpha() and t.lower() not in stop_words]
    
    # Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
    return tokens

In [9]:
df_filtered["tokens_regex"] = df_filtered["clean_text"].progress_apply(regex_tokenize)

# Save tokenized posts
df_filtered[["tokens_regex"]].to_pickle("models/tokens_regex.pkl")

100%|██████████| 26187/26187 [00:21<00:00, 1211.64it/s]


## Sub-word Tokenization

In [10]:
# --- ii. Sub-word tokenization ---
from tokenizers import ByteLevelBPETokenizer
from tokenizers import BertWordPieceTokenizer
import sentencepiece as spm
import math

In [11]:
corpus_file = "models/clean_text_corpus.txt"
vocab_size=30000
min_frequency=2
special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

with open(corpus_file, "w", encoding="utf-8") as f:
    for text in df_filtered["clean_text"]:
        f.write(text + "\n")

In [12]:
# BPE Tokenizer
bpe_tokenizer = ByteLevelBPETokenizer()
bpe_tokenizer.train(files=corpus_file, 
                    vocab_size=vocab_size, 
                    min_frequency=min_frequency, 
                    special_tokens=special_tokens)
bpe_tokenizer.save_model("models")

['models\\vocab.json', 'models\\merges.txt']

In [13]:
# WordPiece Tokenizer (BERT style)
wp_tokenizer = BertWordPieceTokenizer(lowercase=True)
wp_tokenizer.train(files=corpus_file, 
                    vocab_size=vocab_size, 
                    min_frequency=min_frequency, 
                    special_tokens=special_tokens)
wp_tokenizer.save_model("models")

['models\\vocab.txt']

In [14]:
# SentencePiece Tokenizer
spm.SentencePieceTrainer.Train(input=corpus_file,
                               model_prefix="models/sentencepiece",
                               vocab_size=vocab_size,
                               character_coverage=1.0,
                               model_type='bpe')  # can also use 'unigram' or 'word'

## 2C. Evaluate tokenization schemes

In [15]:
tokens_regex = df_filtered["tokens_regex"].tolist()

In [16]:
def corpus_stats(token_lists):
    all_tokens = [t for tokens in token_lists for t in tokens]
    total_words = len(all_tokens)
    unique_words = len(set(all_tokens))
    return total_words, unique_words

# --- Regex-based stats ---
total_words_regex, unique_words_regex = corpus_stats(tokens_regex)
print(f"Regex-based tokenization: Total words = {total_words_regex}, Unique words = {unique_words_regex}")

Regex-based tokenization: Total words = 1526299, Unique words = 54757


In [17]:
bpe_tokenizer = ByteLevelBPETokenizer("models/vocab.json", "models/merges.txt")
wp_tokenizer = BertWordPieceTokenizer("models/vocab.txt", lowercase=True)
sp_model = spm.SentencePieceProcessor(model_file="models/sentencepiece.model")

In [18]:
# --- Tokenize corpus with sub-word tokenizers ---
def tokenize_bpe(text):
    return bpe_tokenizer.encode(text).tokens

def tokenize_wp(text):
    return wp_tokenizer.encode(text).tokens

def tokenize_sp(text):
    return sp_model.encode(text, out_type=str)

# Tokenized lists
tokens_bpe = df_filtered["clean_text"].progress_apply(tokenize_bpe).tolist()
tokens_wp = df_filtered["clean_text"].progress_apply(tokenize_wp).tolist()
tokens_sp = df_filtered["clean_text"].progress_apply(tokenize_sp).tolist()

100%|██████████| 26187/26187 [00:09<00:00, 2874.46it/s]
100%|██████████| 26187/26187 [00:09<00:00, 2905.89it/s]
100%|██████████| 26187/26187 [00:06<00:00, 4113.63it/s]


In [19]:
# Stats
total_words_bpe, unique_words_bpe = corpus_stats(tokens_bpe)
total_words_wp, unique_words_wp = corpus_stats(tokens_wp)
total_words_sp, unique_words_sp = corpus_stats(tokens_sp)

print(f"BPE: Total words = {total_words_bpe}, Unique words = {unique_words_bpe}")
print(f"WordPiece: Total words = {total_words_wp}, Unique words = {unique_words_wp}")
print(f"SentencePiece: Total words = {total_words_sp}, Unique words = {unique_words_sp}")

BPE: Total words = 3050607, Unique words = 28695
WordPiece: Total words = 3084599, Unique words = 28206
SentencePiece: Total words = 3049111, Unique words = 28771


In [20]:
def unigram_probs(token_lists):
    all_tokens = [t for tokens in token_lists for t in tokens]
    token_counts = Counter(all_tokens)
    total = sum(token_counts.values())
    probs = {tok: count/total for tok, count in token_counts.items()}
    return probs

In [21]:
def perplexity(token_lists, probs):
    N = sum(len(tokens) for tokens in token_lists)
    log_prob_sum = 0
    for tokens in token_lists:
        for t in tokens:
            p = probs.get(t, 1e-8)  # smoothing
            log_prob_sum += math.log(p)
    ppl = math.exp(-log_prob_sum / N)
    return ppl

In [22]:
# Calculate perplexity for each scheme
ppl_regex = perplexity(tokens_regex, unigram_probs(tokens_regex))
ppl_bpe = perplexity(tokens_bpe, unigram_probs(tokens_bpe))
ppl_wp = perplexity(tokens_wp, unigram_probs(tokens_wp))
ppl_sp = perplexity(tokens_sp, unigram_probs(tokens_sp))

perplexities = {
    "Regex": ppl_regex,
    "BPE": ppl_bpe,
    "WordPiece": ppl_wp,
    "SentencePiece": ppl_sp
}

for name, val in sorted(perplexities.items(), key=lambda x: x[1], reverse=True):
    print(f"{name}: {val:.2f}")

Regex: 3664.22
BPE: 1394.05
SentencePiece: 1352.82
WordPiece: 1327.60


In [23]:
df_filtered.drop(['post_title', 'body', 'score','url','text','created_datetime','date','year','month','day','dow','hour','tokens_regex'], axis=1, inplace=True)

In [24]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26187 entries, 0 to 32726
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           26187 non-null  object 
 1   type         26187 non-null  object 
 2   created_utc  26187 non-null  float64
 3   clean_text   26187 non-null  object 
 4   word_count   26187 non-null  int64  
 5   char_count   26187 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 1.4+ MB


In [25]:
df_filtered.to_csv("cleaned data\sl_all_text_cleaned.csv", index=False, encoding="utf-8")