In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# download the stop-word corpus and build the English stop-word lookup set
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# download WordNet and create the lemmatizer shared by the cleaning helpers
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/affy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/affy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# cleaning functions
def clean_html(text):
    """Parse ``text`` with the built-in "html.parser" and return only its text content."""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def clean_text(text):
    """Strip line breaks, URLs, digits and ASCII punctuation from ``text``.

    The substitutions run in a fixed order:
      1. every newline / carriage return becomes a single space,
      2. ``http(s)://...`` and ``www....`` tokens are deleted,
      3. runs of digits are deleted,
      4. every character in ``string.punctuation`` is deleted (not replaced
         by a space, so ``"a-b"`` becomes ``"ab"``).

    Returns the resulting string; surrounding/duplicate whitespace is kept.
    """
    substitutions = (
        (r'\n|\r', ' '),
        (r'https?://\S+|www\.\S+', ''),
        (r'\d+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    drop_punctuation = str.maketrans('', '', string.punctuation)
    return text.translate(drop_punctuation)

def remove_stopwords(text):
    """Return ``text`` with stop words removed, matching case-insensitively.

    ``text`` is split on whitespace and the surviving words are re-joined with
    single spaces. Each word is lowercased for the membership test only, so
    capitalised stop words such as "The" or "And" are dropped as well; the
    words that are kept retain their original casing.

    Relies on the module-level ``stop_words`` set.
    """
    return ' '.join(
        word for word in text.split() if word.lower() not in stop_words
    )

def clean_encoded_chars(text):
    """Delete every character outside the 7-bit ASCII range (0x00-0x7F) from ``text``."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def lemmatize_text(text):
    """Lemmatize each whitespace-separated word of ``text`` and re-join with single spaces.

    Uses the module-level ``lemmatizer`` with its default arguments.
    """
    words = text.split()
    return ' '.join(map(lemmatizer.lemmatize, words))

def full_clean_pipeline(text):
    """Run every cleaning step on one raw document and return the normalised string.

    Steps, in order:
      1. strip HTML markup,
      2. strip line breaks, URLs, digits and punctuation,
      3. drop non-ASCII characters,
      4. lowercase,
      5. remove stop words,
      6. lemmatize.

    Non-ASCII stripping and lowercasing deliberately happen *before* stop-word
    removal and lemmatization: ``remove_stopwords`` compares words against
    ``stop_words`` (NLTK's English list, which is lowercase), and the WordNet
    lemmatizer is expected to only recognise lowercase forms
    (NOTE(review): confirm against the installed NLTK version). Lowercasing at
    the very end would leave tokens such as "The" or "Cars" unfiltered and
    unlemmatized.
    """
    text = clean_html(text)
    text = clean_text(text)
    text = clean_encoded_chars(text)
    # Normalise case before the dictionary-based steps below.
    text = text.lower()
    text = remove_stopwords(text)
    text = lemmatize_text(text)
    return text


In [4]:
# load data
df = pd.read_csv("dataset/data.csv")

# create combined column; a missing title or text is treated as an empty
# string, otherwise the concatenation would yield a float NaN for that row
# and the string-based cleaning functions could not process it
df['combined'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

# clean the combined column
df['clean_combined'] = df['combined'].apply(full_clean_pipeline)

# save label (separately)
df_label = df['label']

  return BeautifulSoup(text, "html.parser").get_text()


In [5]:
# apply TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed in a later cell.
X_tfidf = tfidf.fit_transform(df['clean_combined'])
X_tfidf = X_tfidf.toarray()

# convert to DataFrame, one column per learned term
X_tfidf_df = pd.DataFrame(data=X_tfidf, columns=tfidf.get_feature_names_out())

In [6]:
# one-hot encode subject: one integer indicator column per distinct value,
# each named with the "subject" prefix
subject_hot = pd.get_dummies(
    data=df['subject'],
    dtype=int,
    prefix='subject',
)

In [7]:
# concatenate features column-wise: TF-IDF terms first, then the subject
# indicators (subject_hot's index is reset so it matches the default
# 0..n-1 index of X_tfidf_df)
X_features = pd.concat(
    objs=[X_tfidf_df, subject_hot.reset_index(drop=True)],
    axis='columns',
)

In [8]:
# train/test split: hold out 20% of the rows, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    df_label,
    random_state=42,
    test_size=0.2,
)