In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls

drive  sample_data


In [3]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/NLP/Week 3/')

In [4]:
!ls

2_Reddit_Reviews_S25.ipynb  __pycache__		   Rohan_file1_hw2.ipynb
FeaturizerSpacy.py	    reddit_200k_train.csv  Untitled0.ipynb


# 1. Loading the dataset

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from FeaturizerSpacy import ManualFeatures
from scipy.sparse import hstack

In [7]:
df = pd.read_csv('reddit_200k_train.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0.1,Unnamed: 0,body,score.x,parent_id.x,id,created_utc.x,retrieved_on,REMOVED
0,1,I've always been taught it emerged from the ea...,2,t3_81u15i,dv551g6,1520121101,1524782256,False
1,2,"As an ECE, my first feeling as ""HEY THAT'S NOT...",2,t3_72sk35,dnl66g6,1506533157,1507150439,True
2,3,Monday: Drug companies stock dives on good new...,5,t3_8o88yr,e02sjhz,1528087570,1532170350,True
3,4,i learned that all hybrids are unfertile i won...,0,t3_6xg9t8,dmfojjp,1504290041,1506407514,False
4,5,Well i was wanting to get wasted tonight. Not...,3,t3_99wi9m,e4rtew8,1535140675,1537893540,False


In [8]:
# sum of null values
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
body,0
score.x,0
parent_id.x,0
id,0
created_utc.x,0
retrieved_on,0
REMOVED,0


In [10]:
# only necessary columns
df = df[['body','REMOVED']]
df.head()

Unnamed: 0,body,REMOVED
0,I've always been taught it emerged from the ea...,False
1,"As an ECE, my first feeling as ""HEY THAT'S NOT...",True
2,Monday: Drug companies stock dives on good new...,True
3,i learned that all hybrids are unfertile i won...,False
4,Well i was wanting to get wasted tonight. Not...,False


In [11]:
# split into the larger dataset
large_df = df.sample(frac=0.4, random_state=42)

# 2. Training the best pipeline i.e., TF-IDF + Feature Engineering

In [13]:
# use the appropriate spaCy model
spacy_model = "en_core_web_sm"

# initialize the ManualFeatures transformer
manual_featurizer = ManualFeatures(
    spacy_model=spacy_model,
    pos_features=True,
    ner_features=True,
    text_descriptive_features=True
)

In [14]:
# extracting features using the transformer
X_features_large, feature_names = manual_featurizer.transform(large_df['body'].tolist())
X_features_large = pd.DataFrame(X_features_large, columns=feature_names)

In [16]:
tfidf = TfidfVectorizer(min_df=5, max_df=0.7, max_features=5000, ngram_range=(1, 2))

# Combine with TF-IDF features
X_tfidf_large = tfidf.fit_transform(large_df['body'])
X_combined_large = hstack([X_tfidf_large, X_features_large])

In [18]:
# Train-test split
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(X_combined_large, large_df['REMOVED'], test_size=0.2, random_state=42)

In [19]:
# Train XGBoost on combined features
model = XGBClassifier(scale_pos_weight=(y_train_large == 0).sum() / (y_train_large == 1).sum(), random_state=42)
model.fit(X_train_large, y_train_large)

In [20]:
# Evaluate the metrics
y_pred_large = model.predict(X_test_large)
print("F1-Score (Final Model):", f1_score(y_test_large, y_pred_large))

F1-Score (Final Model): 0.647324306898775


# 3. Hyperparameter tuning

In [21]:
# import the necessary libraries
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

In [22]:
# Hyperparameter tuning
param_dist = {
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.5, 0.5)
}

In [23]:
search = RandomizedSearchCV(model, param_dist, n_iter=10, scoring='f1', random_state=42)
search.fit(X_train_large, y_train_large)

In [24]:
print("Best Parameters:", search.best_params_)

Best Parameters: {'learning_rate': 0.1012726728878613, 'max_depth': 8, 'subsample': 0.5035331526098588}


# 4. Final evaluation

In [25]:
# Train final model with best parameters
final_model = XGBClassifier(**search.best_params_, scale_pos_weight=(y_train_large == 0).sum() / (y_train_large == 1).sum(), random_state=42)
final_model.fit(X_train_large, y_train_large)

In [26]:
# Evaluate
y_pred_final = final_model.predict(X_test_large)
print("F1-Score (Final Model):", f1_score(y_test_large, y_pred_final))

F1-Score (Final Model): 0.6425882918234964
