In [2]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

In [13]:
df = pd.read_csv('../data/train.csv')

In [14]:
print(df.columns) # 'severe_toxicity', 'obscene', 'identity_attack', 'insult'
# 'identity_annotator_count', 'toxicity_annotator_count', 'id', 'split', 'created_date', 'publication_id', 'parent_id', 'article_id',

Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')


In [3]:
# EDA

print(df['toxicity'].head())

print("\n")

print(df.columns)

print("\n")

print(np.min(df['toxicity']), np.max(df['toxicity']))


for idx, text in df["comment_text"].head(5).items():
    print(idx, text)

0    0.000000
1    0.000000
2    0.714286
3    0.000000
4    0.000000
Name: toxicity, dtype: float64


Index(['id', 'comment_text', 'split', 'created_date', 'publication_id',
       'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'toxicity', 'severe_toxicity', 'obscene', 'sexual_explicit',
       'identity_attack', 'insult', 'threat', 'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability', 'identity_annotator_count',
       'toxicity_annotator_count'],
      dtype='object')


0.0 1.0
0 That was the reason Walker fire everyone now what ?.
1 So my original statement still s

## SVC

In [4]:
# making label of toxicity feature into binary from continuous (0.0 to 1.0)

df["comment_text"] = df["comment_text"].fillna("").astype(str)
df["label"] = (df["toxicity"] >= 0.5).astype(int)

In [5]:
# chosen columns
cols = ['rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'sexual_explicit',
       'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability']

df[cols] = df[cols].apply(pd.to_numeric, errors="coerce")
df[cols] = df[cols].fillna(0)

In [6]:
train_idx, val_idx = train_test_split(
    df.index,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

In [7]:
y_train = df.loc[train_idx, "label"]
y_val   = df.loc[val_idx, "label"]

### Just text

In [8]:
# train test split (already done in data_prep.py and data_cleaning.py in ./scripts)
X_train_text = df.loc[train_idx, "comment_text"]
X_val_text   = df.loc[val_idx, "comment_text"]

In [9]:
# just LinearSVC with text in X_train or X_test
clf_text = Pipeline([
    ("tfidf", TfidfVectorizer( # strings to numeric
        stop_words="english",
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9
    )),
    ("svm", LinearSVC(random_state=42)) # regular linearSVC algorithm
])

clf_text.fit(X_train_text, y_train)
y_pred_text = clf_text.predict(X_val_text)

print(accuracy_score(y_val, y_pred_text))
print(classification_report(y_val, y_pred_text))


0.9469028484979198
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    294421
           1       0.76      0.49      0.60     25502

    accuracy                           0.95    319923
   macro avg       0.86      0.74      0.78    319923
weighted avg       0.94      0.95      0.94    319923



### Other columns and text

In [10]:
X_train_both = df.loc[train_idx, ["comment_text"] + cols]
X_val_both   = df.loc[val_idx, ["comment_text"] + cols]

In [53]:
# LinearSVC with text and other columns
preprocess = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2)
        ), "comment_text"),
        ("num", StandardScaler(), cols),
    ]
)

clf_both = Pipeline([
    ("preprocess", preprocess),
    ("svm", LinearSVC(random_state=42))
])

clf_both.fit(X_train_both, y_train)
y_pred_both = clf_both.predict(X_val_both)

print(accuracy_score(y_val, y_pred_both))
print(classification_report(y_val, y_pred_both))



0.949587869581118
              precision    recall  f1-score   support

           0       0.96      0.99      0.97    294421
           1       0.76      0.54      0.63     25502

    accuracy                           0.95    319923
   macro avg       0.86      0.76      0.80    319923
weighted avg       0.94      0.95      0.95    319923



### Just other columns

In [11]:
X_train_num = df.loc[train_idx, cols]
X_val_num   = df.loc[val_idx, cols]

In [12]:
# LinearSVC withOUT text and just other columns
clf_num = Pipeline([
    ("ss", StandardScaler()),
    ("svm", LinearSVC(random_state=42))
])

clf_num.fit(X_train_num, y_train)
y_pred_num = clf_num.predict(X_val_num)

print(accuracy_score(y_val, y_pred_num))
print(classification_report(y_val, y_pred_num))

0.922753287509807
              precision    recall  f1-score   support

           0       0.92      1.00      0.96    294421
           1       0.73      0.05      0.09     25502

    accuracy                           0.92    319923
   macro avg       0.83      0.52      0.53    319923
weighted avg       0.91      0.92      0.89    319923



## Optimizing just Both Comment and Columns SVC:

In [13]:
# proprocess and pipeline build
preprocess = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            stop_words="english"
        ), "comment_text"),
        ("num", StandardScaler(), cols),
    ]
)

base_svm_both = Pipeline([
    ("preprocess", preprocess),
    ("svm", LinearSVC(random_state=42, max_iter=10000))
])

In [14]:
# Hyperparameter grid for combined model
param_grid_both = {
    "preprocess__text__ngram_range": [(1, 1), (1, 2)],
    "preprocess__text__min_df": [2, 5, 10],
    "svm__C": [0.1, 1, 5]
}

# actual optimization through GridSearchCV
grid_both = GridSearchCV(
    estimator=base_svm_both,
    param_grid=param_grid_both,
    cv=3, 
    scoring="f1",
    n_jobs=-1,
    verbose=2
)

# X_train_both and y_train from  unified split
grid_both.fit(X_train_both, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits




0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'preprocess__text__min_df': [2, 5, ...], 'preprocess__text__ngram_range': [(1, ...), (1, ...)], 'svm__C': [0.1, 1, ...]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('text', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [16]:
# print the parameters and scores for best parameters
print("Best params (combined SVM):", grid_both.best_params_)
print("Best CV F1 (combined SVM):", grid_both.best_score_)

best_svm_both = grid_both.best_estimator_

# Evaluate on the validation set
y_val_pred_both = best_svm_both.predict(X_val_both)

print("Validation accuracy (combined, tuned):", accuracy_score(y_val, y_val_pred_both))
print(classification_report(y_val, y_val_pred_both))

Best params (combined SVM): {'preprocess__text__min_df': 10, 'preprocess__text__ngram_range': (1, 1), 'svm__C': 1}
Best CV F1 (combined SVM): 0.5827025890088314
Validation accuracy (combined, tuned): 0.9443350463474377
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     38311
           1       0.74      0.47      0.57      3331

    accuracy                           0.94     41642
   macro avg       0.85      0.73      0.77     41642
weighted avg       0.94      0.94      0.94     41642



## Best model for SVC: 
### (in case this is the best prediction model of of the 4)

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [18]:
# full train set:
train_df = pd.read_csv('../data/train.csv')

# full test set: 
test_df = pd.read_csv('../data/test.csv')

In [None]:
# same cols:
cols = ['rating', 'funny', 'wow', 'sad', 'likes',
       'disagree', 'sexual_explicit',
       'male', 'female', 'transgender',
       'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian', 'bisexual',
       'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu',
       'buddhist', 'atheist', 'other_religion', 'black', 'white', 'asian',
       'latino', 'other_race_or_ethnicity', 'physical_disability',
       'intellectual_or_learning_disability', 'psychiatric_or_mental_illness',
       'other_disability']


# train data:
train_df["comment_text"] = train_df["comment_text"].fillna("").astype(str)
train_df["label"] = (train_df["toxicity"] >= 0.5).astype(int)
train_df[cols] = train_df[cols].apply(pd.to_numeric, errors="coerce").fillna(0)

X_train_full = train_df[["comment_text"] + cols]
y_train_full = train_df["label"]


# test data:
test_df["comment_text"] = test_df["comment_text"].fillna("").astype(str)
test_df["label"] = (test_df["toxicity"] >= 0.5).astype(int)
test_df[cols] = test_df[cols].apply(pd.to_numeric, errors="coerce").fillna(0)

X_test_final = test_df[["comment_text"] + cols]
y_test_final = test_df["label"]

In [None]:
# final model
final_preprocess = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 1),
            min_df=10
        ), "comment_text"),
        ("num", StandardScaler(), cols),
    ]
)

final_svm = Pipeline([
    ("preprocess", final_preprocess),
    ("svm", LinearSVC(C=1, random_state=42, max_iter=10000))
])

In [None]:
# fit final model on ALL training data:
final_svm.fit(X_train_full, y_train_full)

In [None]:
# predict & scores:
y_test_pred = final_svm.predict(X_test_final)

print("Final test accuracy:", accuracy_score(y_test_final, y_test_pred))
print(classification_report(y_test_final, y_test_pred))