# Natural Language Processing


## 0 Setup

In [54]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import xgboost as xgb

from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [55]:
import re
import unicodedata

In [56]:
seeds = 42

In [57]:
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 2000)

## 1 Data Preprocessing


In [58]:
data = pd.read_csv(r"C:\Users\black\Documents\Ironhack\Week_7\project\training_data.csv", header = None, sep = '\t')
data.columns = ["label", "text"]

In [59]:
data.shape

(34152, 2)

In [60]:
data.iloc[11]

label                                                                                         0
text     paul ryan just told us he doesn‚t care about struggling families living in blue states
Name: 11, dtype: object

In [61]:
CENSORED_PATTERN = re.compile(
    r'(?<!\w)'          # word start -> prev char is not a word char, e.g. space
    r'(?=\w*[A-Za-z])'  # word must contain at least one letter
    r'\w*[*]+'          # any letters with one or more *-chars
    r'\w*'              # optional letters after *-char
    r'(?!\w)',          # word end -> next char is not a word char, e.g. space
    flags=re.UNICODE
)

NUMBER_PATTERN = re.compile(r'\b\d+([\.,]\d+)?\b')

def preprocess_text(text):
    if pd.isnull(text):
        return ""
    
    # lower case
    text = text.lower()

    # normalie punctuation
    text = unicodedata.normalize("NFKC", text)

    # replace censored words
    text = CENSORED_PATTERN.sub('censored_slur', text)

    # remove extra whiste space
    text = re.sub(r'\s+', ' ', text).strip()

    # norm num 
    text = NUMBER_PATTERN.sub("num", text)

    return text

In [62]:
# apply preprocessing
data["text"] = data["text"].apply(preprocess_text)

In [None]:
# further preprocessing possibilities:

# removing stop words
# removing numbers
# removing punctuation
# lemmatization

In [64]:
# split data

train_text, test_text, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.3, random_state=42)

In [None]:
y_train.value_counts()

# almost evenly distributed dataset -> next to accuracy, F1 score check

label
0    12277
1    11629
Name: count, dtype: int64

## 2 Evaluation Function

In [66]:
# list to store all models scores
results = []   

# evaluation function 
def evaluate_model(name, vectorizer, classifier, X_train_raw, y_train, X_test_raw, y_test):
    # Vectorize
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)
    
    # Train
    classifier.fit(X_train, y_train)
    
    # Predict
    preds = classifier.predict(X_test)
    
    # Evaluate
    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    
    # Append results
    results.append({
        "model": name,
        "accuracy": acc,
        "f1_score": f1
    })

def show_results():
    return pd.DataFrame(results).sort_values("f1_score", ascending=False)

## 3 Vectorization 

### 3.0 Custom Tokenizer

In [67]:
CENSORED_PATTERN = re.compile(r'(?<!\w)(?=\w*[A-Za-z])\w*[*]+[A-Za-z]*\b')
NUMBER_PATTERN = re.compile(r'\b\d+([.,]\d+)?\b')
WORD_RE = re.compile(r"\b\w[\w'-]*\b")

def custom_tokenizer(text):
    text = text.lower()
    text = unicodedata.normalize("NFKC", text)
    text = CENSORED_PATTERN.sub("censored_slur", text)
    text = NUMBER_PATTERN.sub("num", text)
    tokens = WORD_RE.findall(text)
    cleaned_tokens = [tok.strip("-'") for tok in tokens if tok]
    return cleaned_tokens

### 3.1 TF-ID 

In [68]:

tfidf = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    lowercase=False,
    token_pattern=None,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.9
)

### 3.2 Bag of Words 

In [69]:
bow = CountVectorizer(
    tokenizer=custom_tokenizer,
    lowercase=False,
    token_pattern=None,
    ngram_range=(1,2), 
    max_df=0.9, 
    min_df=0.1
) 


### 3.3 TF-IDF Char


In [70]:
tfidf_char = TfidfVectorizer(
    analyzer="char",
    lowercase=False, 
    sublinear_tf=True,
    ngram_range=(3,6),
    min_df=2,
    max_df=0.9    
)

## 4 Model 

In [71]:
# RandomForestClassifier
rf_baseline = RandomForestClassifier(
    n_estimators=200,
    max_depth=50,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

In [72]:
# Logistic Regression
logreg_baseline = LogisticRegression(
    max_iter=2000,       
    solver='lbfgs',      
    class_weight='balanced',
    random_state=42
)


In [73]:
# SVM 
svm_baseline = LinearSVC(
    max_iter=5000,
    class_weight='balanced',
    random_state=42
)

In [74]:
# MNB
mnb_baseline = MultinomialNB(
    alpha=1.0,  
)

In [75]:
# xgb
xgb_baseline = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

## 5 Train & Evaluate

In [76]:
# RFC
evaluate_model("RandomForest + BoW", bow, rf_baseline, train_text, y_train, test_text, y_test)
evaluate_model("RandomForest + TF-IDF", tfidf, rf_baseline, train_text, y_train, test_text, y_test)
evaluate_model("RandomForest + TF-IDF_char", tfidf_char, rf_baseline, train_text, y_train, test_text, y_test)

In [77]:
# LogReg
evaluate_model("LogReg + TF-IDF", tfidf, logreg_baseline, train_text, y_train, test_text, y_test)
evaluate_model("LogReg + BoW", bow, logreg_baseline, train_text, y_train, test_text, y_test)
evaluate_model("LogReg + TF-IDF_char", tfidf_char, rf_baseline, train_text, y_train, test_text, y_test)

In [78]:
# SVM
evaluate_model("LinearSVC + TF-IDF", tfidf, svm_baseline, train_text, y_train, test_text, y_test)
evaluate_model("LinearSVC + BoW", bow, svm_baseline, train_text, y_train, test_text, y_test)
evaluate_model("LinearSVC + TF-IDF_char", tfidf_char, rf_baseline, train_text, y_train, test_text, y_test)

In [79]:
# MNB
evaluate_model("MultinomialNB + TF-IDF", tfidf, mnb_baseline, train_text, y_train, test_text, y_test)
evaluate_model("MultinomialNB + BoW", bow, mnb_baseline, train_text, y_train, test_text, y_test)
evaluate_model("MultinomialNB + TF-IDF_char", tfidf_char, rf_baseline, train_text, y_train, test_text, y_test)

In [80]:
# XGB
evaluate_model("XGBoost + TF-IDF", tfidf, xgb_baseline, train_text, y_train, test_text, y_test)
evaluate_model("XGBoost + BoW", bow, xgb_baseline, train_text, y_train, test_text, y_test)
evaluate_model("XGBoost + TF-IDF_char", tfidf_char, rf_baseline, train_text, y_train, test_text, y_test)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# 6 Vectorizer & Model Improvement

### 6.1 Grid Search TF-ID_char + SVC

In [81]:
# grid search on tf-idf & SVC best hyperparameters

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char", sublinear_tf=True)),
    ("svc", LinearSVC(class_weight='balanced', max_iter=5000, random_state=42))
])

# parameter grid
param_grid = {
    "tfidf__ngram_range": [(2,5), (3,5), (3,6), (4,7), (5,8)],       
    "tfidf__min_df": [3, 5, 10],                            
    "tfidf__sublinear_tf": [True, False],       
    "svc__C": [0.5, 1.0, 2.0, 4.0],                          
}

grid_svc = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [82]:
# grid fit
grid_svc.fit(train_text, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'svc__C': [0.5, 1.0, ...], 'tfidf__min_df': [3, 5, ...], 'tfidf__ngram_range': [(2, ...), (3, ...), ...], 'tfidf__sublinear_tf': [True, False]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [83]:
print("Best F1:", grid_svc.best_score_)
print("Best Params:", grid_svc.best_params_)
best_svc_model = grid_svc.best_estimator_

Best F1: 0.9616958653815507
Best Params: {'svc__C': 1.0, 'tfidf__min_df': 3, 'tfidf__ngram_range': (2, 5), 'tfidf__sublinear_tf': True}


In [84]:
# best svc + tf-id model
tfidf_best = TfidfVectorizer(analyzer='char', min_df=5, ngram_range=(2, 5),
                sublinear_tf=True)

svm_best = LinearSVC(class_weight='balanced', max_iter=5000, random_state=42)

In [85]:
evaluate_model("Best(LinearSVC + TF-IDF)", tfidf_best, svm_best, train_text, y_train, test_text, y_test)

## 7 Data further processing

In [86]:
# Confusion Analysis

def confusion_analysis(model, X_test, y_test, raw_text_test):
    # Predict
    preds = model.predict(X_test)
    
    # Confusion matrix
    cm = confusion_matrix(y_test, preds)
    print("Confusion Matrix:")
    print(cm)
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, preds))
    
    # Identify misclassified samples
    misclassified = pd.DataFrame({
        'text': raw_text_test,
        'true_label': y_test,
        'predicted_label': preds
    })
    
    misclassified = misclassified[misclassified['true_label'] != misclassified['predicted_label']]
    
    print(f"\nNumber of misclassified samples: {len(misclassified)}")
    
    # Show top 10 misclassified examples
    print("\nSample misclassified headlines:")
    print(misclassified.head(60))
    
    return misclassified

In [88]:
# best model
best_model = grid_svc.best_estimator_

# confusion analysis
misclassified_samples = confusion_analysis(
    best_model,
    test_text,        
    y_test,
    raw_text_test=test_text
)


Confusion Matrix:
[[5109  186]
 [ 142 4809]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      5295
           1       0.96      0.97      0.97      4951

    accuracy                           0.97     10246
   macro avg       0.97      0.97      0.97     10246
weighted avg       0.97      0.97      0.97     10246


Number of misclassified samples: 328

Sample misclassified headlines:
                                                                                                                                                                             text  true_label  predicted_label
1118                                                                                                         trump hires son‚s wedding planner to run new york housing department           0                1
4534                                                                                                                   

#### Most errors involve:

**political personalities (are mentioned across both models):**
- Trump 
- Obama
- Clinton
- Putin
- Republicans/Democrats
- etc.

→ remove political names or lowering their weight


**some headlines are very short:**
- "hurry"
- "trashy"
- "vietnam vet"
- etc 

→ nothing to analyse → only headers with words >3


**a lot of special punctuation:**
- normalization didn't seem to pick up on everything

→ fine-tune punctutation normalization 

#### 7.1 remove political personalities/institutes

In [89]:
political_names = ["trump", "obama", "clinton", "putin", 
                   "democrats", "republicans", "gop", "senate", "congress"]

def remove_political_names(text):
    tokens = text.split()
    return " ".join([t for t in tokens if t.lower() not in political_names])

In [90]:
train_text_pol = train_text.apply(remove_political_names)
test_text_pol = test_text.apply(remove_political_names)

In [91]:
evaluate_model("-Pol:Best_New(LinearSVC + TF-IDF)", tfidf_best, svm_best, train_text_pol, y_train, test_text_pol, y_test)

#### 7.2 remove very short headlines


In [92]:
mask_train = train_text.str.split().str.len() >= 3

train_text_filtered = train_text[mask_train]
y_train_filtered = y_train[mask_train]

mask_test = test_text.str.split().str.len() >= 3

test_text_filtered = test_text[mask_test]
y_test_filtered = y_test[mask_test]

In [93]:
evaluate_model("Filt:Best_New(LinearSVC + TF-IDF)", tfidf_best, svm_best, train_text_filtered, y_train_filtered, test_text_filtered, y_test_filtered)

#### 7.3 fine tune punctuation normalization

In [94]:
def normalize_quotes(text):
    return text.translate({
        ord('“'): '"',
        ord('”'): '"',
        ord('„'): '"',
        ord('‟'): '"',
        ord('’'): "'",
        ord('‘'): "'",
        ord('‚'): "'",
    })

In [95]:
train_text_norm = train_text.apply(normalize_quotes)
test_text_norm = test_text.apply(normalize_quotes)

In [96]:
evaluate_model("Norm:Best_New(LinearSVC + TF-IDF)", tfidf_best, svm_best, train_text_norm, y_train, test_text_norm, y_test)

## 8 Results

In [97]:
show_results()

Unnamed: 0,model,accuracy,f1_score
17,Filt:Best_New(LinearSVC + TF-IDF),0.968251,0.96745
15,Best(LinearSVC + TF-IDF),0.96789,0.966918
16,-Pol:Best_New(LinearSVC + TF-IDF),0.965157,0.964146
6,LinearSVC + TF-IDF,0.960082,0.958989
18,Norm:Best_New(LinearSVC + TF-IDF),0.954909,0.953474
3,LogReg + TF-IDF,0.951786,0.950718
9,MultinomialNB + TF-IDF,0.94837,0.946267
14,XGBoost + TF-IDF_char,0.93822,0.938382
11,MultinomialNB + TF-IDF_char,0.93822,0.938382
2,RandomForest + TF-IDF_char,0.93822,0.938382


## Saving Best Model

In [98]:
import joblib
joblib.dump(best_model, "best_model.pkl")

['best_model.pkl']

## 9 Testing on Random News (Fake & Real)

In [99]:
testing = {
    "headline": [
        # Fake news (0)
        "shocked parents revolt after school secretly replaces pledge of allegiance with 'world oath' pushed by global elites",
        "breaking: cnn host caught on hot mic admitting network 'creates stories' to damage president — whistleblower drops bombshell",
        "outrage! california city bans american flag at public events to avoid 'offending newcomers' — residents furious",
        "look: antifa members riot again after conservative speaker announces surprise campus visit — chaos erupts",
        "hollywood actor claims he was 'blacklisted' for supporting trump — says studios run by radical left insiders",
        
        # Real news (1)
        "u.s. senate panel advances bipartisan border security bill despite party divisions",
        "world health organization urges nations to expand vaccine access as cases rise globally",
        "federal reserve signals no rate change amid concerns over slowing economic growth",
        "european union agrees on new data-sharing rules after months of negotiations",
        "canadian prime minister announces inquiry into nationwide cyberattack on government services"
    ],
    "label": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
}

small_test = pd.DataFrame(testing)



In [100]:
test = small_test.drop(columns="label")
test

Unnamed: 0,headline
0,shocked parents revolt after school secretly replaces pledge of allegiance with 'world oath' pushed by global elites
1,breaking: cnn host caught on hot mic admitting network 'creates stories' to damage president — whistleblower drops bombshell
2,outrage! california city bans american flag at public events to avoid 'offending newcomers' — residents furious
3,look: antifa members riot again after conservative speaker announces surprise campus visit — chaos erupts
4,hollywood actor claims he was 'blacklisted' for supporting trump — says studios run by radical left insiders
5,u.s. senate panel advances bipartisan border security bill despite party divisions
6,world health organization urges nations to expand vaccine access as cases rise globally
7,federal reserve signals no rate change amid concerns over slowing economic growth
8,european union agrees on new data-sharing rules after months of negotiations
9,canadian prime minister announces inquiry into nationwide cyberattack on government services


In [101]:
pred = best_model.predict(test["headline"])

y_true = small_test["label"]
y_pred = pred

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.71      1.00      0.83         5

    accuracy                           0.80        10
   macro avg       0.86      0.80      0.79        10
weighted avg       0.86      0.80      0.79        10


Confusion Matrix:
 [[3 2]
 [0 5]]


## 10 Testing on Test Sample 

In [102]:
test_data = pd.read_csv(r"C:\Users\black\Documents\Ironhack\Week_7\project\testing_data.csv", header = None, sep = '\t')
test_data.columns = ["label", "text"]


In [103]:
pred = best_model.predict(test_data["text"])

In [104]:
test_data["pred_label"] = pred
test_data

Unnamed: 0,label,text,pred_label
0,2,copycat muslim terrorist arrested with assault weapons,0
1,2,wow! chicago protester caught on camera admits violent activity was pre-planned: ‚it‚s not gonna be peaceful‚,0
2,2,germany's fdp look to fill schaeuble's big shoes,1
3,2,mi school sends welcome back packet warning kids against wearing u.s. flag to school,0
4,2,u.n. seeks 'massive' aid boost amid rohingya 'emergency within an emergency',1
...,...,...,...
9979,2,boom! fox news leftist chris wallace attempts trump smear over inauguration crowd size‚fox news‚ brit hume backs up trump on fake news story [video],0
9980,2,here it is: list of democrat hypocrites who voted to filibuster gw bush‚s final supreme court pick,0
9981,2,new fires ravage rohingya villages in northwest myanmar: sources,1
9982,2,meals on wheels shuts the lyin‚ lefties up with truth after moveon.org‚s fake news [video],0


## 11 Extra Step: Transformers

In [106]:
!pip install transformers --upgrade

!pip install torch --upgrade

from transformers import pipeline

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading fsspec-2025.10.0-py3-none-any.whl.metadata (10 kB)
Downloading transformers-4.57.1-py3-none-any.whl

In [107]:
pipe = pipeline(
    "text-classification",
    model="jy46604790/Fake-News-Bert-Detect"
)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu


In [112]:
preds = pipe(test["headline"].tolist(), truncation=True)

label_map = {"LABEL_0": 0, "LABEL_1": 1}
y_pred = [label_map[p["label"]] for p in preds]

y_true = small_test["label"]

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10


Confusion Matrix:
 [[5 0]
 [0 5]]
