# The Task

## Project Overview

In the file dataset/data.csv, you will find a dataset containing news articles with the following columns:

label: 0 if the news is fake, 1 if the news is real.
title: The headline of the news article.
text: The full content of the article.
subject: The category or topic of the news.
date: The publication date of the article.
Your goal is to build a classifier that is able to distinguish between the two.

Once you have a classifier built, then use it to predict the labels for dataset/validation_data.csv. Generate a new file where the label 2 has been replaced by 0 (fake) or 1 (real) according to your model. Please respect the original file format, do not include extra columns, and respect the column separator.

Please ensure to split the data.csv into training and test datasets before using it for model training or evaluation.

Guidance
Like in a real life scenario, you are able to make your own choices and text treatment. Use the techniques you have learned and the common packages to process this data and classify the text.

Deliverables
Python Code: Provide well-documented Python code that conducts the analysis.
Predictions: A csv file in the same format as validation_data.csv but with the predicted labels (0 or 1)
Accuracy estimation: Provide the teacher with your estimation of how your model will perform.
Presentation: You will present your model in a 10-minute presentation. Your teacher will provide further instructions.

# Import

In [None]:
import re
import nltk
import string
import numpy as np
import pandas as pd
import seaborn as sb
from nltk import pos_tag
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Loading the Data

In [None]:
# Read the tab-separated training file; the two columns are named
# explicitly as 'labels' and 'text'.
data = pd.read_csv('dataset/training_data_lowercase.csv', sep='\t', names=['labels', 'text'])

In [None]:
data

# Understanding the Data

In [None]:
data.info()

In [None]:
# List the distinct label values present in the data.
print('Its unique values are ', data['labels'].unique())
# describe() returns a summary Series; print it once (nesting print() inside
# print() would additionally emit a stray "None").
print(data['labels'].describe())

In [None]:
plt.hist(data.labels, color='red')
plt.show()

In [None]:
data.head()

In [None]:
data.tail()

# Preprocessing the Data

## Basic cleaning 

In [None]:
# Ordered (pattern, replacement) rules; the order matters because later rules
# assume the markup handled by earlier ones is already gone.
_CLEANING_RULES = (
    # Inline JavaScript / CSS, including their contents
    (re.compile(r"(?is)<script.*?>.*?</script>"), " "),
    (re.compile(r"(?is)<style.*?>.*?</style>"), " "),
    # HTML comments
    (re.compile(r"(?s)<!--.*?-->"), " "),
    # Any remaining tag
    (re.compile(r"(?s)<[^>]+>"), " "),
    # Unwrap text that looks like a bytes-literal repr: b'...' or b"..."
    (re.compile(r"^\s*b[\"'](.+?)[\"']\s*$"), r"\1"),
    # Trailing bracketed/parenthesised tag at the end of the text
    (re.compile(r"\s*[\[\(][^\]\)]+[\]\)]\s*$"), ""),
    # Literal backslash-t sequences following a word, then at the very start
    (re.compile(r"\b\\t"), " "),
    (re.compile(r"^\\t"), " "),
    # Everything that is neither an ASCII letter nor whitespace
    (re.compile(r"[^A-Za-z\s]"), " "),
    # Stand-alone single letters, then a single leading letter
    (re.compile(r"\b[A-Za-z]\b"), " "),
    (re.compile(r"^[A-Za-z]\s+"), " "),
    # Collapse whitespace runs into one space
    (re.compile(r"\s+"), " "),
)


def clean_html_text(text: str) -> str:
    """Strip markup and non-letter characters from *text* and lowercase it.

    ``None`` yields an empty string; any other value is converted with
    ``str()`` first. The rules in ``_CLEANING_RULES`` are applied in order,
    after which surrounding whitespace is trimmed and the result lowercased.
    The returned string contains only lowercase ASCII letters separated by
    single spaces.
    """
    if text is None:
        return ''
    cleaned = str(text)
    for rule, replacement in _CLEANING_RULES:
        cleaned = rule.sub(replacement, cleaned)
    return cleaned.strip().lower()

# Regex character class matching any single character of string.punctuation
# (escaped so the characters are taken literally).
punct_pattern = f"[{re.escape(string.punctuation)}]"

In [None]:
# Build the token column in three passes: clean the raw text, drop any
# punctuation characters, then split into word tokens.
cleaned_docs = data['text'].astype(str).apply(clean_html_text)
unpunctuated_docs = cleaned_docs.astype(str).apply(lambda doc: re.sub(punct_pattern, "", doc))
data['pre_text'] = unpunctuated_docs.astype(str).apply(word_tokenize)
data.head()

## Removing stop words

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
# Drop every token that appears in the English stop-word set.
data['pre_text'] = data['pre_text'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

In [None]:
from collections import Counter

# Corpus-wide token frequencies for the stop-word-filtered tokens.
# The earlier hand-rolled loop compared the dict itself to 0, a branch that
# could never run; Counter does the same counting directly.
bag_of_words = Counter(
    word for tokens in data['pre_text'] for word in tokens
)

# 100 most frequent tokens as (token, count) pairs, highest count first.
print(bag_of_words.most_common(100))

In [None]:
# Hand-picked tokens to drop as noise.
# NOTE(review): presumably chosen from the frequency listing printed above — confirm.
words_to_filter = ['video','says', 'tweets', 'tells','screenshots',
                   'details', 'fck', 'btch', 'images', 'cck', 'image'
                   ,'videos','ahole']

In [None]:
# Same tokens as 'pre_text', minus the hand-picked noise words.
data['pre_text_filter'] = data['pre_text'].apply(lambda tokens: [word for word in tokens if word not in words_to_filter])

In [None]:
from collections import Counter

# Token frequencies after the noise words were removed.
# The earlier hand-rolled loop compared the dict itself to 0, a branch that
# could never run; Counter does the same counting directly.
bag_of_words = Counter(
    word for tokens in data['pre_text_filter'] for word in tokens
)

# 100 most frequent tokens as (token, count) pairs, highest count first.
print(bag_of_words.most_common(100))

### Using Stemmer

#### Snowball

In [None]:
snowball = SnowballStemmer('english')

In [None]:
data['snow_text'] = data['pre_text'].apply(lambda tokens: [snowball.stem(token) for token in tokens])

#### Porter

In [None]:
porter = PorterStemmer()

In [None]:
data['porter_text'] = data['pre_text'].apply(lambda tokens: [porter.stem(token) for token in tokens])

### Using Lemmatizer

In [None]:
lemm = WordNetLemmatizer()

In [None]:
data['lemm_text'] = data['pre_text'].apply(lambda tokens: [lemm.lemmatize(token) for token in tokens])

### Using Lemmatizer (Verb)

In [None]:
data['lemm_text_verb'] = data['pre_text'].apply(lambda tokens: [lemm.lemmatize(token, pos='v') for token in tokens])

In [None]:
data.info()

# Splitting the data into Training and Test

In [None]:
# Feature candidates: every column from position 2 onward, i.e. the derived
# token columns (positions 0 and 1 are 'labels' and the raw 'text').
X = data.iloc[:,2:]

In [None]:
# Target: the first column ('labels').
y = data.iloc[:,0]

In [None]:
print(X.shape, y.shape)

## Using only the preprocessed text

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X['pre_text'], y, test_size=0.2, random_state=42)

## Using the preprocessed text + snow stemmer

In [None]:
X_train_snow, X_test_snow, y_train_snow, y_test_snow = train_test_split(X['snow_text'], y, test_size=0.2, random_state=42)

## Using the preprocessed text + porter stemmer

In [None]:
X_train_porter, X_test_porter, y_train_porter, y_test_porter = train_test_split(X['porter_text'], y, test_size=0.2, random_state=42)

## Using the preprocessed text + noise removal

In [None]:
X_train_filt, X_test_filt, y_train_filt, y_test_filt = train_test_split(X['pre_text_filter'], y, test_size=0.2, random_state=42)

## Using the preprocessed text + lemmatizer

In [None]:
X_train_lemm, X_test_lemm, y_train_lemm, y_test_lemm = train_test_split(X['lemm_text'], y, test_size=0.2, random_state=42)

## Using the preprocessed text + lemmatizer (Verbs)

In [None]:
X_train_lemm_verb, X_test_lemm_verb, y_train_lemm_verb, y_test_lemm_verb = train_test_split(X['lemm_text_verb'], y, test_size=0.2, random_state=42)

### Defining a plotting function

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(
    model,
    X,
    y,
    scoring="accuracy",
    cv=5,
    train_sizes=np.linspace(0.1, 1.0, 5),
    title=None
):
    """Plot mean training and validation scores against training-set size.

    Parameters
    ----------
    model
        Estimator handed to ``learning_curve``.
    X, y
        Features and targets, forwarded unchanged to ``learning_curve``.
    scoring
        Scorer name forwarded to ``learning_curve``; also used as the
        y-axis label.
    cv
        Cross-validation setting forwarded to ``learning_curve``.
    train_sizes
        Training-set sizes forwarded to ``learning_curve``.
    title
        Plot title; when falsy, the model's class name is used instead.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        model,
        X,
        y,
        scoring=scoring,
        cv=cv,
        train_sizes=train_sizes,
        n_jobs=-1
    )

    # One curve per score matrix, each averaged across the CV folds (axis 1).
    curves = (
        ("Training score", fit_scores),
        ("Validation score", cv_scores),
    )

    plt.figure()
    for curve_label, fold_scores in curves:
        plt.plot(sizes, fold_scores.mean(axis=1), marker="o", label=curve_label)
    plt.xlabel("Training set size")
    plt.ylabel(scoring)
    plt.title(title or model.__class__.__name__)
    plt.legend()
    plt.grid(True)
    plt.show()


# Training some classifiers

## Decision Tree

### Only preprocessed text

#### TF-IDF

In [None]:
# The documents are already lists of tokens, so the vectorizer's own
# preprocessing and tokenizing steps are replaced by identity functions and
# lowercasing is switched off.
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

# Fit on the training split only, then reuse the fitted vectorizer for the
# test split.
X_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

##### Decision Tree metrics

In [None]:
# Fit a depth-limited decision tree on the TF-IDF training matrix and predict
# the held-out split.
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_tfidf, y_train)
y_hat = dt_classifier.predict(X_test_tfidf)

# Report accuracy, the per-class report and the confusion matrix on the test split.
print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_tfidf,
    y = y_train,
    scoring="f1",
    title="DecisionTreeClassifier - TF-IDF"
)


#### BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_bow, y_train)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test, y_hat))
print("Classification Report:\n", classification_report(y_test, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test, y_hat))

#### Plotting Learning Curves

In [None]:
# Learning curve for the decision tree on the BoW features of the
# preprocessed text; the title now names the model and features actually plotted.
plot_learning_curve(
    model = dt_classifier,
    X = X_bow,
    y = y_train,
    scoring="f1",
    title="Decision Tree Classifier – BoW"
)

### Preprocessed text + noise removal

#### TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(X_train_filt)
X_test_tfidf = vectorizer.transform(X_test_filt)

##### Decision Tree metrics

In [None]:
# Fit on the noise-filtered TF-IDF features using the labels from the same
# split (y_train_filt), matching the y_test_filt used for evaluation below.
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_tfidf, y_train_filt)
y_hat = dt_classifier.predict(X_test_tfidf)

# Report accuracy, the per-class report and the confusion matrix on the test split.
print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

In [None]:
# Learning curve for the decision tree on the noise-filtered TF-IDF features,
# using the labels of that same split and a title that names the model plotted.
plot_learning_curve(
    model = dt_classifier,
    X = X_tfidf,
    y = y_train_filt,
    scoring="f1",
    title="Decision Tree Classifier – TF-IDF"
)

#### BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_filt)
X_test_bow = vectorizer.transform(X_test_filt)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_bow, y_train_filt)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_filt, y_hat))
print("Classification Report:\n", classification_report(y_test_filt, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_filt, y_hat))

In [None]:
# Learning curve for the decision tree on the noise-filtered BoW features;
# the title now names the model and features actually plotted.
plot_learning_curve(
    model = dt_classifier,
    X = X_bow,
    y = y_train_filt,
    scoring="f1",
    title="Decision Tree Classifier – BoW"
)

### Preprocessed text + snow stemmer

#### TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(X_train_snow)
X_test_tfidf = vectorizer.transform(X_test_snow)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_tfidf, y_train_snow)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

In [None]:
# Learning curve for the decision tree on the Snowball-stemmed TF-IDF
# features; the title now names the model and features actually plotted.
plot_learning_curve(
    model = dt_classifier,
    X = X_tfidf,
    y = y_train_snow,
    scoring="f1",
    title="Decision Tree Classifier – TF-IDF"
)

#### BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_snow)
X_test_bow = vectorizer.transform(X_test_snow)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth= 10, random_state=42).fit(X_bow, y_train_snow)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_snow, y_hat))
print("Classification Report:\n", classification_report(y_test_snow, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_snow, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_bow,
    y = y_train_snow,
    scoring="f1",
    title="Decision Tree Classifier – BoW"
)

### Preprocessed text + porter stemmer

#### TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(X_train_porter)
X_test_tfidf = vectorizer.transform(X_test_porter)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_tfidf, y_train_porter)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_tfidf,
    y = y_train_porter,
    scoring="f1",
    title="Decision Tree Classifier – TF-IDF"
)

#### BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_porter)
X_test_bow = vectorizer.transform(X_test_porter)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_bow, y_train_porter)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_bow,
    y = y_train_porter,
    scoring="f1",
    title="Decision Tree Classifier – BoW"
)

### Preprocessed text + lemmatizer

#### TF-IDF

In [None]:
# The documents are already lists of tokens, so the vectorizer's own
# preprocessing and tokenizing steps are replaced by identity functions and
# lowercasing is switched off.
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

# Vectorize the lemmatized split: the cells that follow train and evaluate
# with y_train_lemm / y_test_lemm, so the features must come from the
# lemmatized tokens rather than the Porter-stemmed ones.
X_tfidf = vectorizer.fit_transform(X_train_lemm)
X_test_tfidf = vectorizer.transform(X_test_lemm)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_tfidf, y_train_lemm)
y_hat = dt_classifier.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_tfidf,
    y = y_train_lemm,
    scoring="f1",
    title="Decision Tree Classifier – TF-IDF"
)

#### BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_lemm)
X_test_bow = vectorizer.transform(X_test_lemm)

##### Decision Tree metrics

In [None]:
dt_classifier = DecisionTreeClassifier(max_depth=10, random_state=42).fit(X_bow, y_train_lemm)
y_hat = dt_classifier.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

In [None]:
plot_learning_curve(
    model = dt_classifier,
    X = X_bow,
    y = y_train_lemm,
    scoring="f1",
    title="Decision Tree Classifier – BoW"
)

## Logistic Regression

BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_lemm)
X_test_bow = vectorizer.transform(X_test_lemm)

In [None]:
log_reg = LogisticRegression(random_state=42).fit(X_bow, y_train_lemm)
y_hat = log_reg.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

In [None]:
plot_learning_curve(
    model = log_reg,
    X = X_bow,
    y = y_train_lemm,
    scoring="f1",
    title="Logistic Regression - BoW"
)

TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(X_train_lemm)
X_test_tfidf = vectorizer.transform(X_test_lemm)

In [None]:
# Fit logistic regression on the lemmatized TF-IDF features and predict the
# held-out split.
log_reg = LogisticRegression(random_state=42).fit(X_tfidf, y_train_lemm)
y_hat = log_reg.predict(X_test_tfidf)

# Every metric is computed against the labels of the lemmatized split.
print("Accuracy:", accuracy_score(y_test_lemm, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm, y_hat))

In [None]:
# Learning curve for logistic regression on the lemmatized TF-IDF features;
# the title now matches the features plotted (X_tfidf, not BoW).
plot_learning_curve(
    model = log_reg,
    X = X_tfidf,
    y = y_train_lemm,
    scoring="f1",
    title="Logistic Regression - TF-IDF"
)

Lemmatization (Verb)

BoW

In [None]:
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_bow = vectorizer.fit_transform(X_train_lemm_verb)
X_test_bow = vectorizer.transform(X_test_lemm_verb)

In [None]:
log_reg = LogisticRegression(random_state=42).fit(X_bow, y_train_lemm_verb)
y_hat = log_reg.predict(X_test_bow)

print("Accuracy:", accuracy_score(y_test_lemm_verb, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm_verb, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm_verb, y_hat))

In [None]:
plot_learning_curve(
    model= log_reg,
    X= X_bow,
    y= y_train_lemm_verb,
    scoring='f1',
    title="LogisticRegression - BoW"
)

TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False
)

X_tfidf = vectorizer.fit_transform(X_train_lemm_verb)
X_test_tfidf = vectorizer.transform(X_test_lemm_verb)

In [None]:
log_reg = LogisticRegression(random_state=42).fit(X_tfidf, y_train_lemm_verb)
y_hat = log_reg.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test_lemm_verb, y_hat))
print("Classification Report:\n", classification_report(y_test_lemm_verb, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_lemm_verb, y_hat))

In [None]:
plot_learning_curve(
    model = log_reg,
    X = X_tfidf,
    y = y_train_lemm_verb,
    scoring="f1",
    title="Logistic Regression - TF-IDF"
)

## Bigrams

TF-IDF

In [None]:
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
    lowercase=False,
    # create word unigrams + bigrams
    ngram_range=(1, 2),
    # drops rare tokens/bigrams that appear once
    min_df=2,
    # drops near-constant boilerplate tokens
    max_df=0.9
)

X_tfidf = vectorizer.fit_transform(X_train_porter)
X_test_tfidf = vectorizer.transform(X_test_porter)

In [None]:
# The bigram features are built from the Porter-stemmed split, so train and
# evaluate with the labels of that same split.
# NOTE(review): if the lemmatized tokens were intended here, switch the
# vectorizer inputs instead — confirm.
log_reg = LogisticRegression(random_state=42).fit(X_tfidf, y_train_porter)
y_hat = log_reg.predict(X_test_tfidf)

# Report accuracy, the per-class report and the confusion matrix on the test split.
print("Accuracy:", accuracy_score(y_test_porter, y_hat))
print("Classification Report:\n", classification_report(y_test_porter, y_hat))
print('Confusion matrix:\n', confusion_matrix(y_test_porter, y_hat))

Top Weighted Features

In [None]:
feature_names = vectorizer.get_feature_names_out()
# For a binary problem coef_ has a single row holding the weights toward the
# second class. Per the task description label 1 = real and 0 = fake, so
# large positive weights indicate "real" and large negative weights "fake".
coefs = log_reg.coef_[0]

# 20 strongest features in each direction, as (feature, weight) pairs.
top_real = sorted(zip(feature_names, coefs), key=lambda x: x[1], reverse=True)[:20]
top_fake = sorted(zip(feature_names, coefs), key=lambda x: x[1])[:20]

top_real


In [None]:
# Learning curve for logistic regression on the unigram + bigram TF-IDF
# features; these come from the Porter-stemmed split, so its labels are used.
plot_learning_curve(
    model = log_reg,
    X = X_tfidf,
    y = y_train_porter,
    scoring="f1",
    title="Logistic Regression - TF-IDF (Unigrams + Bigrams)"
)