## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Report the library versions this notebook was run with (one per line).
print(pd.__version__, np.__version__, sep='\n')

## Importing dataset

In [None]:
# Load both source files and tag every row with its class label:
# 0 = fake article, 1 = real article.
df1 = pd.read_csv('../data/Fake.csv').assign(label=0)
df2 = pd.read_csv('../data/True.csv').assign(label=1)

In [None]:
# Preview the first rows of the fake-news frame (label 0).
df1.head()

In [None]:
# Preview the first rows of the real-news frame (label 1).
df2.head()

In [None]:
# Stack fake and real articles, then shuffle so the two classes are
# interleaved; the fixed seed keeps the shuffle reproducible.
df = pd.concat([df1, df2], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Treat a missing title or body as empty text: NaN + str would make `content`
# NaN (a float), which string-based preprocessing such as `.lower()` cannot
# handle.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
# Keep only the model input and the target.
df = df[['content', 'label']]

In [None]:
# Preview the combined, shuffled frame (only `content` and `label` remain).
df.head()

## Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles as the test set; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [None]:
X_train

In [None]:
y_train

In [None]:
X_test

In [None]:
 y_test

## Data Preprocessing

In [None]:
# Removes characters that are not useful for text analysis
def remove_unwanted_chars(text):
    """Strip URLs, HTML tags and non-letter characters from *text*.

    Removed spans are replaced by a space rather than deleted, so words that
    were separated only by markup or punctuation ("hello,world",
    "fake-news", "<p>a</p><p>b</p>") remain separate tokens instead of being
    fused into one. Apostrophes are the exception: they are dropped outright
    so a contraction stays a single token ("don't" -> "dont").

    Args:
        text: Input string.

    Returns:
        The cleaned string, containing only ASCII letters separated by single
        spaces, with no leading or trailing whitespace.
    """
    # `http\S+` already covers https URLs, and the patterns contain no
    # anchors, so no extra alternative or MULTILINE flag is needed.
    text = re.sub(r'http\S+|www\S+', ' ', text)  # remove URLs
    text = re.sub(r'<.*?>', ' ', text)  # remove HTML tags
    text = re.sub(r"['\u2019]", '', text)  # drop straight/curly apostrophes
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)  # remove special characters and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # collapse extra spaces
    return text

In [None]:
import spacy

# Only `lemma_` and `is_stop` are read below, so the parser and NER
# components are switched off when loading the model.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


# Removes stopwords and lemmatizes the text
def lemmatize_text(text):
    """Return *text* as space-joined lemmas with stop-word tokens left out."""
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.lemma_)
    return ' '.join(kept)

In [None]:
def preprocess_text(text):
    """Lower-case *text*, strip unwanted characters, then lemmatize it."""
    cleaned = remove_unwanted_chars(text.lower())
    return lemmatize_text(cleaned)

## Defining the complete ML pipeline

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline model: TF-IDF features (unigrams and bigrams) computed on the
# preprocessed text, fed into a logistic-regression classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        preprocessor=preprocess_text,
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.9,
    )),
    ('clf', LogisticRegression(
        solver='liblinear',
        random_state=42,
        max_iter=1000,
    )),
])

## Training the Model

In [None]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

## Predicting results on test set

In [None]:
# Predicted labels (0 = fake, 1 = real) for the held-out test articles.
y_pred = pipeline.predict(X_test)

In [None]:
# Side-by-side view: column 0 = true label, column 1 = predicted label.
print(np.column_stack((y_test.values, y_pred)))

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test articles whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

In [None]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1. `target_names` follows the label encoding
# used when the data was loaded: 0 = Fake, 1 = Real.
report = classification_report(y_test, y_pred, target_names=['Fake', 'Real'])
print(report)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Confusion matrix of true vs. predicted labels on the test split.
cm = confusion_matrix(y_test, y_pred)
# Display labels follow the label encoding: 0 = Fake, 1 = Real.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Fake', 'Real'])
disp.plot(cmap='Blues')

## Explainability

In [None]:
# Vocabulary terms learned by the TF-IDF step, and the matching
# logistic-regression weights (first and only row of `coef_`).
feature_names = pipeline['tfidf'].get_feature_names_out()
coefficients = pipeline['clf'].coef_[0]

In [None]:
# Inspect the vocabulary terms extracted from the fitted TF-IDF step.
feature_names

In [None]:
# Inspect the classifier weights, one per vocabulary term.
coefficients

In [None]:
# One row per vocabulary term, paired with its classifier weight.
coef_df = pd.DataFrame({'word': feature_names}).assign(weight=coefficients)

In [None]:
# The 20 most negative weights point towards label 0 (FAKE), the 20 most
# positive weights towards label 1 (REAL).
top_fake = coef_df.sort_values('weight').iloc[:20]
top_real = coef_df.sort_values('weight', ascending=False).iloc[:20]

In [None]:
import matplotlib.pyplot as plt


def _plot_top_words(frame, title):
    """Draw a horizontal bar chart of the `weight` of each `word` in *frame*.

    Args:
        frame: DataFrame with `word` and `weight` columns, strongest word first.
        title: Title shown above the chart.
    """
    plt.figure()
    plt.barh(frame['word'], frame['weight'])
    # barh places the first row at the bottom; flip the axis so the first
    # (strongest) word of the sorted frame is shown at the top.
    plt.gca().invert_yaxis()
    plt.xlabel("Coefficient weight")
    plt.title(title)
    plt.show()


_plot_top_words(top_fake, "Top words indicating FAKE news")
_plot_top_words(top_real, "Top words indicating REAL news")


In [None]:
# Explain a single prediction: the first article of the test split.
text = X_test.iloc[0]

vectorizer = pipeline.named_steps['tfidf']
model = pipeline.named_steps['clf']

# Sparse TF-IDF row (1 x n_features) for this article.
X_vec = vectorizer.transform([text])
# Use the stored column indices of the sparse (CSR) row directly: they are
# aligned one-to-one with `X_vec.data`, whereas `nonzero()` filters out
# explicitly stored zeros and could yield an array shorter than `data`.
feature_index = X_vec.indices

# Per-term contribution to the decision function: coefficient * TF-IDF value.
# Negative values push towards label 0 (FAKE), positive towards label 1 (REAL).
contributions = pd.DataFrame({
    'word': feature_names[feature_index],
    'contribution': coefficients[feature_index] * X_vec.data
}).sort_values(by='contribution', ascending=True)

# Reuse the already-vectorized row instead of sending the raw text through
# the whole pipeline (and its preprocessing) a second time.
pred = model.predict(X_vec)[0]


In [None]:
# Compare the model's prediction with the ground-truth label of the same
# article (position 0 of the test split).
print("The predicted label for the given text is:", "REAL" if pred == 1 else "FAKE", "\n")
print("The real label for the given text is:", "REAL" if y_test.iloc[0] == 1 else "FAKE", "\n")

# `contributions` is sorted ascending, so its head holds the most negative
# contributions and its tail the most positive ones.
print("Top words pushing prediction towards FAKE:\n")
print(contributions.head(10), "\n")

print("Top words pushing prediction towards REAL:\n")
print(contributions.tail(10))

## LIME

In [None]:
from lime.lime_text import LimeTextExplainer

# Position i of class_names names label i: 0 = FAKE, 1 = REAL.
class_names = ['FAKE', 'REAL']
# LIME builds its explanation from randomly perturbed copies of the text;
# seed it like every other random step in this notebook so the explanation
# is reproducible between runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

In [None]:
# Explain the pipeline's prediction for `text`, keeping the 10 most
# influential words. The raw text is passed because `pipeline.predict_proba`
# applies the preprocessing and vectorization itself.
# NOTE(review): LIME calls the classifier on many perturbed copies of the
# text, each going through the spaCy preprocessing - expect this to be slow.
exp = explainer.explain_instance(
    text,
    pipeline.predict_proba, 
    num_features=10
)

In [None]:
from IPython.display import HTML, display

# Render the LIME explanation as HTML inside the notebook output.
display(HTML(exp.as_html()))