In [2]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from wordcloud import WordCloud

In [3]:
# Read the preprocessed corpus from Google Drive into a DataFrame
data = pd.read_csv(
    "/content/drive/MyDrive/LUT/ISS/fakenewscorpus/preprocessed_data.csv"
)

# Features are the 'content' column; the prediction target is the 'label' column
X, y = data['content'], data['label']

In [4]:
# Hold out 10% of the rows as the test set (seeded for reproducibility).
# Only a train/test split is produced here; no separate validation set is
# carved out in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [5]:
# TF-IDF Vectorization
def custom_tokenizer(text):
    """Lowercase ``text``, keep only letters and whitespace, then word-tokenize.

    Punctuation and digits are deleted outright (not replaced by a space)
    before the cleaned string is handed to ``word_tokenize``.

    Returns the list of tokens produced by ``word_tokenize``.
    """
    kept_chars = (ch for ch in text.lower() if ch.isalpha() or ch.isspace())
    cleaned = ''.join(kept_chars)
    return word_tokenize(cleaned)


# Build TF-IDF features using the custom tokenizer, capped at 3000 features
# with L2-normalised rows.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    # token_pattern is not used once a tokenizer is supplied; passing None
    # avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    max_features=3000,
    norm="l2",
)

# Learn vocabulary/IDF weights from the training split only, then apply the
# same fitted mapping to the test split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Persist the fitted vectorizer for later reuse.
# NOTE(review): pickling stores `custom_tokenizer` by reference, so the same
# function presumably has to be defined before this file is loaded — confirm.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")



['tfidf_vectorizer.pkl']

In [None]:
# Configure the multi-layer perceptron (three hidden layers: 512 -> 256 -> 128)
mlp_classifier = MLPClassifier(
    # --- architecture ---
    hidden_layer_sizes=(512, 256, 128),
    activation='relu',
    # --- optimisation ---
    solver='adam',
    batch_size=32,
    # NOTE(review): scikit-learn documents `learning_rate` as only used with
    # solver='sgd'; it is presumably ignored with 'adam' — confirm.
    learning_rate='adaptive',
    max_iter=10,
    alpha=0.001,  # L2 regularization strength
    # --- early stopping on an internal hold-out of the training data ---
    early_stopping=True,
    validation_fraction=0.1,  # share of the training data held out for validation
    # NOTE(review): n_iter_no_change equals max_iter, so the patience window
    # cannot elapse before the iteration cap is reached — confirm intended.
    n_iter_no_change=10,
    # --- reproducibility / logging ---
    random_state=42,
    verbose=True,
)

# Fit the network on the TF-IDF training matrix
mlp_classifier.fit(X_train_vec, y_train)

Iteration 1, loss = 0.28891862
Validation score: 0.894549
Iteration 2, loss = 0.25752290
Validation score: 0.900491
Iteration 3, loss = 0.24597684
Validation score: 0.897914
Iteration 4, loss = 0.23890883
Validation score: 0.899717


In [None]:
# Predict labels for the held-out in-domain test split
y_pred = mlp_classifier.predict(X_test_vec)

# Headline metrics.
# NOTE(review): precision/recall/F1 are called with scikit-learn's defaults
# (binary averaging) — confirm `label` is a binary target encoded accordingly.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Per-class precision/recall/F1 summary
class_report = classification_report(y_test, y_pred)
print(class_report)

# Confusion matrix heatmap (plt and sns come from the notebook's import cell,
# so this cell no longer re-imports them)
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

# Persist the metrics, the classification report and the confusion matrix
with open("MLP-Report.txt", "w") as file:
    file.write("Accuracy: {:.4f}\n".format(accuracy))
    file.write("Precision: {:.4f}\n".format(precision))
    file.write("Recall: {:.4f}\n".format(recall))
    file.write("F1-score: {:.4f}\n".format(f1))
    file.write("\nClassification Report:\n")
    file.write(class_report)
    file.write("\nConfusion Matrix:\n")
    file.write(np.array2string(conf_matrix, separator=', '))

# Save the trained model so it can be reloaded without retraining
joblib.dump(mlp_classifier, 'mlp_model.pkl')

Validation with Cross-Domain LIAR Data

In [None]:
# Read the LIAR cross-domain test split: tab-separated, no header row, so
# columns are addressed by integer position
test_df = pd.read_csv(
    "/content/drive/MyDrive/LUT/ISS/cross-domain-data-liar/labeled-strictness-high-test.tsv",
    sep='\t',
    header=None,
)

# News text is taken from positional column 2 and the label from positional
# column 14 (both 0-based). This rebinds the names X and y.
X, y = test_df[2], test_df[14]

# Load the Vectorizer
def custom_tokenizer(text):
    """Lowercase ``text``, keep only letters and whitespace, then word-tokenize.

    Punctuation and digits are deleted outright (not replaced by a space)
    before the cleaned string is handed to ``word_tokenize``.

    NOTE(review): the vectorizer loaded from "tfidf_vectorizer.pkl" was built
    with a tokenizer of this name; this definition presumably has to exist
    (and behave the same) for the unpickled vectorizer to work — confirm.

    Returns the list of tokens produced by ``word_tokenize``.
    """
    kept_chars = (ch for ch in text.lower() if ch.isalpha() or ch.isspace())
    cleaned = ''.join(kept_chars)
    return word_tokenize(cleaned)


# Restore the TF-IDF vectorizer that was fitted on the training corpus
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Vectorize the LIAR statements (X) with the already-fitted vectorizer.
# Use transform(), not fit_transform(): re-fitting would learn a new
# vocabulary from the evaluation text, and the resulting feature columns
# would no longer line up with the ones the saved model was trained on.
X_test_vec = vectorizer.transform(X)

# Load the trained model
mlp_classifier = joblib.load('mlp_model.pkl')

# Predict labels for the LIAR statements
y_pred = mlp_classifier.predict(X_test_vec)

# Score against the LIAR labels (y), not the in-domain y_test split.
# NOTE(review): assumes the LIAR label column uses the same label encoding
# as the training target — confirm before trusting these numbers.
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Per-class precision/recall/F1 summary
class_report = classification_report(y, y_pred)
print(class_report)

# Confusion matrix heatmap (plt and sns come from the notebook's import cell)
conf_matrix = confusion_matrix(y, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

# Persist the metrics, the classification report and the confusion matrix
with open("MLP-CrossDomain-Report", "w") as file:
    file.write("Accuracy: {:.4f}\n".format(accuracy))
    file.write("Precision: {:.4f}\n".format(precision))
    file.write("Recall: {:.4f}\n".format(recall))
    file.write("F1-score: {:.4f}\n".format(f1))
    file.write("\nClassification Report:\n")
    file.write(class_report)
    file.write("\nConfusion Matrix:\n")
    file.write(np.array2string(conf_matrix, separator=', '))
