# MLP Classifier

In [None]:
import pandas as pd

# Feature tables for the training, validation and test splits.
# The first CSV column is used as the DataFrame index in each case.
training_data = pd.read_csv('data/training_data_features.csv', index_col=0)
validation_data = pd.read_csv('data/validation_data_features.csv', index_col=0)
test_data = pd.read_csv('data/test_data_features.csv', index_col=0)

### MLP Classifier with TF-IDF - optimized with Grid Search and Cross-Validation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import joblib

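### --- Uncomment the code below to re-run the grid search --- ###
### (the resulting model is saved to 'pickle/best_mlp_model_tfidf.joblib', which a later cell loads) ###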

# # TF-IDF Vectorization
# tfidf_vectorizer = TfidfVectorizer()
# tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])
# tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])

# # Target labels
# y_training_data = training_data['reliable']
# y_validation_data = validation_data['reliable']

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'hidden_layer_sizes': [(10,), (20,), (10,10), (20,20)],
#     'learning_rate_init': [0.001, 0.01]
# }

# # Initialize the MLPClassifier (Neural Network)
# mlp_model = MLPClassifier(max_iter=500, random_state=42, early_stopping=True, verbose=True)

# # Initialize the GridSearchCV object
# grid_search = GridSearchCV(mlp_model, param_grid=param_grid, cv=3)

# # Fit the GridSearchCV object
# grid_search.fit(tfidf_training_matrix, y_training_data)

# # Access the best hyperparameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

# # Access the best model
# best_model = grid_search.best_estimator_

# # Make predictions on the validation data
# predictions = best_model.predict(tfidf_validation_matrix)

# print("BEST MLP CLASSIFIER w/ TF-IDF")
# # Evaluate the MLPClassifier
# print(classification_report(y_validation_data, predictions))

# # persist the trained model to disk
# joblib.dump(best_model, 'pickle/best_mlp_model_tfidf.joblib')

### MLP Classifier with TF-IDF (with optimal parameters)

In [None]:
import joblib

# Load the optimized, already-trained model that the (commented-out)
# grid-search cell writes to this path.
# NOTE: joblib.load unpickles the file — only load artifacts you produced yourself.
best_model = joblib.load('pickle/best_mlp_model_tfidf.joblib')

# Show the estimator's parameters (last expression of the cell)
best_model.get_params()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Ground-truth labels for both splits
y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

# TF-IDF features: the vocabulary and IDF weights are fitted on the training
# split only and then re-used to encode the validation split.
tfidf_vectorizer = TfidfVectorizer()
tfidf_training_matrix = tfidf_vectorizer.fit_transform(training_data['content_stem'])
tfidf_validation_matrix = tfidf_vectorizer.transform(validation_data['content_stem'])

# Training from scratch is disabled; the model loaded into `best_model` by an
# earlier cell is evaluated instead.
# # mlp_model = MLPClassifier(hidden_layer_sizes=(10), max_iter=500, random_state=42, early_stopping=True, verbose=True)
# # mlp_model.fit(tfidf_training_matrix, y_training_data)
mlp_model = best_model

# Predict on the validation split and report per-class metrics
predictions = mlp_model.predict(tfidf_validation_matrix)
print(
    "MLP CLASSIFIER w/ TF-IDF",
    classification_report(y_validation_data, predictions),
    sep="\n",
)

In [None]:
import joblib

# Save the fitted TF-IDF vectorizer and the MLPClassifier so the test cells
# can reload both instead of refitting.
# Assumes the 'pickle/' directory already exists.
joblib.dump(tfidf_vectorizer, 'pickle/tfidf_vectorizer_mlp.joblib')
joblib.dump(mlp_model, 'pickle/mlp_model.joblib')

##### Test on Fake News

In [None]:
# Reload the Fake News test split inside this cell. The LIAR cell further down
# rebinds `test_data`, so without this reload, re-running this cell afterwards
# would silently score the LIAR data under the "Fake News" heading.
test_data = pd.read_csv('data/test_data_features.csv', index_col=0)

# Restore the fitted vectorizer and the trained classifier from disk
tfidf_vectorizer = joblib.load('pickle/tfidf_vectorizer_mlp.joblib')
mlp_model = joblib.load('pickle/best_mlp_model_tfidf.joblib')

# TF-IDF transformation on test data (re-uses the fitted vocabulary)
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'])

# Make predictions on the test data
test_predictions = mlp_model.predict(tfidf_test_matrix)

# Evaluate the MLPClassifier
print("MLP Classifier w/ TF-IDF on Fake News")
print(classification_report(test_data['reliable'], test_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Calculate confusion matrix (rows: true labels, columns: predictions)
conf_matrix = confusion_matrix(test_data['reliable'],test_predictions)

# Plot confusion matrix.
# display_labels assumes the sorted label values correspond to
# 'false' then 'true' — TODO confirm against the 'reliable' column encoding.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['false', 'true'])
# values_format='' prints the exact integer counts; the default format may
# switch to scientific notation for large cells. This matches the transformer
# Fake News plot in this notebook.
disp.plot(values_format='')
plt.title('MLP CLASSIFIER w/ TF-IDF on Fake News')
plt.show()

##### Test on LIAR

In [None]:
# Restore the fitted TF-IDF vectorizer and the trained classifier
tfidf_vectorizer = joblib.load('pickle/tfidf_vectorizer_mlp.joblib')
mlp_model = joblib.load('pickle/mlp_model.joblib')

# LIAR test split (first CSV column is the index); note this rebinds `test_data`
test_data = pd.read_csv('data/liar_dataset/test_features.csv', index_col=0)

# Encode the LIAR texts with the fitted vocabulary and classify them
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'])
test_predictions = mlp_model.predict(tfidf_test_matrix)

# Per-class metrics on LIAR
print(
    "MLP CLASSIFIER w/ TF-IDF on LIAR",
    classification_report(test_data['reliable'], test_predictions),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of true labels vs. predictions on LIAR
conf_matrix = confusion_matrix(test_data['reliable'], test_predictions)

# Build the display and draw it in one step (plot() returns the display).
# display_labels assumes the sorted label values map to 'false', 'true' — TODO confirm.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('MLP CLASSIFIER w/ TF-IDF on LIAR')
plt.show()

### MLP Classifier with Transformer

In [None]:
import pandas as pd

# Embedding matrices and feature tables for each split.
# NOTE(review): unlike the first cell, these reads do not pass index_col=0, so
# a leading index column (if the CSVs contain one) is kept as an ordinary
# column — confirm this is intended, especially for the embedding files.

# training data
training_embeddings = pd.read_csv('data/training_data_embeddings.csv')
training_data = pd.read_csv('data/training_data_features.csv')

# validation data
validation_embeddings = pd.read_csv('data/validation_data_embeddings.csv')
validation_data = pd.read_csv('data/validation_data_features.csv')

# test data
test_data = pd.read_csv('data/test_data_features.csv')
test_embeddings = pd.read_csv('data/test_data_embeddings.csv')

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Scale input features to unit variance; with_mean=False skips centering.
# NOTE(review): with_mean=False is normally only required for sparse input.
# The embeddings here are read with pd.read_csv, so centering would also be
# possible — confirm that skipping it is intentional.
scaler = StandardScaler(with_mean=False)
X_training_scaled = scaler.fit_transform(training_embeddings)
X_validation_scaled = scaler.transform(validation_embeddings)

# Labels come from the feature tables; rows are assumed to be in the same
# order as the embedding rows — TODO confirm.
y_train = training_data['reliable']
y_val = validation_data['reliable']

# MLP model with 1 hidden layer and 10 neurones, with the default rectified linear unit function.
# hidden_layer_sizes is given as the 1-tuple (10,); `(10)` is just the int 10.
mlp_model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=500, random_state=42, batch_size='auto', early_stopping=True, verbose=True)
mlp_model.fit(X_training_scaled, y_train)

# Predict on the validation split
predictions = mlp_model.predict(X_validation_scaled)

print("MLP CLASSIFIER w/ TRANSFORMER")
print(classification_report(y_val, predictions))

In [None]:
import joblib

# Save the fitted scaler and the trained MLPClassifier so the test cells can
# reload both instead of refitting.
# Assumes the 'pickle/' directory already exists.
joblib.dump(scaler, 'pickle/scaler_mlp.joblib')
joblib.dump(mlp_model, 'pickle/mlp_model_transformer.joblib')

##### Test on Fake News

In [None]:
# Reload the Fake News test split inside this cell. The LIAR cell further down
# rebinds `test_data` and `test_embeddings`, so without this reload, re-running
# this cell afterwards would silently score the LIAR data under the
# "Fake News" heading. The reads mirror the data-loading cell of this section.
test_data = pd.read_csv('data/test_data_features.csv')
test_embeddings = pd.read_csv('data/test_data_embeddings.csv')

# Load the saved scaler and MLPClassifier when needed
scaler = joblib.load('pickle/scaler_mlp.joblib')
mlp_model = joblib.load('pickle/mlp_model_transformer.joblib')

# Scale test data with the statistics fitted on the training embeddings
X_test_scaled = scaler.transform(test_embeddings)
y_test = test_data['reliable']
# Make predictions on the test data
mlp_predictions = mlp_model.predict(X_test_scaled)

# Evaluate the MLPClassifier
print("MLP CLASSIFIER w/ TRANSFORMER on Fake News")
print(classification_report(y_test, mlp_predictions))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of true labels vs. predictions on the Fake News test split
conf_matrix = confusion_matrix(y_test, mlp_predictions)

# Build the display and draw it in one step (plot() returns the display).
# values_format='' renders each cell with its plain string representation.
# display_labels assumes the sorted label values map to 'false', 'true' — TODO confirm.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=['false', 'true'],
).plot(values_format='')
plt.title('MLP CLASSIFIER w/ TRANSFORMER on Fake News')
plt.show()

##### Test on LIAR

In [None]:
# Restore the fitted scaler and the trained classifier
scaler = joblib.load('pickle/scaler_mlp.joblib')
mlp_model = joblib.load('pickle/mlp_model_transformer.joblib')

# LIAR test split: feature table and embeddings
# (this rebinds `test_data` and `test_embeddings`)
test_data = pd.read_csv('data/liar_dataset/test_features.csv')
test_embeddings = pd.read_csv('data/liar_dataset/test_embeddings.csv')

# Labels and scaled inputs
y_test = test_data['reliable']
X_test_scaled = scaler.transform(test_embeddings)

# Classify the LIAR embeddings
mlp_predictions = mlp_model.predict(X_test_scaled)

# Per-class metrics on LIAR
print(
    "MLP CLASSIFIER w/ TRANSFORMER on LIAR",
    classification_report(y_test, mlp_predictions),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of true labels vs. predictions on LIAR
conf_matrix = confusion_matrix(y_test, mlp_predictions)

# Build the display and draw it in one step (plot() returns the display).
# display_labels assumes the sorted label values map to 'false', 'true' — TODO confirm.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('MLP CLASSIFIER w/ TRANSFORMER on LIAR')
plt.show()