# Fake And Real News Detection With Logistic Regression

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report   
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from dotenv import load_dotenv
import os
import mlflow
from codecarbon import EmissionsTracker

# One CodeCarbon tracker shared by the whole notebook; the cells below wrap
# their work in named tasks on this object.
tracker = EmissionsTracker(
    project_name="Logistic_regression",
    measure_power_secs=10,
)

[codecarbon INFO @ 12:21:45] [setup] RAM Tracking...
[codecarbon INFO @ 12:21:45] [setup] GPU Tracking...
[codecarbon INFO @ 12:21:45] No GPU found.
[codecarbon INFO @ 12:21:45] [setup] CPU Tracking...
[codecarbon INFO @ 12:21:46] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
[codecarbon INFO @ 12:21:46] >>> Tracker's metadata:
[codecarbon INFO @ 12:21:46]   Platform system: Linux-5.4.0-182-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 12:21:46]   Python version: 3.11.7
[codecarbon INFO @ 12:21:46]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 12:21:46]   Available RAM : 7.637 GB
[codecarbon INFO @ 12:21:46]   CPU count: 4
[codecarbon INFO @ 12:21:46]   CPU model: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
[codecarbon INFO @ 12:21:46]   GPU count: None
[codecarbon INFO @ 12:21:46]   GPU model: None


In [2]:
# Open the "load dataset" task on the emissions tracker.
tracker.start_task("load dataset")

# Pull environment variables from the .env file two directories up.
load_dotenv("../../.env")

# Use the MLFLOW_SERVER environment variable as the MLflow tracking URI, then
# switch on scikit-learn autologging with a tag identifying this model.
mlflow_server_uri = os.environ.get("MLFLOW_SERVER")
mlflow.set_tracking_uri(mlflow_server_uri)
mlflow.sklearn.autolog(
    log_input_examples=True,
    extra_tags={"Model": "Logistic Regression"},
)



In [3]:
# importing data

x_train_path = '../data/x_train.pkl'
x_test_path = '../data/x_test.pkl'
y_train_path = '../data/y_train.pkl'
y_test_path = '../data/y_test.pkl'


def _load_split_as_numpy(path):
    """Unpickle one dataset split and return it as a NumPy array.

    Args:
        path: Location of a pickle file whose payload exposes ``to_numpy()``
            (presumably a pandas Series/DataFrame -- confirm against the
            code that wrote these files).

    Returns:
        The result of calling ``to_numpy()`` on the unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserializing; only point this at files from a trusted source.
    with open(path, 'rb') as file:
        return pickle.load(file).to_numpy()


# Reading the pickle files (one shared helper instead of four copies of the
# same open/load/convert sequence)
x_train = _load_split_as_numpy(x_train_path)
x_test = _load_split_as_numpy(x_test_path)
y_train = _load_split_as_numpy(y_train_path)
y_test = _load_split_as_numpy(y_test_path)

In [4]:
# Vectorizing the text data

# End the "load dataset" task before opening the next one. stop_task() closes
# only the active task; tracker.stop() finalizes the whole tracker and is
# reserved for the last cell of the notebook.
tracker.stop_task()
tracker.start_task("teach model")

# Fit the TF-IDF vectorizer on the training texts only, then apply the fitted
# vectorizer to the test texts so both share the same feature space.
vect = TfidfVectorizer()
xv_train = vect.fit_transform(x_train)
xv_test = vect.transform(x_test)


[codecarbon INFO @ 12:21:56] Energy consumed for RAM : 0.000002 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:21:56] Energy consumed for all CPUs : 0.000005 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:21:56] 0.000007 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:21:56] Energy consumed for RAM : 0.000002 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:21:56] Energy consumed for all CPUs : 0.000005 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:21:56] 0.000007 kWh of electricity used since the beginning.
  df = pd.concat(


In [5]:
# Train a logistic-regression classifier, constructed with no arguments (i.e.
# library defaults), on the TF-IDF training features and their labels.
# MLflow's sklearn autologging is enabled earlier in the notebook, and the
# cell output shows that this fit() call creates the autologging run.
# NOTE(review): confirm that the metrics logged later through
# mlflow.log_metric() end up in that same run rather than in a new one.
model = LogisticRegression()
model.fit(xv_train, y_train)

2024/06/07 12:22:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '92056efdb6aa421f995f68b0bbcc6540', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [6]:
# Predicting on the test set and reporting test metrics

# End the "teach model" task before opening the next one. stop_task() closes
# only the active task; tracker.stop() finalizes the whole tracker and is
# reserved for the last cell of the notebook.
tracker.stop_task()
tracker.start_task("predict")

y_pred = model.predict(xv_test)

# Compute every metric exactly once and reuse the value for both MLflow and
# the printed summary (previously each metric was evaluated twice, which also
# re-ran prediction inside model.score()).
test_accuracy = model.score(xv_test, y_test)
test_f1 = f1_score(y_test, y_pred, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
test_precision = precision_score(y_test, y_pred, average='weighted')

# log these values too with mlflow
mlflow.log_metric("testing_accuracy", test_accuracy)
mlflow.log_metric("testing_f1", test_f1)
mlflow.log_metric("testing_recall", test_recall)
mlflow.log_metric("testing_precision", test_precision)

print("Accuracy of the model: ", test_accuracy)
print("F1 Score of the model: ", test_f1)
print("Recall of the model: ", test_recall)
print("Precision of the model: ", test_precision)



[codecarbon INFO @ 12:24:13] Energy consumed for RAM : 0.000111 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:24:13] Energy consumed for all CPUs : 0.000291 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:24:13] 0.000401 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:24:13] Energy consumed for RAM : 0.000111 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:24:13] Energy consumed for all CPUs : 0.000291 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:24:13] 0.000401 kWh of electricity used since the beginning.
  df = pd.concat(


Accuracy of the model:  0.9891891891891892
F1 Score of the model:  0.9891901275607481
Recall of the model:  0.9891891891891892
Precision of the model:  0.9892136569635565


In [7]:
# Confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred)

# Render it as an annotated heatmap with integer cell counts
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')


# Print the overall accuracy followed by the per-class report
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Final stop of the tracker; as the cell's last expression, its return value
# is displayed as the cell output.
tracker.stop()


[codecarbon INFO @ 12:24:16] Energy consumed for RAM : 0.000114 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:24:16] Energy consumed for all CPUs : 0.000298 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:24:16] 0.000412 kWh of electricity used since the beginning.
[codecarbon INFO @ 12:24:16] Energy consumed for RAM : 0.000114 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 12:24:16] Energy consumed for all CPUs : 0.000298 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 12:24:16] 0.000412 kWh of electricity used since the beginning.


Accuracy: 0.9891891891891892
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1437
           1       0.99      0.99      0.99      1523

    accuracy                           0.99      2960
   macro avg       0.99      0.99      0.99      2960
weighted avg       0.99      0.99      0.99      2960



  df = pd.concat(


0.00015342269792012624