# Fake And Real News Detection With Decision Tree

In [12]:
# Importing the required libraries
import os
import pickle
import shutil

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from codecarbon import EmissionsTracker
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [13]:
# Start CodeCarbon tracking for this notebook under the "Decision_tree" project/task.
tracker = EmissionsTracker(
    project_name="Decision_tree",
    measure_power_secs=10,
)
tracker.start_task("Decision_tree")

# Read the repository-level .env file into the process environment.
load_dotenv("../../.env")

# Point MLflow at the server named by MLFLOW_SERVER. If the variable is unset,
# os.environ.get returns None and that None is passed straight to MLflow.
tracking_uri = os.environ.get("MLFLOW_SERVER")
mlflow.set_tracking_uri(tracking_uri)

# Enable scikit-learn autologging: input examples are logged, model signatures are not.
# NOTE(review): the tag value is misspelled ("Desicion"); it is left unchanged here
# because it is a runtime string that existing MLflow runs may already carry.
autolog_options = {
    "log_input_examples": True,
    "extra_tags": {"Model": "Desicion Tree"},
    "log_model_signatures": False,
}
mlflow.sklearn.autolog(**autolog_options)

[codecarbon INFO @ 15:58:24] [setup] RAM Tracking...


[codecarbon INFO @ 15:58:24] [setup] GPU Tracking...
[codecarbon INFO @ 15:58:24] No GPU found.
[codecarbon INFO @ 15:58:24] [setup] CPU Tracking...
[codecarbon INFO @ 15:58:26] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
[codecarbon INFO @ 15:58:26] >>> Tracker's metadata:
[codecarbon INFO @ 15:58:26]   Platform system: Linux-5.4.0-182-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 15:58:26]   Python version: 3.11.7
[codecarbon INFO @ 15:58:26]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 15:58:26]   Available RAM : 7.637 GB
[codecarbon INFO @ 15:58:26]   CPU count: 4
[codecarbon INFO @ 15:58:26]   CPU model: Intel(R) Core(TM) i7-7500U CPU @ 2.70GHz
[codecarbon INFO @ 15:58:26]   GPU count: None
[codecarbon INFO @ 15:58:26]   GPU model: None


In [14]:
# Load the pre-split train/test data from pickle files and convert each object
# to a NumPy array via its .to_numpy() method.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.

x_train_path = '../data/x_train.pkl'
x_test_path = '../data/x_test.pkl'
y_train_path = '../data/y_train.pkl'
y_test_path = '../data/y_test.pkl'


def _load_as_array(path):
    """Unpickle the object stored at ``path`` and return its ``.to_numpy()`` result."""
    with open(path, 'rb') as handle:
        return pickle.load(handle).to_numpy()


x_train = _load_as_array(x_train_path)
x_test = _load_as_array(x_test_path)
y_train = _load_as_array(y_train_path)
y_test = _load_as_array(y_test_path)



In [15]:
# Vectorizing the text data with TF-IDF.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
tfidf_params = {'stop_words': 'english', 'max_df': 0.7}
vect = TfidfVectorizer(**tfidf_params)

xv_train = vect.fit_transform(x_train)
xv_test = vect.transform(x_test)

In [16]:
# Train the decision tree on the TF-IDF features.
# A fixed random_state makes the fitted tree reproducible across runs; without
# it, ties between equally good splits can be resolved differently each time.
# NOTE(review): the output saved with this cell is a ValueError about mismatched
# label/sample counts, while later cells show results from earlier executions.
# The cause is not visible here - re-run from a fresh kernel to confirm.
RANDOM_STATE = 42

model = DecisionTreeClassifier(random_state=RANDOM_STATE)
model.fit(xv_train, y_train)



ValueError: Number of labels=6905 does not match number of samples=337585450

In [6]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(xv_test)

# Compute every metric exactly once and reuse the values for logging, printing
# and display. This avoids re-predicting the test set and avoids duplicate
# entries from MLflow's autologging of repeated metric calls.
test_accuracy = model.score(xv_test, y_test)
test_f1 = f1_score(y_test, y_pred, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
test_precision = precision_score(y_test, y_pred, average='weighted')

# (MLflow metric key, printed label, value), in the order they are reported.
test_metrics = [
    ("testing_accuracy", "Accuracy", test_accuracy),
    ("testing_f1", "F1 Score", test_f1),
    ("testing_recall", "Recall", test_recall),
    ("testing_precision", "Precision", test_precision),
]

# Log these values explicitly with MLflow as well.
for metric_key, _label, value in test_metrics:
    mlflow.log_metric(metric_key, value)

for _metric_key, label, value in test_metrics:
    print(f"{label} of the model: ", value)

# Last expression: show the test accuracy as the cell output.
test_accuracy



Accuracy of the model:  0.9993243243243243
F1 Score of the model:  0.9993243107395774
Recall of the model:  0.9993243243243243
Precision of the model:  0.9993252104563579


0.9993243243243243

In [11]:
# Build the confusion matrix for the test-set predictions.
cm = confusion_matrix(y_test, y_pred)

# Plot it as an annotated heatmap (integer cell counts).
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')
ax.set_title('Confusion matrix - Decision Tree (test set)')

# Show the accuracy and the per-class report.
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save the fitted model locally. MLflow refuses to write into an existing,
# non-empty directory, so a previous export is removed first to keep this cell
# re-runnable (Restart & Run All).
MODEL_DIR = "model"
if os.path.isdir(MODEL_DIR):
    shutil.rmtree(MODEL_DIR)
mlflow.sklearn.save_model(model, MODEL_DIR)

# Stop the CodeCarbon tracker and display the value it returns.
# NOTE(review): CodeCarbon documents this value as emissions in kg CO2-eq (not a
# duration) - confirm for the installed version.
emissions_kg = tracker.stop()
emissions_kg


Accuracy: 0.9993243243243243
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1437
           1       1.00      1.00      1.00      1523

    accuracy                           1.00      2960
   macro avg       1.00      1.00      1.00      2960
weighted avg       1.00      1.00      1.00      2960



[codecarbon INFO @ 14:49:43] Energy consumed for RAM : 0.000529 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 14:49:43] Energy consumed for all CPUs : 0.001386 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 14:49:43] 0.001915 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:49:43] Energy consumed for RAM : 0.000529 kWh. RAM Power : 2.863922595977783 W
[codecarbon INFO @ 14:49:44] Energy consumed for all CPUs : 0.001386 kWh. Total CPU Power : 7.5 W
[codecarbon INFO @ 14:49:44] 0.001915 kWh of electricity used since the beginning.
  df = pd.concat(


0.0007135828976375454