# Fake And Real News Detection With Decision Tree

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report   
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score 
import pickle
from dotenv import load_dotenv
import os
import mlflow


In [2]:
# Pull environment variables (including the MLflow server address) from the
# project-level .env file.
load_dotenv("../../.env")

# os.environ.get returns None when MLFLOW_SERVER is not defined; that value is
# handed to MLflow unchanged.
tracking_uri = os.environ.get("MLFLOW_SERVER")
mlflow.set_tracking_uri(tracking_uri)

# Enable scikit-learn autologging and tag every autologged run with the model name.
# NOTE(review): the tag value is misspelled ("Desicion"); it is kept as-is so it
# still matches any runs already tagged this way - confirm before correcting.
mlflow.sklearn.autolog(
    log_input_examples=True,
    extra_tags={"Model": "Desicion Tree"},
)

In [3]:
# Load the pickled train/test inputs and labels from ../data.
# (presumably produced by an earlier preprocessing/split step - TODO confirm)

x_train_path = '../data/x_train.pkl'
x_test_path = '../data/x_test.pkl'
y_train_path = '../data/y_train.pkl'
y_test_path = '../data/y_test.pkl'


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Same load order as before: inputs first, then labels.
x_train = _read_pickle(x_train_path)
x_test = _read_pickle(x_test_path)
y_train = _read_pickle(y_train_path)
y_test = _read_pickle(y_test_path)





In [4]:
# Vectorizing the text data with TF-IDF.
# English stop words are removed, and max_df=0.7 ignores terms that occur in
# more than 70% of the documents.
tfidf_options = {'stop_words': 'english', 'max_df': 0.7}
vect = TfidfVectorizer(**tfidf_options)

# The vocabulary/IDF weights are learned from the training texts only and then
# reused to transform the test texts.
xv_train = vect.fit_transform(x_train)
xv_test = vect.transform(x_test)


In [5]:
# Train a decision tree on the TF-IDF features.
# scikit-learn's tree builder permutes candidate features at every split, so
# without a fixed random_state the fitted tree (and every metric reported below)
# can change from one "Restart & Run All" to the next. Pin the seed so the run
# is reproducible.
SEED = 42
model = DecisionTreeClassifier(random_state=SEED)

# With sklearn autologging enabled, this fit call is what triggers the MLflow
# run that records the hyperparameters and the fitted model.
model.fit(xv_train, y_train)


2024/05/05 16:07:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5af3146537924871b38de5abc7ba1231', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [6]:
# Predict on the held-out test split.
y_pred = model.predict(xv_test)

# Compute every metric exactly once and reuse the values for logging, printing
# and the cell output. Recomputing them is wasted work and, with sklearn
# autologging enabled, repeated metric calls may also be recorded repeatedly.
test_accuracy = model.score(xv_test, y_test)
test_f1 = f1_score(y_test, y_pred, average='weighted')
test_recall = recall_score(y_test, y_pred, average='weighted')
test_precision = precision_score(y_test, y_pred, average='weighted')

test_metrics = {
    "testing_accuracy": test_accuracy,
    "testing_f1": test_f1,
    "testing_recall": test_recall,
    "testing_precision": test_precision,
}

# The run created by autologging is closed as soon as model.fit() returns, so a
# bare mlflow.log_metric() here would silently open a brand-new run (and leave
# it open), detaching the test metrics from the model they describe.
# Re-open the most recent run instead so everything lands in the same place.
autolog_run = mlflow.last_active_run()
if mlflow.active_run() is not None or autolog_run is None:
    # Either the user manages a run explicitly, or there is no previous run to
    # attach to: fall back to MLflow's default behaviour.
    mlflow.log_metrics(test_metrics)
else:
    with mlflow.start_run(run_id=autolog_run.info.run_id):
        mlflow.log_metrics(test_metrics)

print("Accuracy of the model: ", test_accuracy)
print("F1 Score of the model: ", test_f1)
print("Recall of the model: ", test_recall)
print("Precision of the model: ", test_precision)

# Last expression of the cell: display the test accuracy as the cell output.
test_accuracy



Accuracy of the model:  0.7200769829245321
F1 Score of the model:  0.7201077789757181
Recall of the model:  0.7200769829245321
Precision of the model:  0.7207090709649152


In [7]:
# Build the confusion matrix from the true and predicted test labels.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap (integer cell counts) on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')

# Show the overall accuracy followed by the per-class precision/recall/F1 report.
overall_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', overall_accuracy)
print(classification_report(y_test, y_pred))




Accuracy: 0.7200769829245321
              precision    recall  f1-score   support

           0       0.70      0.73      0.72     11900
           1       0.74      0.71      0.72     12521

    accuracy                           0.72     24421
   macro avg       0.72      0.72      0.72     24421
weighted avg       0.72      0.72      0.72     24421

