<a href="https://colab.research.google.com/github/mehdimerbah/COVID19_fake_news_detection/blob/main/models/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library and Data Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import json


In [None]:
# Load the pre-processed training / validation / testing splits straight from
# the project repository, in that order.
training, validation, testing = map(
    pd.read_csv,
    (
        'https://raw.githubusercontent.com/mehdimerbah/COVID19_fake_news_detection/main/preprocessing/processed_training_data.csv',
        'https://raw.githubusercontent.com/mehdimerbah/COVID19_fake_news_detection/main/preprocessing/processed_validation_data.csv',
        'https://raw.githubusercontent.com/mehdimerbah/COVID19_fake_news_detection/main/preprocessing/processed_testing_data.csv',
    ),
)

# Feature Extraction
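The feature extraction process is already described in detail in the SVM notebook, so here we simply define and run the pipeline: raw token counts, followed by TF-IDF weighting, followed by a logistic regression classifier.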

In [None]:
# Text-classification pipeline: raw token counts -> TF-IDF re-weighting ->
# logistic regression. Every stage uses its library defaults.
pipeline = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer()),
        ('tfidf_transformer', TfidfTransformer()),
        ('classifier', LogisticRegression()),
    ]
)

In [None]:
# Fit every pipeline stage on the training tweets and their labels.
pipeline.fit(X=training['tweet'], y=training['label'])

Pipeline(steps=[('count_vectorizer', CountVectorizer()),
                ('tfidf_transformer', TfidfTransformer()),
                ('classifier', LogisticRegression())])

# Making Predictions

In [None]:
# Predict a label for every tweet in the validation split, then display the
# resulting array as the cell output.
predictions = pipeline.predict(X=validation['tweet'])
predictions

array(['fake', 'real', 'fake', ..., 'fake', 'fake', 'real'], dtype=object)

# Results
We evaluate the model by comparing its predictions against the true labels of the validation set.

In [None]:
# classification_report expects the ground truth first and the predictions
# second. With the arguments reversed, per-class precision and recall trade
# places and the "support" column counts predicted labels instead of true ones.
print(classification_report(y_true=validation['label'], y_pred=predictions))

              precision    recall  f1-score   support

        fake       0.94      0.91      0.93      1053
        real       0.92      0.94      0.93      1087

    accuracy                           0.93      2140
   macro avg       0.93      0.93      0.93      2140
weighted avg       0.93      0.93      0.93      2140



In [None]:
def get_metrics(predicted, true):
    """Summarise classification quality as a dict of rounded scores.

    Parameters
    ----------
    predicted : array-like
        Labels produced by the model.
    true : array-like
        Ground-truth labels, aligned element-wise with ``predicted``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'`` (in
        that order), each rounded to 5 decimal places. Precision, recall and
        F1 are support-weighted averages over the classes.
    """
    # scikit-learn metric functions take (y_true, y_pred) in that order.
    # Passing them reversed swaps precision with recall and weights the
    # averages by predicted-class counts rather than true-class counts, so
    # the ground truth is deliberately passed first here.
    weighted_scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    metrics = {'accuracy': round(accuracy_score(true, predicted), 5)}
    for name, scorer in weighted_scorers.items():
        metrics[name] = round(scorer(true, predicted, average='weighted'), 5)

    return metrics

In [None]:
# Score the validation predictions against the validation labels and show
# the resulting summary dict.
metrics = get_metrics(predicted=predictions, true=validation['label'])
print(metrics)

{'accuracy': 0.92757, 'precision': 0.92794, 'recall': 0.92757, 'f1': 0.92754}


In [None]:
# Persist the validation metrics as JSON in the working directory.
with open("LR_results.json", mode="w") as results_file:
    results_file.write(json.dumps(metrics))