In [32]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import pickle
import re

Authors: Netanel Shen, Gadi Yohanan

Introduction
This is a a project by Netanel Shen and Gadi Yohannan, where we attempt to train a model to recognize whether a certain article is real of fake.
in the 'dataset' folder we have CSV files with urls leading to real and fake articles (we took the dataset from a github repo).

## Answers

    1. we can ensure the data's quality because it's a known and tested repository, because it's extremly popular we assume it's also credible.

## Step 1: Define Functions

We start by defining two functions:
1. `fetch_article_content(url)`: This function fetches the content of an article from a given URL.
2. `process_articles(file_path, output_folder, max_articles)`: This function processes articles from a CSV file and stores their content in the specified output folder.

In [33]:
# Fetch article content function
def fetch_article_content(url):
    """Fetch article content from a URL."""
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            paragraphs = soup.find_all('p')
            content = ' '.join([para.get_text() for para in paragraphs])
            return content
        else:
            return None
    except requests.RequestException:
        return None



In [34]:
def process_articles(file_path, output_folder, max_articles):
    """Process articles from a CSV file and store content."""
    df = pd.read_csv(file_path)
    if 'news_url' not in df.columns:
        print(f"No 'news_url' column in {file_path}")
        return

    # Determine if the article is fake or real based on the filename
    category = 'fake' if 'fake' in file_path.lower() else 'real'
    output_folder = os.path.join(output_folder, category)

    os.makedirs(output_folder, exist_ok=True)

    count = 0
    for idx, row in df.iterrows():
        if count >= max_articles:
            break

        url = row['news_url']
        content = fetch_article_content(url)
        if content:
            output_file = os.path.join(output_folder, f'article_{os.path.basename(file_path)}_{idx}.txt')
            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(content)
            print(f"Processed article {idx} from {file_path} into {category} category")
            count += 1
        else:
            print(f"Failed to fetch content from {url}")


## Step 2: Define the Article Processing Function

Next, we define the `process_articles` function. This function:
- Reads a CSV file containing URLs of news articles.
- Categorizes the articles as 'fake' or 'real' based on the filename.
- Creates output directories for storing the processed articles.
- Fetches the content of each article and saves it as a text file in the appropriate folder.

## Step 3: Specify Input Files and Output Folder

We now specify the CSV files containing the article URLs and the folder where the processed articles will be saved.


In [35]:
csv_files = [
    'dataset/gossipcop_fake.csv',
    'dataset/gossipcop_real.csv',
    'dataset/politifact_fake.csv',
    'dataset/politifact_real.csv'
]

# Output folder for articles
output_folder = 'cleaned_articles'

# Maximum number of articles to process per file
max_articles = 100


## Step 4: Process Each CSV File

Finally, we process each CSV file using the `process_articles` function. The content of the articles will be fetched, categorized, and saved as text files in the specified output folder.


In [None]:
for csv_file in csv_files:
    process_articles(csv_file, output_folder, max_articles)

## Step 5: Load and clean the data
Next, we load and clean the data from the 'fake' and 'real' article folders. We use a function `clean_text` to remove stopwords, punctuation, and perform lemmatization using spaCy.
## Answers:
    2. in this step, we clean the data.
    * Punctuation Removal - same thing, reduce noise.
    * HTML tags removal - this is irrelevant to our data processing.
    * Special Characters removal - reduce noise
    * We also take care of null / NaN values by deleting them.
    
    3. Lowercasing - consistency in unique tokens.
    4. Lemmatization - reducing words to base form, reduces the number of unique tokens.
    5. Remove stop words is important because it reduces noise in the data, it adds additional data but no real information.


      


In [36]:
nlp = spacy.load('en_core_web_sm')

# Define the paths to the fake and real articles
fake_path = 'cleaned_articles/fake'
real_path = 'cleaned_articles/real'

# Function to clean and lemmatize text using spaCy
def clean_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    cleaned_text = ' '.join(tokens)
    cleaned_text = re.sub(r'<.*?>', '', cleaned_text)  # Remove HTML tags
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)  # Remove special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra whitespace
    return cleaned_text.strip()

# Load data from folder
def load_data_from_folder(folder, label):
    data = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            cleaned_text = clean_text(text)
            data.append((cleaned_text, label))
    return data

# Load fake and real data
fake_data = load_data_from_folder(fake_path, 'FAKE')
real_data = load_data_from_folder(real_path, 'REAL')

# Combine data into a single DataFrame
all_data = fake_data + real_data
df = pd.DataFrame(all_data, columns=['text', 'label'])

# Ensure no NaN values are present
df.dropna(subset=['text'], inplace=True)

# Save to a single CSV file
df.to_csv('cleaned_articles.csv', index=False)


## Step 6: Load Combined Data

We save the cleaned data into a CSV file and then load it back into a DataFrame. We ensure there are no NaN values in the text column.


In [37]:
# Load the combined CSV file
df = pd.read_csv('cleaned_articles.csv')

# Ensure no NaN values in the text column
df['text'].fillna('', inplace=True)


## Step 7: Split Data into Training and Test Sets

We split the data into training and test sets. This will allow us to train our models on the training set and evaluate their performance on the test set.


In [38]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.3, random_state=42)


## Step 8: Define and Train Models

We define several models and train them using a pipeline that includes a TF-IDF vectorizer and the classifier. We evaluate each model and print a classification report.

## Answers
    6. We are using scikit-Learn's pipeline class which allows to combine multiple processing steps. First, we use 'TfidVectorizer' for text vectorization and the classification models. We are converting the text into a numerical format before feeding it to the classifier.
    7. To ensure a fair split, the provided code uses train_test_split from Scikit-Learn to divide the data into training and testing sets. This function randomly splits the data based on the specified test_size parameter (20% for testing in this case) and a random_state to ensure reproducibility. This approach helps to maintain a representative sample in both the training and testing sets, reducing the risk of overfitting and ensuring the model's performance is evaluated on unseen data.
    8. Naive Bayes:
    Advantages: Simple, fast, works well with high-dimensional data, particularly effective for text classification.
    Disadvantages: Assumes independence among features, which is rarely true in practice.
    Random Forest:
    
    Advantages: Robust to overfitting, handles large datasets well, provides feature importance.
    Disadvantages: Can be slow and resource-intensive for large datasets, less interpretable than simpler models.
    Support Vector Machine (SVM):
    
    Advantages: Effective in high-dimensional spaces, works well with clear margin of separation.
    Disadvantages: Computationally expensive, less effective on large datasets, sensitive to the choice of kernel and parameters.
    K-Nearest Neighbors (KNN):
    
    Advantages: Simple, intuitive, no training phase.
    Disadvantages: Computationally expensive during prediction, sensitive to the choice of K and distance metric, struggles with high-dimensional data.
    Logistic Regression:
    
    Advantages: Simple, interpretable, works well for binary classification, less prone to overfitting.
    Disadvantages: Assumes linear relationship between features and the log-odds, less effective with complex relationships.



Answer:
The following metrics were used to evaluate the models' performance:

Precision: Measures the proportion of true positive predictions among all positive predictions made by the model.
Recall: Measures the proportion of true positive predictions among all actual positive instances.
F1 Score: The harmonic mean of Precision and Recall, providing a balance between the two.

In [39]:
# Define the models
models = [
    ('Naive Bayes', MultinomialNB()),
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('Logistic Regression', LogisticRegression(max_iter=300))
]

# Function to evaluate model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, pos_label='REAL')
    recall = recall_score(y_test, y_pred, pos_label='REAL')
    f1 = f1_score(y_test, y_pred, pos_label='REAL')
    return precision, recall, f1

# Initialize results dictionary
results = {}

# Evaluate each model and store the results
for name, model in models:
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', model),
    ])
    pipeline.fit(X_train, y_train)
    precision, recall, f1 = evaluate_model(pipeline, X_test, y_test)
    results[name] = {'Precision': precision, 'Recall': recall, 'F1-Score': f1}
    y_pred = pipeline.predict(X_test)
    print(f"--- {name} ---")
    print(classification_report(y_test, y_pred))


--- Naive Bayes ---
              precision    recall  f1-score   support

        FAKE       1.00      0.09      0.16        35
        REAL       0.63      1.00      0.77        55

    accuracy                           0.64        90
   macro avg       0.82      0.54      0.47        90
weighted avg       0.78      0.64      0.53        90

--- Random Forest ---
              precision    recall  f1-score   support

        FAKE       1.00      0.31      0.48        35
        REAL       0.70      1.00      0.82        55

    accuracy                           0.73        90
   macro avg       0.85      0.66      0.65        90
weighted avg       0.81      0.73      0.69        90

--- SVM ---
              precision    recall  f1-score   support

        FAKE       1.00      0.11      0.21        35
        REAL       0.64      1.00      0.78        55

    accuracy                           0.66        90
   macro avg       0.82      0.56      0.49        90
weighted avg       0

## Step 9: Display Model Performance

We create a DataFrame to display the performance of each model in terms of precision, recall, and F1-score.

## Answers:
    9. this is also the answer to question 9.


In [40]:
# Create a DataFrame to display the results
results_df = pd.DataFrame(results).T
print(results_df)


                     Precision    Recall  F1-Score
Naive Bayes           0.632184  1.000000  0.774648
Random Forest         0.696203  1.000000  0.820896
SVM                   0.639535  1.000000  0.780142
KNN                   0.670886  0.963636  0.791045
Logistic Regression   0.632184  1.000000  0.774648


## Step 10: Save the Best Model

Answers:
    10. We identify the best performing model to be the one with the highest F1 score - Random Forest. Though SVM and KNN were more consistend and also had pretty good F1 scores. We will save the RandomForest using 'pickle' into the file 'best_model.pkl',


In [41]:
best_model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier()),
])
best_model.fit(X_train, y_train)

# Save the trained model to a file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

## Step 11: Predict on New Articles

We define a function to clean and lemmatize new articles using spaCy, and then use the saved model to predict whether the new article is fake or real.


In [42]:
# Function to clean and lemmatize text using spaCy for new articles
def clean_text_new_article(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    cleaned_text = ' '.join(tokens)
    cleaned_text = re.sub(r'<.*?>', '', cleaned_text)  # Remove HTML tags
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', cleaned_text)  # Remove special characters
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra whitespace
    return cleaned_text.strip()

# Example article for testing
new_article = """
Hillary Clinton had a third and most likely fatal heart attack this afternoon after spending the morning being told she'd be getting better. It almost seems fitting that the last thing she hears is a lie about her own wellbeing. The Butcher of Benghazi will meet her maker, says Doctor Eugene Icsa of Westchester Memorial Hospital in upstate New York, as the damage to her heart is irreparable at this point. Secretary Clinton fought hard, but today her fight is over. We predict she'll be at rest within hours. The Clinton family has asked for privacy and wouldn't answer questions about whether or not they had to decide to pull the plug or if they're simply being told nothing can be done. Chelsea Clinton was seen entering the hospital in tears shortly after noon according to a new report from LLOD correspondent Skip Tetheluda. Chelsea came alone and was obviously distraught. She made no comment to the press but did stop to tell one photographer to have some respect while she visits a great woman for the last time. She hasn't come out and is presumably sitting and waiting to say goodbye. Bill Clinton is sitting on the front porch of the Chappaqua mansion drinking what looks like either tomato juice or a Bloody Mary. We'll update you as soon as we confirm that Clinton has gone on to answer for her crimes with an eternity in Hell.
"""

# Preprocess the new article
cleaned_article = clean_text_new_article(new_article)

# Make prediction using the best model
with open('best_model.pkl', 'rb') as file:
    model = pickle.load(file)

prediction = model.predict([cleaned_article])
print("Prediction for the new article:", "REAL" if prediction[0] == 'REAL' else "FAKE")


Prediction for the new article: REAL
