In [14]:
import pandas as pd
import numpy as np
import re

def preprocess(text):
    # Convert to lowercase
    text = text.lower()

    # Remove special characters and punctuation
    text = re.sub('[^a-zA-Z0-9\s]', '', text)

    return text

def train(df):
    # Preprocess the text data
    df['title'] = df['title'].apply(preprocess)
    df['text'] = df['text'].apply(preprocess)
    df['subject'] = df['subject'].apply(preprocess)

    # Create vocabulary
    vocabulary = set()
    for text in df['title']:
        vocabulary.update(text.split())
    for text in df['text']:
        vocabulary.update(text.split())

    # Calculate prior probabilities
    num_real = (df['label'] == 'real').sum()
    num_fake = (df['label'] == 'fake').sum()
    total = num_real + num_fake
    prior_real = num_real / total
    prior_fake = num_fake / total

    # Calculate conditional probabilities
    count_real = np.zeros(len(vocabulary))
    count_fake = np.zeros(len(vocabulary))
    for i, word in enumerate(vocabulary):

        # if i == int(len(vocabulary) / 10000):
        #     break

        print(f"Training {i} out of {len(vocabulary)}")

        for text, label in zip(df['text'], df['label']):
            if word in text.split():
                if label == 'real':
                    count_real[i] += 1
                else:
                    count_fake[i] += 1

    conditional_real = (count_real + 1) / (num_real + 2)
    conditional_fake = (count_fake + 1) / (num_fake + 2)

    # Store model parameters
    model = {
        'vocabulary': sorted(vocabulary),
        'prior_real': prior_real,
        'prior_fake': prior_fake,
        'conditional_real': conditional_real,
        'conditional_fake': conditional_fake
    }

    print("Done Training!")

    return model

def evaluate(df, model):
    # Preprocess the text data
    df['title'] = df['title'].apply(preprocess)
    df['text'] = df['text'].apply(preprocess)
    df['subject'] = df['subject'].apply(preprocess)

    # Create document-term matrix
    X = np.zeros((len(df), len(model['vocabulary'])))
    for i, text in enumerate(df['text']):
        
        # if i == 10:
        #     break
        print(f"Matrix {i} out of {len(df['text'])}")

        for j, word in enumerate(model['vocabulary']):
            X[i,j] = text.split().count(word)

    # Make predictions
    y_true = (df['label'] == 'real').astype(int)
    y_pred = []
    for i in range(len(df)):

        # if i == 10:
        #     break
        print(f"Evaluating {i} out of {len(df)}")

        log_prob_real = np.log(model['prior_real'])
        log_prob_fake = np.log(model['prior_fake'])
        for j in range(len(model['vocabulary'])):
            if X[i,j] > 0:
                log_prob_real += X[i,j] * np.log(model['conditional_real'][j])
                log_prob_fake += X[i,j] * np.log(model['conditional_fake'][j])
        if log_prob_real > log_prob_fake:
            y_pred.append(1)
        else:
            y_pred.append(0)

    # Calculate accuracy
    accuracy = sum(y_pred == y_true) / len(df)

    return accuracy

# Load the datasets
train_df = pd.read_csv('../datasets/test1.csv')
test_df = pd.read_csv('../datasets/test2.csv')

# Train the Naive Bayes classifier
model = train(train_df)

# Evaluate the Naive Bayes classifier on the testing dataset
accuracy = evaluate(test_df, model)

print('Accuracy: {:.2f}%'.format(accuracy * 100))


Training 0 out of 210007
Training 1 out of 210007
Training 2 out of 210007
Training 3 out of 210007
Training 4 out of 210007
Training 5 out of 210007
Training 6 out of 210007
Training 7 out of 210007
Training 8 out of 210007
Training 9 out of 210007
Training 10 out of 210007
Training 11 out of 210007
Training 12 out of 210007
Training 13 out of 210007
Training 14 out of 210007
Training 15 out of 210007
Training 16 out of 210007
Training 17 out of 210007
Training 18 out of 210007
Training 19 out of 210007
Training 20 out of 210007
Done Training!
Matrix 0 out of 5000
Matrix 1 out of 5000
Matrix 2 out of 5000
Matrix 3 out of 5000
Matrix 4 out of 5000
Matrix 5 out of 5000
Matrix 6 out of 5000
Matrix 7 out of 5000
Matrix 8 out of 5000
Matrix 9 out of 5000
Evaluating 0 out of 5000
Evaluating 1 out of 5000
Evaluating 2 out of 5000
Evaluating 3 out of 5000
Evaluating 4 out of 5000
Evaluating 5 out of 5000
Evaluating 6 out of 5000
Evaluating 7 out of 5000
Evaluating 8 out of 5000
Evaluating 9 o

ValueError: ('Lengths must match to compare', (5000,), (10,))