In [10]:
!pip install pgmpy==0.1.23



In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator

In [12]:
# Step 1: Read spam.csv, keep the two columns of interest under readable names,
# and encode the label as an integer (ham -> 0, spam -> 1)
data = pd.read_csv("spam.csv", encoding='latin-1')
label_codes = {'ham': 0, 'spam': 1}
cleaned_data = (
    data[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: frame['label'].map(label_codes))
)

In [13]:
# Step 2: Hold out 20% of the rows for testing; the fixed random_state makes
# the split repeatable
messages = cleaned_data['message']
labels = cleaned_data['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages, labels, test_size=0.2, random_state=42
)

In [14]:
# Step 3: Fit a TF-IDF vectorizer on the training messages only, then apply the
# fitted vectorizer to the test messages
tfidf_options = {'stop_words': 'english', 'max_features': 1000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
# Step 4: Reduce the TF-IDF weights to 0/1 indicators (1 where the weight is
# positive, i.e. the word is present in the message)
def _to_presence_matrix(tfidf_matrix):
    """Return a dense integer array holding 1 wherever ``tfidf_matrix`` is > 0."""
    return (tfidf_matrix > 0).astype(int).toarray()


X_train_binary = _to_presence_matrix(X_train_tfidf)
X_test_binary = _to_presence_matrix(X_test_tfidf)

In [16]:
# Step 5: Name the Bayesian Network's nodes: 'label' first, followed by one
# 'word_<i>' node per column of the binary feature matrix
n_words = X_train_binary.shape[1]
nodes = ['label', *(f'word_{i}' for i in range(n_words))]

In [17]:
# Define edges where the label influences all word nodes.
# The word nodes are taken from `nodes` (sized from the feature matrix) rather
# than a hard-coded range(1000): max_features is only an upper bound on the
# vocabulary, so a fixed count could create edges to word nodes that have no
# matching column in the training data.
edges = [('label', word_node) for word_node in nodes[1:]]
model = BayesianNetwork(edges)

In [18]:
# Step 6: Assemble the training frame: one column per word node, plus the
# target values in a trailing 'label' column
train_data_df = (
    pd.DataFrame(X_train_binary, columns=nodes[1:])
    .assign(label=y_train.values)
)

In [None]:
# Fit the network's parameters from the training frame with the
# maximum-likelihood estimator
model.fit(data=train_data_df, estimator=MaximumLikelihoodEstimator)

In [None]:
# Step 7: Build the test frame with the same word columns (no 'label' column),
# let the model predict, and keep the predicted 'label' column
test_data_df = pd.DataFrame(data=X_test_binary, columns=nodes[1:])
predictions = model.predict(test_data_df)
y_pred = predictions['label']

In [None]:
# Step 8: Compare predictions with the held-out labels and print the
# per-class report
class_names = ['Ham', 'Spam']
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=class_names,
)
print(report)
