In [3]:
!pip install pgmpy==0.1.23

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Load the dataset (decoded as latin-1)
data = pd.read_csv('spam.csv', encoding='latin-1')

# Keep only the two relevant columns and give them descriptive names.
# The explicit .copy() detaches the selection from the full frame so that the
# column assignment below operates on an independent object and does not
# trigger pandas' SettingWithCopyWarning.
data = data[['v1', 'v2']].copy()
data.columns = ['label', 'message']

# Encode the labels as integers: ham -> 0, spam -> 1
# (LabelEncoder numbers the classes in sorted order).
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Show the first rows as a quick sanity check of the cleaned frame
print(data.head())


Collecting pgmpy==0.1.23
  Downloading pgmpy-0.1.23-py3-none-any.whl.metadata (6.3 kB)
Downloading pgmpy-0.1.23-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.23
   label                                            message
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator

# Step 1: Read the CSV and reduce it to a two-column (label, message) frame,
# with the textual labels mapped to integers (ham -> 0, spam -> 1)
data = pd.read_csv("spam.csv", encoding='latin-1')
cleaned_data = (
    data[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
)

# Step 2: Hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_data['message'],
    cleaned_data['label'],
    random_state=42,
    test_size=0.2,
)

# Step 3: Build TF-IDF features. The vocabulary (English stop words removed,
# capped at 1000 terms) is fitted on the training messages only and then
# applied unchanged to the test messages.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 4: Collapse the TF-IDF weights to dense 0/1 indicators
# (1 = the word occurs in the message, 0 = it does not)
X_train_binary, X_test_binary = (
    (tfidf > 0).astype(int).toarray()
    for tfidf in (X_train_tfidf, X_test_tfidf)
)

# Step 5: Create a Bayesian Network model
# `nodes` holds the class node 'label' at index 0, followed by one
# 'word_<i>' node per column of the binary feature matrix.
nodes = ['label'] + [f'word_{i}' for i in range(X_train_binary.shape[1])]

# Define edges where the label influences all word nodes.
# The edges are derived from the actual word nodes rather than a hard-coded
# count, so the graph always matches the number of feature columns even when
# the fitted vocabulary is smaller than the vectorizer's max_features cap.
edges = [('label', word_node) for word_node in nodes[1:]]
model = BayesianNetwork(edges)

# Step 6: Assemble the training frame: one column per word node (nodes[1:])
# plus the 'label' column. y_train.values is used so the labels are attached
# by position rather than aligned on the original row index.
train_data_df = (
    pd.DataFrame(X_train_binary, columns=nodes[1:])
    .assign(label=y_train.values)
)

# Estimate the network's parameters from the training frame with
# Maximum Likelihood Estimation
model.fit(train_data_df, estimator=MaximumLikelihoodEstimator)

# Step 7: Predict the label for every test message. The test frame carries
# only the word columns; the 'label' column is read from the prediction result.
# NOTE(review): the recorded output below stops at 0/1012 on the progress
# bar, so this step presumably had not finished when saved. TODO confirm.
test_data_df = pd.DataFrame(data=X_test_binary, columns=nodes[1:])
y_pred = model.predict(test_data_df).loc[:, 'label']

# Step 8: Summarise per-class precision/recall/F1 against the true labels
report = classification_report(
    y_test,
    y_pred,
    target_names=['Ham', 'Spam'],
)
print(report)


  0%|          | 0/1012 [00:00<?, ?it/s]