In [1]:
!pip install pgmpy

Defaulting to user installation because normal site-packages is not writeable
Collecting pgmpy
  Obtaining dependency information for pgmpy from https://files.pythonhosted.org/packages/c7/e6/e451590c2341b3d59d7b613e1af80daefd9e2873f7c9ad3d498ad84e7f44/pgmpy-0.1.26-py3-none-any.whl.metadata
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Collecting torch (from pgmpy)
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/78/18/7a2e56e2dc45a433dea9e1bf46a65e234294c9c470ccb4d4b53025f57b23/torch-2.5.0-cp311-cp311-win_amd64.whl.metadata
  Downloading torch-2.5.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting opt-einsum (from pgmpy)
  Obtaining dependency information for opt-einsum from https://files.pythonhosted.org/packages/23/cd/066e86230ae37ed0be70aae89aabf03ca8d9f39c8aea0dec8029455b5540/opt_einsum-3.4.0-py3-none-any.whl.metadata
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting xgboost (from pgmpy)
  Obta



In [44]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [45]:
# Read the raw SMS dataset from the working directory. The file is decoded as
# latin-1 rather than pandas' default UTF-8.
data = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')

In [46]:
# Preview the first rows of the raw CSV exactly as loaded.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [47]:
# Keep only the label and message columns and give them descriptive names.
# Renaming first (keys that are absent are ignored by `rename`) and selecting
# afterwards makes this cell safe to re-run once `data` has already been
# reduced; selecting 'v1'/'v2' directly would raise KeyError on a second run.
# `.copy()` makes the result an independent frame for the later in-place
# column assignments.
data = data.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']].copy()

In [48]:
# Confirm that only the renamed 'label' and 'text' columns remain.
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [49]:
# Preprocessing the text data
# Built once at cell execution instead of on every call.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalise one message for bag-of-words vectorisation.

    Steps, in order: lowercase, delete digit runs, delete ASCII punctuation
    (`string.punctuation`), then collapse every run of whitespace to a single
    space and trim both ends. Non-ASCII characters are left untouched.

    Parameters
    ----------
    text : str or None or float
        The raw message. ``None`` and NaN (how pandas represents a missing
        cell) are treated as an empty message; any other non-string value is
        converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned message; ``''`` when the input is missing or empty.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    cleaned = str(text).lower()
    cleaned = _DIGITS_RE.sub('', cleaned)
    cleaned = cleaned.translate(_PUNCT_TABLE)
    # Deleting digits/punctuation leaves gaps such as "in  a"; split/join
    # collapses those and strips leading/trailing whitespace in one pass.
    return ' '.join(cleaned.split())

In [50]:
# Clean every message element-wise; the 'text' column is replaced in place.
data['text'] = data['text'].map(preprocess_text)


In [51]:
# Convert the labels to binary (spam=1, ham=0).
# Values that are already encoded pass through unchanged, so re-running this
# cell keeps the column intact; a plain dict `.map` would turn every 0/1 into
# NaN on a second run.
label_codes = {'spam': 1, 'ham': 0}
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly on anything that is not a known label (including missing values)
# instead of silently carrying it into training.
unexpected_labels = set(data['label'].unique()) - set(label_codes.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected label values: {sorted(unexpected_labels, key=str)}"
    )

# Vectorize the text data (Bag of Words), keeping the 500 most frequent
# non-stop-word terms as dense count columns.
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test messages influence which
# terms are kept.
count_vectorizer = CountVectorizer(stop_words='english', max_features=500)
X = count_vectorizer.fit_transform(data['text']).toarray()

In [52]:
# Check the cleaned text and the numeric labels.
data.head()

Unnamed: 0,label,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [53]:
# Wrap the dense count matrix in a DataFrame (one column per vocabulary term)
# so pgmpy can address features by name, and append the target as the last
# column.
feature_names = count_vectorizer.get_feature_names_out()
X = pd.DataFrame(X, columns=feature_names).assign(label=data['label'].values)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
)

In [54]:
# Preview the training rows: word-count columns followed by 'label'.
train_data.head()

Unnamed: 0,able,abt,account,actually,address,aft,afternoon,age,ah,aight,...,yeah,year,years,yes,yesterday,yo,youre,yup,ìï,label
1978,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3935,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4078,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4086,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [55]:
# Preview the held-out rows: same columns as the training frame.
test_data.head()

Unnamed: 0,able,abt,account,actually,address,aft,afternoon,age,ah,aight,...,yeah,year,years,yes,yesterday,yo,youre,yup,ìï,label
3245,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1044,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2484,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [60]:
# Split the cleaned messages and their labels 80/20 for the Naive Bayes model.
# Same test fraction and seed as the feature-frame split above.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)



In [61]:
# Bayesian network structure: 'label' is the single parent node and every word
# count column is a child that depends on it.
# 'label' was appended as the last column of X, so every column before it is a
# word feature.
word_features = X.columns[:-1].tolist()
edges = [('label', word) for word in word_features]


In [62]:
# Create the Bayesian Model from the ('label', word) edge list; at this point
# only the graph structure is defined, no parameters have been fitted.
model = BayesianNetwork(edges)


In [63]:
# Fit the model using Maximum Likelihood Estimation (MLE) on the training rows.
# NOTE(review): with plain MLE a word count that never occurs in train_data
# presumably has no state in the fitted model, so passing it as evidence later
# may raise - confirm against the pgmpy version in use.
model.fit(train_data, estimator=MaximumLikelihoodEstimator)

# Build the Variable Elimination inference engine used for the label queries.
infer = VariableElimination(model)


In [64]:
# Naive Bayes baseline: bag-of-words counts -> TF-IDF weights -> MultinomialNB.
# Note that `count_vectorizer` is rebound here to a new, full-vocabulary
# vectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_transformer = TfidfTransformer()
nb_classifier = MultinomialNB()

# Learn the vocabulary and IDF weights from the training messages only.
X_train_counts = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Project the test messages with the already-fitted transformers.
X_test_counts = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Train the classifier and predict the held-out messages.
y_pred = nb_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)




In [66]:

# Recompute the Naive Bayes predictions on the test set so that `y_pred` holds
# the MultinomialNB output when the evaluation cell runs.
# NOTE(review): this repeats the predict call that ends the training cell.
y_pred = nb_classifier.predict(X_test_tfidf)



In [67]:
# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [68]:

# Report the confusion matrix and the per-class report under their headings,
# then the overall accuracy on one line.
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Confusion Matrix:
[[965   0]
 [ 35 115]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Accuracy: 0.968609865470852


In [65]:
# Make predictions on the test set
def predict_bayesian_network(test_data, inference_engine=None):
    """Predict the most probable 'label' for every row of `test_data`.

    Parameters
    ----------
    test_data : pandas.DataFrame
        One row per instance. Every column except 'label' is passed to the
        engine as evidence; a 'label' column, if present, is ignored so the
        target never leaks into the query.
    inference_engine : object, optional
        Object exposing ``map_query(variables=..., evidence=...)`` that returns
        a mapping containing 'label'. Defaults to the notebook-level `infer`.

    Returns
    -------
    list
        One entry per row, in row order: the predicted label, or ``None`` when
        the query for that row raised (the error is printed).

    Raises
    ------
    NameError
        If no engine is passed and the notebook-level `infer` is not defined.
    """
    engine = infer if inference_engine is None else inference_engine

    # Drop the target once rather than copying the frame on every iteration.
    features = test_data.drop(columns=['label'], errors='ignore')

    predictions = []
    for i, evidence in enumerate(features.to_dict(orient='records')):
        # A single bad row must not abort the whole batch: report it and keep
        # a None placeholder so the output stays aligned with the input rows.
        try:
            prediction = engine.map_query(variables=['label'], evidence=evidence)
            predictions.append(prediction['label'])
        except Exception as e:
            print(f"Error in prediction for instance {i}: {e}")
            predictions.append(None)  # In case of an error, append None or some default value

    return predictions

# Make predictions with the Bayesian network on the held-out feature rows.
# NOTE(review): `y_test` and `y_pred` are also assigned by the Naive Bayes
# cells; running this cell rebinds both names to the Bayesian-network values,
# and `y_pred` may contain None for rows whose query failed.
y_test = test_data['label']
y_pred = predict_bayesian_network(test_data)