In [44]:
import os
import numpy as np
import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.models import BayesianNetwork
from sklearn.metrics import accuracy_score
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
from sklearn.model_selection import train_test_split
from pgmpy.estimators import HillClimbSearch, BicScore
from pgmpy.estimators import MaximumLikelihoodEstimator

In [45]:


def read_data_from_csv(path):
    """Load datasets from CSV files.
    Args:
        path (str): Path to the CSV file.
    Returns:
        X (np.ndarray): Features of samples.
        y (np.ndarray): Labels of samples, only provided in the public datasets.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    assert os.path.splitext(path)[
        -1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'

    data = pd.read_csv(path)
    column_list = data.columns.values.tolist()

    if 'Label' in column_list:
        # for the public dataset, label column is provided.
        column_list.remove('Label')
        X = data[column_list].values
        y = data['Label'].values
        return X, y
    else:
        # for the private dataset, label column is not provided.
        X = data[column_list].values
        return X
    

In [46]:
X_public, y_public = read_data_from_csv('assignment_8_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (6499, 22)
print('Shape of y_public:', y_public.shape)  # n_sample (6499,)

Shape of X_public: (6499, 22)
Shape of y_public: (6499,)


In [47]:
# Create a dictionary to map old column names to new column names
# cap-shape	cap-surface	cap-color	bruises	odor	gill-attachment	gill-spacing	gill-size	gill-color	stalk-shape	stalk-root	stalk-surface-above-ring	stalk-surface-below-ring	stalk-color-above-ring	stalk-color-below-ring	veil-type	veil-color	ring-number	ring-type	spore-print-color	population	habitat

new_column_names = {
    0: "cap-shape",
    1: "cap-surface",
    2: "cap-color",
    3: "bruises",
    4: "odor",
    5: "gill-attachment",
    6: "gill-spacing",
    7: "gill-size",
    8: "gill-color",
    9: "stalk-shape",
    10: "stalk-root",
    11: "stalk-surface-above-ring",
    12: "stalk-surface-below-ring",
    13: "stalk-color-above-ring",
    14: "stalk-color-below-ring",
    15: "veil-type",
    16: "veil-color",
    17: "ring-number",
    18: "ring-type",
    19: "spore-print-color",
    20: "population",
    21: "habitat"
}

# Use the rename() method to change the column names
df_x = pd.DataFrame(X_public).rename(columns=new_column_names)
df_y = pd.DataFrame(y_public).rename(columns={0: 'Label'})

df_public = pd.concat([df_x, df_y], axis=1).replace('?',np.nan)

df_public

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Label
0,convex,fibrous,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,woods,poisonous
1,knobbed,smooth,red,no,spicy,free,close,narrow,buff,tapering,...,pink,white,partial,white,one,evanescent,white,several,leaves,poisonous
2,convex,fibrous,white,no,none,free,crowded,broad,pink,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
3,flat,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
4,convex,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,scattered,grasses,edible
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,bell,smooth,gray,no,none,free,crowded,broad,gray,enlarging,...,white,white,partial,white,two,pendant,white,numerous,grasses,edible
6495,convex,scaly,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,grasses,poisonous
6496,bell,scaly,yellow,bruises,almond,free,close,broad,black,enlarging,...,white,white,partial,white,one,pendant,black,scattered,meadows,edible
6497,convex,scaly,red,no,spicy,free,close,narrow,buff,tapering,...,white,white,partial,white,one,evanescent,white,several,paths,poisonous


In [5]:
df_public.isna().sum().sum()

1974

In [15]:
df_public.value_counts(['Label']).sort_index(ascending=True).to_frame()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
edible,3359
poisonous,3140


In [48]:
# guessing stalk-root
'cap-shape', 
'odor', 
'gill-spacing', 
['stalk-surface-above-ring', 'stalk-surface-below-ring'],
['stalk-color-above-ring', 'stalk-color-below-ring']

['stalk-color-above-ring', 'stalk-color-below-ring']

By above analysis on stalk-root, we can fill the missing data with the rule:
1. odor(foul) --> bulbous
2. stalk-color-above-ring(brown | buff | gray | pink) --> bulbous
3. stalk-surface-above-ring(silky) & stalk-surface-below-ring(silky) --> bulbous
4. odor(pungent) --> equal



In [49]:
df_puNew = df_public

# Fill missing values based on mappings
df_puNew.loc[(df_puNew['odor'] == 'foul') & (df_puNew['stalk-root'].isna()), 'stalk-root'] = 'bulbous'
df_puNew.loc[(df_puNew['stalk-color-above-ring'].isin(['brown', 'buff', 'gray', 'pink'])) & (df_puNew['stalk-root'].isna()), 'stalk-root'] = 'bulbous'
df_puNew.loc[(df_puNew['stalk-surface-above-ring'] == 'silky') & (df_puNew['stalk-surface-below-ring'] == 'silky') & (df_puNew['stalk-root'].isna()), 'stalk-root'] = 'bulbous'
df_puNew.loc[(df_puNew['odor'] == 'pungent') & (df_puNew['stalk-root'].isna()), 'stalk-root'] = 'equal'


df_puNew.loc[(df_puNew['gill-spacing'] == 'close') & (df_puNew['stalk-root'].isna()), 'stalk-root'] = 'bulbous'
df_puNew.loc[(df_puNew['stalk-root'].isna()), 'stalk-root'] = 'equal'


In [17]:
df_puNew.isna().sum().sum()

0

In [32]:
df_puNew

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Label
0,convex,fibrous,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,woods,poisonous
1,knobbed,smooth,red,no,spicy,free,close,narrow,buff,tapering,...,pink,white,partial,white,one,evanescent,white,several,leaves,poisonous
2,convex,fibrous,white,no,none,free,crowded,broad,pink,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
3,flat,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
4,convex,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,scattered,grasses,edible
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,bell,smooth,gray,no,none,free,crowded,broad,gray,enlarging,...,white,white,partial,white,two,pendant,white,numerous,grasses,edible
6495,convex,scaly,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,grasses,poisonous
6496,bell,scaly,yellow,bruises,almond,free,close,broad,black,enlarging,...,white,white,partial,white,one,pendant,black,scattered,meadows,edible
6497,convex,scaly,red,no,spicy,free,close,narrow,buff,tapering,...,white,white,partial,white,one,evanescent,white,several,paths,poisonous


In [None]:
# Convert categorical variables to numerical labels
# data_encoded = df_puNew.apply(lambda x: pd.factorize(x)[0])

# # Split the dataset into features and target
# features = data_encoded.drop('Label', axis=1)
# target = data_encoded['Label']
# data_encoded

In [50]:
df_spFeatures = df_puNew.drop(['Label'], axis=1)
df_spLabel = df_puNew['Label']

x_train, x_test, y_train, y_test = train_test_split(df_spFeatures, df_spLabel, test_size=0.1, random_state=42)

In [22]:
y_train.shape

(5849,)

In [51]:
# Define the structure
model= BayesianNetwork([
    ('spore-print-color','Label'),
    ('ring-type','Label'),
    ('stalk-surface-below-ring','Label'),
    ('gill-size','Label'),
    ('stalk-surface-above-ring','Label'),
    ('bruises','Label'),
])

model.fit(pd.concat([x_train, y_train], axis=1), estimator=MaximumLikelihoodEstimator)

In [52]:

# Create an inference object
infer = VariableElimination(model)

# Define the evidence with valid state names
evidence = {'spore-print-color': 'brown', 'ring-type': 'flaring'}

# Perform inference
print('\nProbability of mushroom given evidence:', evidence)
q = infer.query(variables=['Label'], evidence=evidence)
print(q)

# Make the prediction
prediction = infer.map_query(variables=['Label'], evidence=evidence)
print('Predicted label:', prediction['Label'])


Probability of mushroom given evidence: {'spore-print-color': 'brown', 'ring-type': 'flaring'}
+------------------+--------------+
| Label            |   phi(Label) |
| Label(edible)    |       0.5000 |
+------------------+--------------+
| Label(poisonous) |       0.5000 |
+------------------+--------------+


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Predicted label: edible


In [53]:
# Learn the structure
hc = HillClimbSearch(df_puNew)
best_model_structure = hc.estimate(scoring_method=BicScore(df_puNew))

# Convert the learned structure to a BayesianModel
best_model = BayesianNetwork(best_model_structure.edges())

# Learn the parameters
best_model.fit(df_puNew, estimator=MaximumLikelihoodEstimator)

# Print the edges of the model
print("\nEdges in the model:")
for edge in best_model.edges():
    print(edge)

  0%|          | 0/1000000 [00:00<?, ?it/s]


Edges in the model:
('cap-color', 'odor')
('cap-color', 'stalk-shape')
('cap-color', 'gill-spacing')
('odor', 'Label')
('odor', 'spore-print-color')
('odor', 'stalk-root')
('odor', 'stalk-shape')
('odor', 'gill-spacing')
('odor', 'gill-size')
('odor', 'cap-surface')
('odor', 'bruises')
('stalk-shape', 'spore-print-color')
('stalk-shape', 'gill-color')
('stalk-shape', 'bruises')
('stalk-shape', 'stalk-color-above-ring')
('stalk-shape', 'stalk-color-below-ring')
('stalk-shape', 'ring-type')
('stalk-shape', 'cap-shape')
('stalk-shape', 'Label')
('stalk-shape', 'gill-size')
('gill-spacing', 'ring-type')
('gill-spacing', 'stalk-root')
('gill-spacing', 'stalk-surface-above-ring')
('bruises', 'gill-color')
('gill-color', 'habitat')
('gill-color', 'gill-size')
('Label', 'stalk-color-above-ring')
('Label', 'stalk-surface-above-ring')
('Label', 'stalk-surface-below-ring')
('spore-print-color', 'gill-color')
('spore-print-color', 'ring-type')
('spore-print-color', 'cap-shape')
('spore-print-colo

In [None]:
# # Create an inference object
# infer = VariableElimination(best_model)

# # Define the evidence
# evidence = {'restecg': 1, 'cp': 2}

# # Remove evidence variables not in the model
# evidence_ = {var: val for var, val in evidence.items() if var in best_model.nodes()}

# # Perform inference
# print('\nProbability of HeartDisease given evidence= ' + str(evidence))
# q = infer.query(variables=['heartdisease'], evidence=evidence_)
# print(q)

# # Make the prediction
# print(infer.map_query(variables=['heartdisease'], evidence=evidence_))

In [54]:
%%capture

# Create an inference object
infer = VariableElimination(best_model)

# Predict the 'Label' column
predictions = []

for index, data_point in x_test.iterrows():
    evidence = data_point.to_dict()
    evidence_ = {var: val for var, val in evidence.items() if var in best_model.nodes() and pd.isna(val)==False}
    print(evidence_)
    prediction = infer.map_query(variables=['Label'], evidence=evidence_)
    predictions.append(prediction['Label'])

print("predictions is completed.")

In [55]:
# Count the occurrences of each label
label_counts = {
    'edible': predictions.count('edible'),
    'poisonous': predictions.count('poisonous')
}

# Print the label counts
for label, count in label_counts.items():
    print(f"{label}: {count}")

edible: 347
poisonous: 303


In [58]:
# Calculate the accuracy
accuracy = accuracy_score(y_test, predictions)
accuracy

0.9907692307692307

# Output the private data

In [57]:
X_private = read_data_from_csv('assignment_8_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (1625, 22)

Shape of X_private: (1625, 22)


In [59]:
df_private = pd.DataFrame(X_private)
df_private.isna().sum().sum()   # no missing value

0

In [60]:
%%capture

# Create an inference object
infer_private  = VariableElimination(best_model)

# Predict the 'Label' column
preds = []

for index, data_point in df_private.iterrows():
    evidence = data_point.to_dict()
    evidence_ = {var: val for var, val in evidence.items() if var in best_model.nodes() and pd.isna(val)==False}
    prediction = infer_private.map_query(variables=['Label'], evidence=evidence_)
    preds.append(prediction['Label'])

print("predictions is completed.")

In [61]:
# Count the occurrences of each label
label_counts = {
    'edible': preds.count('edible'),
    'poisonous': preds.count('poisonous')
}

# Print the label counts
for label, count in label_counts.items():
    print(f"{label}: {count}")

edible: 1625
poisonous: 0


In [62]:
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_8.csv', index=True, index_label='Id')