In [1]:
import os
import numpy as np
import pandas as pd
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD

In [2]:


def read_data_from_csv(path):
    """Read a CSV dataset and split it into features and optional labels.

    Args:
        path (str): Location of the CSV file. It must exist on disk and its
            extension must be exactly '.csv'.
    Returns:
        X (np.ndarray): Values of every column except 'Label'.
        y (np.ndarray): Values of the 'Label' column. Only returned (as the
            second element of a tuple) when the file has a 'Label' column;
            otherwise X is returned on its own.
    """
    # Validate the path first, then the extension, so a missing file is
    # reported as "not found" rather than as an unsupported type.
    assert os.path.exists(path), f'File not found: {path}!'
    assert os.path.splitext(path)[-1] == '.csv', \
        f'Unsupported file type {os.path.splitext(path)[-1]}!'

    frame = pd.read_csv(path)

    # Every column other than the label is treated as a feature.
    feature_names = [name for name in frame.columns if name != 'Label']
    X = frame[feature_names].values

    if 'Label' not in frame.columns:
        # Private dataset: no labels available, hand back features only.
        return X

    # Public dataset: features plus the provided labels.
    return X, frame['Label'].values
    

# NOTE(review): leftover placeholder from the assignment template. It is a bare
# string expression, so it changes no program state; as the last expression of
# a cell Jupyter would simply echo it as the cell's output.
'\nCODE HERE!\n'

In [11]:
# Labelled split: the loader returns a (features, labels) pair here because
# this file carries a 'Label' column.
X_public, y_public = read_data_from_csv(path='assignment_8_public.csv')

# Sanity-check the dimensions of both arrays.
print('Shape of X_public:', X_public.shape)  # (n_sample, m_feature) -> (6499, 22)
print('Shape of y_public:', y_public.shape)  # (n_sample,) -> (6499,)

Shape of X_public: (6499, 22)
Shape of y_public: (6499,)


In [19]:
# Map positional column indices to feature names.
# read_data_from_csv returns bare arrays, so the column position is the only
# link back to each feature's name; the order below must match the CSV.
new_column_names = {
    0: "cap-shape",
    1: "cap-surface",
    2: "cap-color",
    3: "bruises",
    4: "odor",
    5: "gill-attachment",
    6: "gill-spacing",
    7: "gill-size",
    8: "gill-color",
    9: "stalk-shape",
    10: "stalk-root",
    11: "stalk-surface-above-ring",
    12: "stalk-surface-below-ring",
    13: "stalk-color-above-ring",
    14: "stalk-color-below-ring",
    15: "veil-type",
    16: "veil-color",
    17: "ring-number",
    18: "ring-type",
    19: "spore-print-color",
    20: "population",
    21: "habitat"
}

# rename() silently ignores mapping keys that are absent and leaves unmapped
# columns untouched, and concat(axis=1) pads mismatched lengths with NaN.
# Check the shapes up front so a mismatch fails loudly instead of producing a
# mis-named or NaN-padded frame.
assert X_public.shape[1] == len(new_column_names), (
    f'Expected {len(new_column_names)} feature columns, '
    f'got {X_public.shape[1]}'
)
assert len(X_public) == len(y_public), (
    f'Feature/label row count mismatch: {len(X_public)} vs {len(y_public)}'
)

# Use the rename() method to change the column names
df_x = pd.DataFrame(X_public).rename(columns=new_column_names)
df_y = pd.DataFrame(y_public).rename(columns={0: 'Label'})

# Join features and labels side by side. '?' is presumably the dataset's
# missing-value marker (TODO confirm); converting it to NaN lets isna() see it.
df_public = pd.concat([df_x, df_y], axis=1).replace('?', np.nan)

df_public

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,Label
0,convex,fibrous,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,woods,poisonous
1,knobbed,smooth,red,no,spicy,free,close,narrow,buff,tapering,...,pink,white,partial,white,one,evanescent,white,several,leaves,poisonous
2,convex,fibrous,white,no,none,free,crowded,broad,pink,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
3,flat,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,abundant,grasses,edible
4,convex,smooth,gray,no,none,free,crowded,broad,chocolate,tapering,...,white,white,partial,white,one,evanescent,brown,scattered,grasses,edible
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,bell,smooth,gray,no,none,free,crowded,broad,gray,enlarging,...,white,white,partial,white,two,pendant,white,numerous,grasses,edible
6495,convex,scaly,gray,no,foul,free,close,broad,chocolate,enlarging,...,pink,brown,partial,white,one,large,chocolate,solitary,grasses,poisonous
6496,bell,scaly,yellow,bruises,almond,free,close,broad,black,enlarging,...,white,white,partial,white,one,pendant,black,scattered,meadows,edible
6497,convex,scaly,red,no,spicy,free,close,narrow,buff,tapering,...,white,white,partial,white,one,evanescent,white,several,paths,poisonous


In [18]:
# Number of missing (NaN) entries in each column.
(
    df_public
    .isna()
    .sum(axis=0)
)

cap-shape                      0
cap-surface                    0
cap-color                      0
bruises                        0
odor                           0
gill-attachment                0
gill-spacing                   0
gill-size                      0
gill-color                     0
stalk-shape                    0
stalk-root                  1974
stalk-surface-above-ring       0
stalk-surface-below-ring       0
stalk-color-above-ring         0
stalk-color-below-ring         0
veil-type                      0
veil-color                     0
ring-number                    0
ring-type                      0
spore-print-color              0
population                     0
habitat                        0
Label                          0
dtype: int64

In [20]:
# Restore every pandas display option to its default before rendering tables.
pd.reset_option('^display.', silent=True)

# Row count per 'Label' value, ordered by label name.
df_public.value_counts(subset=['Label']).sort_index().to_frame()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
edible,3359
poisonous,3140


In [21]:
# Row count per 'veil-type' value, ordered by category name.
df_public.value_counts(subset=['veil-type']).sort_index().to_frame()

Unnamed: 0_level_0,count
veil-type,Unnamed: 1_level_1
partial,6499


In [22]:
# Row count per 'veil-color' value, ordered by category name.
df_public.value_counts(subset=['veil-color']).sort_index().to_frame()

Unnamed: 0_level_0,count
veil-color,Unnamed: 1_level_1
brown,84
orange,77
white,6332
yellow,6


In [None]:
# NOTE(review): this repeats the 'veil-color' tally from the preceding cell and
# was never executed (In [None]); it can likely be removed.
df_public.value_counts(subset=['veil-color']).sort_index().to_frame()

In [23]:
# Row count per 'ring-number' value, ordered by category name.
df_public.value_counts(subset=['ring-number']).sort_index().to_frame()

Unnamed: 0_level_0,count
ring-number,Unnamed: 1_level_1
none,28
one,5990
two,481


In [24]:
# Row count per 'ring-type' value, ordered by category name.
df_public.value_counts(subset=['ring-type']).sort_index().to_frame()

Unnamed: 0_level_0,count
ring-type,Unnamed: 1_level_1
evanescent,2198
flaring,34
large,1061
none,28
pendant,3178


In [None]:
# Unlabelled split: with no 'Label' column in the file, the loader returns the
# feature matrix on its own.
X_private = read_data_from_csv(path='assignment_8_private.csv')
print('Shape of X_private:', X_private.shape)  # (k_sample, m_feature) -> (1625, 22)

# Placeholder predictions: one blank label per private sample.
# Remove this and make your own predictions.
preds = [' ' for _ in range(len(X_private))]

'''
CODE HERE!
e.g.,
preds = [' '] * len(X_private)
'''

# Write the predictions out; the row index is written by default and is
# exported under the header 'Id'.
submission = pd.DataFrame(data={'Label': preds})
submission.to_csv('assignment_8.csv', index_label='Id')