In [3]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd

In [4]:

from types import SimpleNamespace

ucirepo_ids = {
    "iris": 53,
    "mushroom": 73,
    "car_eval": 19, 
}
TEST_SIZE = 0.2

def custom_data():
    
    data = {
            'age': ['youth', 'youth', 'middle aged', 'senior', 'senior', 'senior', 'middle aged',
                    'youth', 'youth', 'senior', 'youth', 'middle aged', 'middle aged', 'senior'],
            'income': ['high', 'high', 'high', 'medium', 'low', 'low', 'low',
                    'medium', 'low', 'medium', 'medium', 'medium', 'high', 'medium'],
            'student': ['no', 'no', 'no', 'no', 'yes', 'yes', 'yes',
                        'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no'],
            'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent',
                            'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'excellent'],
            'buys_computer': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes',
                            'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']
        }
        
    df = pd.DataFrame(data)
    
    # Split into features and target
    features = df.drop(columns='buys_computer')
    targets = df['buys_computer']
    targets= pd.DataFrame(targets.values.reshape(-1, 1), columns=['buys_computer'])

    
    # Variable info
    variable_info = {
        col: {
            'type': 'categorical',
            'unique_values': df[col].unique().tolist()
        } for col in df.columns
    }

    # Metadata
    metadata = {
        'source': 'Simulated AllElectronics dataset',
        'description': 'Customer attributes and their decision to buy a computer',
        'num_samples': len(df),
        'num_features': features.shape[1],
        'target_column': 'buys_computer',
        'class_labels': sorted(df['buys_computer'].unique().tolist())
    }

    # Build nested structure
    return SimpleNamespace(
        data=SimpleNamespace(
            features=features,
            targets=targets,
            feature_names=features.columns.tolist(),
            target_names=sorted(targets.iloc[:,0].unique()),
            # frame=df
        ),
        metadata=metadata,
        variables=variable_info
    )
    
    

def fetch_dataframe(dataframe_name):
    
    if dataframe_name == "custom_data":
        df = custom_data()
        
        # metadata 
        print(df.metadata) 
        
        # variable information 
        print(df.variables) 
    
        return df
    
    if dataframe_name in ucirepo_ids:
        # fetch dataset 
        df = fetch_ucirepo(id=ucirepo_ids[dataframe_name],) 

        # # data (as pandas dataframes) 
        X = df.data.features 
        y = df.data.targets 
        
        # metadata 
        print(df.metadata) 
        
        # variable information 
        print(df.variables) 
        
        return df
    else:
        raise ValueError(f"Dataset '{dataframe_name}' not found in UCI repository.")
    


In [5]:
def __train_test_split(X, y, test_size = 0.2, shuffle_and_stratify = True):
    
    if test_size < 0 or test_size > 1:
        raise ValueError("test_size must be between 0 and 1")
   
    if len(X) != len(y):
        raise ValueError("Features and targets must have the same length.")

    
    if shuffle_and_stratify == False:
    
        train_size = 1 - test_size
        train_index = int(len(X) * train_size)
        
        X_train = X[0: train_index]
        X_test = X[train_index:]
        
        y_train = y[0: train_index]
        y_test = y[train_index:]
        
        return X_train, X_test, y_train, y_test
    else:
        labels = y.iloc[:,0].unique()
        X_train = pd.DataFrame(columns=X.columns)
        y_train = pd.DataFrame(columns=y.columns)
        X_test = pd.DataFrame(columns=X.columns)
        y_test = pd.DataFrame(columns=y.columns)
        
        train_size = 1 - test_size
        

        for label in labels :
            y_rows = y[y.iloc[:,0] == label]            
            X_rows = X.loc[y_rows.index]
            
            train_index = int(len(X_rows) * train_size)
            
            X_train = pd.concat([X_train, X_rows.iloc[:train_index]], ignore_index=False)
            y_train = pd.concat([y_train, y_rows.iloc[:train_index]] , ignore_index=False)
            
            X_test = pd.concat([X_test, X_rows[train_index:]], ignore_index=False)
            y_test = pd.concat([y_test, y_rows[train_index:]], ignore_index=False)

        return X_train, X_test, y_train, y_test
    

In [8]:
df = fetch_dataframe("car_eval")

print(df.values)

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [9]:

from sklearn.model_selection import train_test_split

X = df.data.features
y = df.data.targets

X_train, X_test, y_train, y_test = __train_test_split(X, y , test_size=TEST_SIZE, shuffle_and_stratify=True)



# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)


print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (1382, 6)
X_test shape: (346, 6)
y_train shape: (1382, 1)
y_test shape: (346, 1)


In [10]:
X_train

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,low
1,vhigh,vhigh,2,2,small,med
2,vhigh,vhigh,2,2,small,high
3,vhigh,vhigh,2,2,med,low
4,vhigh,vhigh,2,2,med,med
...,...,...,...,...,...,...
1645,low,low,2,more,big,med
1658,low,low,3,4,small,high
1661,low,low,3,4,med,high
1663,low,low,3,4,big,med


In [11]:
y_train

Unnamed: 0,class
0,unacc
1,unacc
2,unacc
3,unacc
4,unacc
...,...
1645,good
1658,good
1661,good
1663,good


In [10]:
# labels = y_train.iloc[:,0].unique()

# for label in labels:
    
#     label_rows = y_train[y_train.iloc[:,0] == label]
    
#     row_indices = label_rows.index
    
#     print(row_indices)
    
#     corresponding_rows = X_train.loc[label_rows.index]
    
#     for ind in row_indices:
        
        
        
#         if ind not in X_train.index:
#             print(f"{ind} not in X_train for label {label}")
    

In [11]:

from math import exp, sqrt, pi
    
def gaussian(x_k, mean, std):
    
    if std == 0:
        return 1 if x_k == mean else 0
    else:
        exponent = exp(-((x_k - mean)**2 ) / (2*std**2))
        gaussian_val = (1 / (sqrt(2*pi*std**2))) * exponent
        return gaussian_val

def naive_bayes_classifier_preds(X_train,y_train, X_test):
    y_preds = []
    attributes = X_train.columns
    labels = y_train.iloc[:,0].unique()
    
    print("Attributes:", attributes)
    print("Class Labels:", labels)
    
    # print("y_train:\n", y_train)
    
    label_probs = {}
    cond_probs = {}
    mean_store = {}
    std_store = {}
    
    
    for label in labels:
        label_probs[label] = (y_train.iloc[:,0] == label).sum() / len(y_train) # Calculate prior probabilities for each label
        
        label_rows = y_train[y_train.iloc[:,0] == label]
        
        # print(f"Label: {label}, Probability: {label_probs[label]} count: {(y_train.iloc[:,0] == label).sum() }")
        # print("Label Rows: \n", label_rows)
        
        for attr in attributes:
            if type(X_train[attr].iloc[0]) == str: #categotrical attribute
                corresponding_rows = X_train.loc[label_rows.index]
                attr_value_counts = corresponding_rows[attr].value_counts().to_dict()
                
                for attr_value, count in attr_value_counts.items():
                    cond_probs[(attr, attr_value, label)] = count / len(corresponding_rows)
            elif type(X_train[attr].iloc[0]) == float or type(X_train[attr].iloc[0]) == int: # numerical attribute
                corresponding_rows = X_train.loc[label_rows.index]
                mean = corresponding_rows[attr].mean()
                std = corresponding_rows[attr].std()
                
                mean_store[(attr, label)] = mean
                std_store[(attr, label)] = std
                
                
    # print("Label Probabilities:", label_probs)
    # print("Conditional Probabilities:", cond_probs)
    
    # for key, value in cond_probs.items():
    #     print(f"Conditional Probability {key}: {value}")
    
    # Create one test row
    # test_data = {
    #     "age":   ["youth"],
    #     "income":    ["medium"],
    #     "student":    ["yes"],
    #     "credit_rating":  ["fair"],
    # }

    # test_df = pd.DataFrame(test_data)

    # print(test_df)
    
    
    
    for index, test_row in X_test.iterrows():
        
        prob_max_finder = {}
        
        for label in labels:
            prob = label_probs[label]
            
            for attr in attributes:
                
                attr_value = test_row[attr]
                
                if type(attr_value) == str:
            
                    prob *= cond_probs.get((attr, attr_value , label), 1e-6) 
                
                elif type(attr_value) == float or type(attr_value) == int:
                    
                    mean = mean_store[(attr, label)]
                    std = std_store[(attr, label)] + 1e-7
                    
                    prob *= gaussian(x_k=attr_value, mean=mean, std=std) 
            
            prob_max_finder[label] = prob
            
            
        max_label = max(prob_max_finder, key=prob_max_finder.get)
        
        y_preds.append(max_label)
    
            
    
    # print("Predictions:", y_preds)
    print("Predictions Length:", len(y_preds))
    
    return y_preds
    

y_preds = naive_bayes_classifier_preds(X_train, y_train, X_test)

Attributes: Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], dtype='object')
Class Labels: ['unacc' 'acc' 'vgood' 'good']
Predictions Length: 346


In [12]:
def __classification_report(y_true, y_pred):
    
    if len(y_true.iloc[:,0]) != len(y_pred):
        raise NotImplementedError("prediction does not have same number of tuples as the true value set")
    
    labels = y_true.iloc[:, 0].unique()
    
    results = {}
    
    for label in labels:
        P = N = TP = FP = TN = FN = 0
        
        for i in range(len(y_pred)):
            true_label = y_true.iloc[i,0]
            pred_label = y_pred[i]
            
            if true_label == label:
                P += 1
            else:
                N += 1
                
            if true_label==label and pred_label==label:
                TP += 1
            elif true_label==label and pred_label!=label:
                FN += 1
            elif true_label!=label and pred_label==label:
                FP += 1
            elif true_label!=label and pred_label!=label:
                TN += 1
                
            
            accuracy = (TP+TN) / (P+N)
            precision = TP / (TP + FP)
            recall = TP / (TP + FN)
            f1_score = (2 * precision * recall) / (precision + recall)
            sensitivity = TP/P
            specificity = TN/N            
            support = P + N
            
            ################# what if P,N either or both 0 ????
            
            print(f"Label: {label}")
            print(f"  Accuracy   : {accuracy:.2f}")
            print(f"  Precision  : {precision:.2f}")
            print(f"  Recall     : {recall:.2f}")
            print(f"  F1 Score   : {f1_score:.2f}")
            print(f"  Sensitivity: {sensitivity:.2f}")
            print(f"  Specificity: {specificity:.2f}")
            print(f"  Support    : {support}")
            print("-" * 30)
            
            
            
        
# __classification_report(y_true=y_test, y_pred = y_preds)

In [13]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_preds)

print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

         acc       0.96      0.69      0.80        77
        good       0.32      1.00      0.48        14
       unacc       1.00      0.98      0.99       242
       vgood       0.70      0.54      0.61        13

    accuracy                           0.90       346
   macro avg       0.74      0.80      0.72       346
weighted avg       0.95      0.90      0.91       346



In [14]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder


X_sk = df.data.features
y_sk = df.data.targets

encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X_sk)

print(f"X_encoded {X_encoded}")

X_train_sk, X_test_sk, y_train_sk, y_test_sk = train_test_split(X_encoded, y_sk , test_size=0.2, random_state=42)

print("X_train", X_train_sk)

model = CategoricalNB()  # Use CategoricalNB for categorical data
# model = GaussianNB()  # Use GaussianNB for numerical data

model.fit(X_train_sk, y_train_sk)

y_preds_sklearn = model.predict(X_test_sk)

report_sklearn = classification_report(y_test_sk, y_preds_sklearn)

print("Classification Report (sklearn):")
print(report_sklearn)

X_encoded [[3. 3. 0. 0. 2. 1.]
 [3. 3. 0. 0. 2. 2.]
 [3. 3. 0. 0. 2. 0.]
 ...
 [1. 1. 3. 2. 0. 1.]
 [1. 1. 3. 2. 0. 2.]
 [1. 1. 3. 2. 0. 0.]]
X_train [[3. 3. 3. 2. 0. 0.]
 [2. 3. 1. 1. 2. 2.]
 [1. 1. 3. 0. 0. 0.]
 ...
 [0. 1. 3. 2. 1. 0.]
 [1. 0. 2. 0. 2. 2.]
 [2. 2. 1. 2. 2. 2.]]
Classification Report (sklearn):
              precision    recall  f1-score   support

         acc       0.63      0.54      0.58        83
        good       0.57      0.36      0.44        11
       unacc       0.87      0.97      0.91       235
       vgood       1.00      0.35      0.52        17

    accuracy                           0.82       346
   macro avg       0.77      0.56      0.62       346
weighted avg       0.81      0.82      0.80       346



  y = column_or_1d(y, warn=True)


In [15]:
ys = [[y_test.iloc[i,0], y_preds[i], y_preds_sklearn[i]] for i in range(len(y_test))]

print("Comparison of Predictions:")
for i, (actual, naive_bayes_pred, sklearn_pred) in enumerate(ys):
    print(f"Sample {i+1}: Actual: {actual}, Naive Bayes: {naive_bayes_pred}, sklearn: {sklearn_pred}")


Comparison of Predictions:
Sample 1: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 2: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 3: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 4: Actual: unacc, Naive Bayes: unacc, sklearn: acc
Sample 5: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 6: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 7: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 8: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 9: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 10: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 11: Actual: unacc, Naive Bayes: unacc, sklearn: acc
Sample 12: Actual: unacc, Naive Bayes: unacc, sklearn: acc
Sample 13: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 14: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 15: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 16: Actual: unacc, Naive Bayes: unacc, sklearn: unacc
Sample 17: A

In [16]:
train_test_split?

[31mSignature:[39m
train_test_split(
    *arrays,
    test_size=[38;5;28;01mNone[39;00m,
    train_size=[38;5;28;01mNone[39;00m,
    random_state=[38;5;28;01mNone[39;00m,
    shuffle=[38;5;28;01mTrue[39;00m,
    stratify=[38;5;28;01mNone[39;00m,
)
[31mDocstring:[39m
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None