# Disease Diagnosis with a Naive Bayes Classifier
(by: Usere Namen, Heilbronn University, Germany, October 2024) 

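In this notebook we load an inflammation-diagnosis dataset from a CSV file, derive a three-class `disease` label (`healthy`, `sick`, `very sick`) from the `inflammation` and `nephritis` columns, train a Naive Bayes classifier on an 80/20 train/test split, and evaluate it using its accuracy and a confusion matrix.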


### Import Naive Bayes Classifier implementation

In [None]:
# The star import stays first so the explicit aliases below always win over
# any same-named exports of naive_bayes_classifier.
from naive_bayes_classifier import *
import numpy as np
import pandas as pd

### Convert CSV File to a Pandas DataFrame

In this step, we load a CSV file into a Pandas DataFrame using the `pd.read_csv()` function. We specify the delimiter as a semicolon (`;`) since the CSV file is separated by semicolons. After loading the data, we strip any leading or trailing whitespace from the column names using the `str.strip()` method to ensure clean headers. Finally, we print the DataFrame to verify the contents.


In [None]:
# Read the semicolon-separated CSV and trim stray whitespace around the header names
df = pd.read_csv('inflammation_diagnosis.csv', sep=';').rename(columns=str.strip)

# Show the loaded table to check that it was parsed as expected
print(df)

### Plot the Distribution of the Categorical Columns

In [None]:
import matplotlib.pyplot as plt

# Columns stored with dtype 'object' are treated as categorical
# (e.g. 'inflammation' and 'nephritis')
categorical_columns = df.select_dtypes(include=['object']).columns
n_panels = len(categorical_columns)

# One panel per categorical column, side by side. squeeze=False makes
# subplots() return a 2-D array even for a single panel, so the first (only)
# row can always be indexed like a list.
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 6), squeeze=False)
axes = axes[0]

# Draw the class frequencies of each categorical column as a bar chart
for panel, column in zip(axes, categorical_columns):
    class_counts = df[column].value_counts()
    pretty_name = column.capitalize()

    panel.bar(class_counts.index, class_counts.values, color='skyblue', edgecolor='black')
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Count')

# Keep neighbouring panels from overlapping, then render the figure
plt.tight_layout()
plt.show()


### Create 'disease' Column and Drop Unnecessary Columns

We begin by mapping the values of the 'inflammation' and 'nephritis' columns from 'yes'/'no' to `True`/`False`. Then, we define a function `classify_disease()` that categorizes each row into 'very sick', 'sick', or 'healthy' based on the values of 'inflammation' and 'nephritis'. This function is applied to each row of the DataFrame to create a new 'disease' column.

Afterward, the original 'inflammation' and 'nephritis' columns are dropped from `df`, leaving 'disease' as the label column. Finally, we create `df_complete`, a features-only copy of `df` without the 'disease' column, and print it.

In [None]:
# Map 'yes'/'no' to True/False for the inflammation and nephritis columns.
# Values are normalized first (surrounding whitespace stripped, lower-cased),
# and anything that still is not 'yes'/'no' raises an error: Series.map()
# would otherwise turn it into NaN, which is truthy and would therefore be
# silently counted as a positive diagnosis by classify_disease().
yes_no_to_bool = {'yes': True, 'no': False}
for diagnosis_column in ('inflammation', 'nephritis'):
    normalized = df[diagnosis_column].astype(str).str.strip().str.lower()
    mapped = normalized.map(yes_no_to_bool)
    if mapped.isna().any():
        unexpected = sorted(normalized[mapped.isna()].unique())
        raise ValueError(
            f"Column '{diagnosis_column}' contains values other than 'yes'/'no': {unexpected}"
        )
    df[diagnosis_column] = mapped.astype(bool)

# Function to classify disease
def classify_disease(row):
    """Derive the combined disease label for a single record.

    Args:
        row: Mapping-like record (e.g. one DataFrame row) whose
            'inflammation' and 'nephritis' entries are evaluated for
            truthiness.

    Returns:
        'very sick' if both entries are truthy, 'sick' if exactly one of
        them is, and 'healthy' if neither is.
    """
    # Number of positive diagnoses (0, 1 or 2) doubles as the index of the label
    positive_count = sum(bool(row[key]) for key in ('inflammation', 'nephritis'))
    return ('healthy', 'sick', 'very sick')[positive_count]

# Label every row with classify_disease, then remove the two source columns
# the label was derived from
disease_labels = df.apply(classify_disease, axis=1)
df = df.assign(disease=disease_labels).drop(columns=['inflammation', 'nephritis'])

# df_complete is the features-only table: same rows, without the 'disease' label
df_complete = df.drop(columns='disease')

# Show the features-only DataFrame
print(df_complete)


### Plot the updated Data

In [None]:
import matplotlib.pyplot as plt

# Columns stored with dtype 'object' are treated as categorical; at this
# point that includes the derived 'disease' label
categorical_columns = df.select_dtypes(include=['object']).columns
n_panels = len(categorical_columns)

# One panel per categorical column, side by side. squeeze=False makes
# subplots() return a 2-D array even for a single panel, so the first (only)
# row can always be indexed like a list.
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 6), squeeze=False)
axes = axes[0]

# Draw the class frequencies of each categorical column as a bar chart
for panel, column in zip(axes, categorical_columns):
    class_counts = df[column].value_counts()
    pretty_name = column.capitalize()

    panel.bar(class_counts.index, class_counts.values, color='skyblue', edgecolor='black')
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Count')

# Keep neighbouring panels from overlapping, then render the figure
plt.tight_layout()
plt.show()


### Create 'continuous' Array

In this step, we generate a `continuous` array that indicates whether each column in the DataFrame contains continuous or discrete values. We iterate over every column in the DataFrame, excluding the label column, and check the first value of each column. If the value is 'yes' or 'no', we treat it as a discrete column, and append `False` to the `continuous` array. Otherwise, we assume the column contains continuous data and append `True`.


In [None]:
# One flag per feature column (the label is the last column and is skipped):
# a column whose first entry is the string 'yes' or 'no' is flagged as
# discrete (False); every other column is flagged as continuous (True)
continuous = [
    df[feature].iloc[0] not in ('yes', 'no')
    for feature in df.columns[:-1]
]

# Show the resulting flags
print(continuous)

### Randomize DataFrame and Split into Train and Test Sets

In this step, we shuffle the DataFrame using the `sample()` method with a fraction of `1` to randomize the entire dataset. We set the `random_state` to ensure reproducibility of the shuffle. After shuffling, the DataFrame is split into an 80% training set and a 20% test set. The training set is extracted from the first 80% of the shuffled DataFrame, and the test set is from the remaining 20%. Both sets are then printed to verify the split.


In [None]:
# Randomly reorder all rows (fixed seed for a reproducible order) and
# renumber the index from 0
shuffle_df = df.sample(frac=1, random_state=42)
shuffle_df = shuffle_df.reset_index(drop=True)

# The first 80% of the shuffled rows form the training set, the rest the test set
train_size = int(len(shuffle_df) * 0.8)
train_df, test_df = shuffle_df.iloc[:train_size], shuffle_df.iloc[train_size:]

# Show both subsets to verify the split
for subset in (train_df, test_df):
    print(subset)

### Instantiate the NaiveBayes Class

In this step, we instantiate the `NaiveBayes` class, which will be used to perform classification. The constructor takes in the `continuous` array, which specifies which features in the dataset are continuous or discrete. This information is essential for the Naive Bayes classifier to handle the different types of data appropriately.


In [None]:
# Instantiate the NaiveBayes class, passing the continuous array: one boolean
# per feature column, True for continuous and False for discrete (yes/no)
# features.
# NOTE(review): how the constructor uses these flags is defined in
# naive_bayes_classifier and is not visible here.
naive_bayes = NaiveBayes(continuous)

### Fit the NaiveBayes Classifier to the Training Data

We now fit the `NaiveBayes` classifier to the training dataset. The `fit()` method takes the training DataFrame (`train_df`) and the target column name (`"disease"`), which contains the labels we are trying to predict. This step trains the model by learning the underlying patterns and relationships between the features and the target variable.


In [None]:
# Define the target column name: 'disease' is the label column derived from
# the 'inflammation' and 'nephritis' columns
target_name = "disease"

# Fit the NaiveBayes classifier on the training data; train_df still contains
# the label column, which fit() is told about via target_name
naive_bayes.fit(train_df, target_name)

In [None]:
# Inspect the `priors` attribute of the fitted classifier
# (presumably the per-class prior probabilities — confirm in naive_bayes_classifier)
print(naive_bayes.priors)

In [None]:
# Inspect the `likelihoods` attribute of the fitted classifier
# (presumably the conditional probabilities of the discrete features — confirm in naive_bayes_classifier)
print(naive_bayes.likelihoods)

In [None]:
# Inspect the `gaussian_parameters` attribute of the fitted classifier
# (presumably the per-class Gaussian parameters of the continuous features — confirm in naive_bayes_classifier)
print(naive_bayes.gaussian_parameters)

### Predict Probabilities with NaiveBayes Classifier

After fitting the model, we can use the `predict_probability()` method to estimate the probability distribution of the target variable for each instance in the dataset. Here, we pass the `df_complete` DataFrame (which contains the features but not the target column) to the method to generate these probability predictions.


In [None]:
# Predict the probabilities for each instance in df_complete.
# df_complete was built from the full dataset before the train/test split, so
# it also contains the rows the classifier was trained on. As the last
# expression of the cell, the returned result is displayed as the cell output.
naive_bayes.predict_probability(df_complete)

### Separate the Last Column and Evaluate the NaiveBayes Classifier

In this step, we split the test set into two parts: `test_data`, which contains the feature columns, and `test_labels`, which contains the true labels taken from the last column. We then evaluate the performance of the NaiveBayes classifier using the `evaluate_on_data()` method. This method compares the model’s predictions to the actual labels in the test set and returns the accuracy together with a confusion matrix, which we print and plot.

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Split the test set: every column but the last holds the features, the last
# column holds the true labels
test_data, test_labels = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# Score the classifier on the held-out rows
accuracy, confusion_matrix = naive_bayes.evaluate_on_data(test_data, test_labels)

# Report the accuracy followed by the raw confusion matrix
print(f"Accuracy of the NaiveBayes classifier: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix)

# ConfusionMatrixDisplay is handed a numpy array, so convert the matrix first
confusion_matrix = np.array(confusion_matrix)

# Render the matrix as a heat map and put the accuracy into the title
fig, ax = plt.subplots()
matrix_display = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix)
matrix_display.plot(ax=ax, cmap="Blues")
ax.set_title(f"Confusion Matrix\nAccuracy: {accuracy * 100:.2f}%")
plt.show()


### Visualize Data with Matplotlib

To gain more insight into the data, we will visualize it using `matplotlib`. We will create a bar plot that shows the distribution of the predicted disease categories for the test data. This visualization will help us understand how the classifier's predictions are distributed across the different disease classes.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Run the classifier on the test features and keep the 'Prediction' entry of
# the result, i.e. the predicted labels
results = naive_bayes.predict_probability(test_data)
predicted_labels = results['Prediction']

# Tally how often each distinct label was predicted
unique_labels, counts = np.unique(predicted_labels, return_counts=True)

# Bar chart of the predicted-label frequencies
label_fig, label_ax = plt.subplots(figsize=(8, 6))
label_ax.bar(unique_labels, counts, color='skyblue', edgecolor='black')

# Title and axis labels
label_ax.set_title('Distribution of Predicted Disease Labels', fontsize=16)
label_ax.set_xlabel('Disease Category', fontsize=14)
label_ax.set_ylabel('Frequency', fontsize=14)

# Render the figure
plt.show()
