# Project 5: Naive Bayes Classifier

- [Report](https://docs.google.com/document/d/1U_K9n-OSwi6Ld92fMkoIm5cEyvnQTJYjKHeozIQIst8/edit?usp=sharing)
- [Slides](https://docs.google.com/presentation/d/11TlmsBK_qryIzCE8YAoY3G1Eu9zstGp0IyQpT3e0LdY/edit?usp=sharing)
- [Dataset](https://archive.ics.uci.edu/dataset/73/mushroom)

## Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

## Analysis

In [None]:
# Column names applied to the data file, in file order. The first entry
# ('class') is the prediction target; the remaining entries are the predictors.
features = [
    'class',
    'cap_shape', 'cap_surface', 'cap_color',
    'bruises', 'odor',
    'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
    'stalk_shape', 'stalk_root',
    'stalk_surface_ar', 'stalk_surface_br',
    'stalk_color_ar', 'stalk_color_br',
    'veil_type', 'veil_color',
    'ring_number', 'ring_type',
    'spore_print_color', 'population', 'habitat',
]

# NOTE(review): if the file marks missing values with a placeholder such as
# '?', they are read here as an ordinary category rather than NaN -- confirm
# that this is intended.
df = pd.read_csv('data/agaricus-lepiota.data', names=features)
df

### Encoding the data for use in Naive Bayes

In [None]:
# Replace the values of every column with integer codes, keeping one fitted
# encoder per column so the codes can be mapped back to the original values.
labelEncoders = {col: LabelEncoder() for col in df.columns}
for col, encoder in labelEncoders.items():
    df[col] = encoder.fit_transform(df[col])
df

In [None]:
# Reference table: for each column, integer code -> original value.
encodings = {
    col: dict(enumerate(encoder.classes_))
    for col, encoder in labelEncoders.items()
}
encodings

### Practicing Naive Bayes

In [None]:
# Baseline: predict 'class' from every other column.
X = df.drop(columns='class')
y = df['class']

# The split is unseeded, so the numbers below change from run to run
# (pass random_state=42 to make them reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# NOTE(review): the predictors are label codes, which MultinomialNB treats as
# counts; sklearn's CategoricalNB may be a better fit -- confirm.
classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision, recall, F-score and support for labels 0 and 1.
metrics = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
p, r, f, s = metrics
print(*metrics)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the predictions above; transposed so that the true label
# runs along the x axis and the predicted label along the y axis.
mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(mat.transpose(), annot=True, fmt='d', square=True, cbar=False)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')

### A Methodical Approach to Finding the Best Attributes for Prediction

In [None]:
def testDataset(dataframe, testCol, attributes=1, verbose=True):
    df = dataframe.copy()

    # Test a random set of attributes
    if type(attributes) == int:
        # If more attributes are specified than there are, just use all of them
        if attributes > len(df.columns)-1:
            attributes = len(df.columns)-1
        attributes = df[df.columns[df.columns != testCol]].sample(axis=1, n=attributes).columns
        attributes = [attribute for attribute in attributes]
        # display([attribute for attribute in attributes])

    # Test a specific set of attributes
    elif type(attributes) != list:
        print('attributes must be an integer or a list of attribute names')
        return


    # Clear out any records that don't have a valid value for one of the attributes in question
    for attribute in attributes:
        if attribute == testCol:
            print(f'Cannot predict {testCol} using {testCol}')
            return
        
        try:
            df = df.dropna(subset=attribute)
        except Exception as e:
            print(e)
            return
    
    print(f'Predicting based on Naive Bayes Classifiers using {attributes}:') if verbose else None
    # display(df)

    f1Scores = []
    for j in range(10):
        X = df[attributes]
        y = df['class']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)#, random_state=42)

        classifier = MultinomialNB()

        classifier.fit(X_train, y_train)

        y_pred = classifier.predict(X_test)

        p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
        f1Scores.append(f)
        print(f'Test {j}: precision={p}, recall={r}, f-score={f}, support={s}') if verbose else None
        print(classification_report(y_test, y_pred)) if verbose else None
        

    meanF1s = []
    for i in range(len(f1Scores[0])):
        f1Total = 0
        for score in f1Scores:
            f1Total += score[i]
        meanF1s.append(f1Total / len(f1Scores))

    print(f'Mean F1 scores: f-score={meanF1s}') if verbose else None

    f1sTotal = 0
    for f1 in meanF1s:
        f1sTotal += f1
    meanOfF1s = f1sTotal / len(meanF1s)
    meanF1s.append(meanOfF1s)

    print(f'Mean of mean F1 scores: f-score={meanF1s[-1]}') if verbose else None

    return meanF1s, attributes

#### Testing Average and Highest F-scores for Each Number of Attributes

In [None]:
# For every possible number of predictors, run 50 random attribute draws and
# record the average and the best overall F-score (index 2 of the scores
# returned by testDataset) together with the attributes that achieved the best.
fScores = {
    key: []
    for key in ('highestFScores', 'avgFScores', 'winningAttributes', 'attributes')
}
predictorCount = len(df.drop('class', axis=1).columns)
for i in range(1, predictorCount + 1):
    highestF = 0
    bestAttributes = []
    fTotal = 0
    for j in range(50):
        scores, attributes = testDataset(dataframe=df, testCol='class', attributes=i, verbose=False)
        overallF = scores[2]
        fTotal += overallF
        if overallF > highestF:
            highestF, bestAttributes = overallF, attributes
    avgFScore = fTotal / 50

    fScores['attributes'].append(i)
    fScores['avgFScores'].append(avgFScore)
    fScores['highestFScores'].append(highestF)
    fScores['winningAttributes'].append(bestAttributes)

display(fScores)

In [None]:
# Tabulate the sweep results, ordered by the number of attributes used,
# and plot the average F-score against that number.
scoresDF = pd.DataFrame(fScores).sort_values(by='attributes')
display(scoresDF)

ax = sns.lineplot(data=scoresDF, x='attributes', y='avgFScores')
ax.set_xlabel('Number of Attributes')
ax.set_ylabel('Average F-score')
ax.set_title('Average F-score of Predictions vs Number of Attributes Used')

#### Testing Highest F-scores for Each Number of Attributes

In [None]:
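# NOTE: superseded by the combined loop in the previous section, which already
# records the highest F-score per attribute count; kept for reference only.
# highestFScores = {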
#     'fScores': [],
#     'winningAttributes': [],
#     'attributes': []
# }
# for i in range(1, len(df.drop('class', axis=1).columns)+1):
#     highestFScores['attributes'].append(i)
#     highestF = 0
#     bestAttributes = []
#     for j in range(50):
#         scores, attributes = testDataset(dataframe=df, testCol='class', attributes=i, verbose=False)
#         if scores[2] > highestF:
#             highestF = scores[2]
#             bestAttributes = attributes
#     highestFScores['fScores'].append(highestF)
#     highestFScores['winningAttributes'].append(bestAttributes)

# display(highestFScores)

In [None]:
# Best F-score observed for each number of attributes.
ax = sns.lineplot(data=scoresDF, x='attributes', y='highestFScores')
ax.set_xlabel('Number of Attributes')
ax.set_ylabel('Highest F-score')
ax.set_title('Highest F-score of Predictions vs Number of Attributes Used')

#### Plotting Both

In [None]:
# Highest and average F-score on twin y axes that share the same range,
# so the two curves are directly comparable.
x = scoresDF['attributes']
y1 = scoresDF['highestFScores']
y2 = scoresDF['avgFScores']

# Both axes start at the smallest score seen in either series and end at 1.
lowestScore = min(min(y1), min(y2))

fig, ax1 = plt.subplots()

ax1.plot(x, y1, color='tab:blue')
ax1.set_ylabel('Highest F-score', color='tab:blue')
ax1.set_ylim(lowestScore, 1)

ax2 = ax1.twinx()
ax2.plot(x, y2, color='tab:red')
ax2.set_ylabel('Average F-score', color='tab:red')
ax2.set_ylim(lowestScore, 1)

ax1.set_xlabel('Number of Attributes')
ax1.set_title('Highest F-score and Average F-score of Predictions vs Number of Attributes Used')

plt.show()

### Narrowing Down the Best Combination

In [None]:
# Candidate combination with six attributes.
testDataset(
    dataframe=df,
    testCol='class',
    attributes=[
        'stalk_surface_ar',
        'gill_spacing',
        'bruises',
        'cap_color',
        'gill_attachment',
        'gill_size',
    ],
    verbose=False,
)

In [None]:
# Candidate combination with seven attributes.
testDataset(
    dataframe=df,
    testCol='class',
    attributes=[
        'cap_color',
        'bruises',
        'stalk_surface_ar',
        'gill_attachment',
        'gill_size',
        'gill_spacing',
        'stalk_surface_br',
    ],
    verbose=False,
)

In [None]:
# Candidate combination with eight attributes.
testDataset(
    dataframe=df,
    testCol='class',
    attributes=[
        'gill_size',
        'stalk_shape',
        'gill_attachment',
        'veil_color',
        'ring_number',
        'gill_spacing',
        'cap_color',
        'bruises',
    ],
    verbose=False,
)

In [None]:
# Candidate combination with nine attributes.
testDataset(
    dataframe=df,
    testCol='class',
    attributes=[
        'gill_size',
        'veil_type',
        'gill_spacing',
        'gill_attachment',
        'bruises',
        'stalk_color_br',
        'stalk_color_ar',
        'veil_color',
        'cap_color',
    ],
    verbose=False,
)

In [None]:
# Candidate combination with eleven attributes.
testDataset(
    dataframe=df,
    testCol='class',
    attributes=[
        'gill_attachment',
        'cap_shape',
        'stalk_surface_ar',
        'stalk_color_br',
        'odor',
        'bruises',
        'stalk_shape',
        'stalk_color_ar',
        'gill_size',
        'ring_number',
        'gill_spacing',
    ],
    verbose=False,
)

In [None]:
# The candidate combinations tried individually above, compared head to head:
# each one is scored 50 times and the combination with the highest average
# overall F-score wins.
topContenders = [
    [
        'stalk_surface_ar',
        'gill_spacing',
        'bruises',
        'cap_color',
        'gill_attachment',
        'gill_size',
    ],
    [
        'cap_color',
        'bruises',
        'stalk_surface_ar',
        'gill_attachment',
        'gill_size',
        'gill_spacing',
        'stalk_surface_br',
    ],
    [
        'gill_size',
        'stalk_shape',
        'gill_attachment',
        'veil_color',
        'ring_number',
        'gill_spacing',
        'cap_color',
        'bruises',
    ],
    [
        'gill_size',
        'veil_type',
        'gill_spacing',
        'gill_attachment',
        'bruises',
        'stalk_color_br',
        'stalk_color_ar',
        'veil_color',
        'cap_color',
    ],
    [
        'gill_attachment',
        'cap_shape',
        'stalk_surface_ar',
        'stalk_color_br',
        'odor',
        'bruises',
        'stalk_shape',
        'stalk_color_ar',
        'gill_size',
        'ring_number',
        'gill_spacing',
    ],
]

top = 0
topAttributes = []
for combo in topContenders:
    # Index 2 of the returned scores is the overall (mean-of-means) F-score.
    avgFScore = sum(
        testDataset(df, 'class', combo, verbose=False)[0][2]
        for _ in range(50)
    ) / 50
    if avgFScore > top:
        # testDataset hands back the very list it was given, so the winning
        # attributes are simply the current combination.
        top, topAttributes = avgFScore, combo

print(f'The best attributes to use for predicting the edibility of mushrooms are {topAttributes}, which yield an average F-score of about {top}')

### Performance of the Best Combination

In [None]:
# Train and evaluate once more, this time using only the winning attributes.
X = df[topAttributes]
y = df['class']

# Unseeded split: results vary between runs (random_state=42 would pin them).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

classifier = MultinomialNB().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Per-label precision, recall, F-score and support for labels 0 and 1.
metrics = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])
p, r, f, s = metrics
print(*metrics)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for the winning attributes; transposed so that the true
# label runs along the x axis and the predicted label along the y axis.
mat = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(mat.transpose(), annot=True, fmt='d', square=True, cbar=False)
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')

### Finding Attributes That Are Good Predictors by Themselves

In [None]:
# Count how often each attribute takes part in a randomly drawn combination
# whose overall F-score (index 2 of the returned scores) exceeds 0.85.
commonFeatures = dict.fromkeys(features, 0)

predictorCount = len(df.drop('class', axis=1).columns)
for i in range(1, predictorCount + 1):
    for j in range(50):
        scores, attributes = testDataset(dataframe=df, testCol='class', attributes=i, verbose=False)
        if not scores[2] > 0.85:
            continue
        for a in attributes:
            commonFeatures[a] += 1

display(commonFeatures)

In [None]:
# Turn the tally into a table (one row per attribute), drop the target column
# and order the rows by ascending count for plotting.
data = {
    'attribute': list(commonFeatures.keys()),
    'count': list(commonFeatures.values()),
}

attributeCountsDF = pd.DataFrame(data)
isPredictor = attributeCountsDF.attribute != 'class'
attributeCountsDF = attributeCountsDF[isPredictor].sort_values(by='count')
display(attributeCountsDF)

ax = sns.barplot(data=attributeCountsDF, x='attribute', y='count')
plt.xticks(rotation=90)  # attribute names are long; rotate them to stay legible
ax.set_xlabel('Attribute')
ax.set_ylabel('Count')
ax.set_title('Count per Attribute of Appearances in Attribute Combinations Yielding an F-score > 0.85')
fig = plt.gcf()
fig.set_size_inches(10, 5)

#### A Look at a Few of Them

In [None]:
# Number of mushrooms per (bruises, class) pair. 'habitat' serves only as the
# column whose rows are counted.
bruisesDF = (
    df.groupby(['bruises', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('bruises', 'class'):
    bruisesDF[column] = bruisesDF[column].map(encodings[column])

ax = sns.barplot(data=bruisesDF, x='bruises', y='habitat', hue='class')
ax.set_xlabel('Has Bruises')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Presence of Bruises')

In [None]:
# Same (bruises, class) counts as before, now grouped by class on the x axis.
# 'habitat' serves only as the column whose rows are counted.
bruisesDF = (
    df.groupby(['bruises', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('bruises', 'class'):
    bruisesDF[column] = bruisesDF[column].map(encodings[column])

ax = sns.barplot(data=bruisesDF, x='class', y='habitat', hue='bruises')
ax.set_xlabel('Edibility')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Edibility')

In [None]:
# Number of mushrooms per (gill_spacing, class) pair. 'habitat' serves only as
# the column whose rows are counted.
gillSpacingDF = (
    df.groupby(['gill_spacing', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('gill_spacing', 'class'):
    gillSpacingDF[column] = gillSpacingDF[column].map(encodings[column])

ax = sns.barplot(data=gillSpacingDF, x='gill_spacing', y='habitat', hue='class')
ax.set_xlabel('Gill Spacing')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Gill Spacing')

In [None]:
# Same (gill_spacing, class) counts as before, now grouped by class on the
# x axis. 'habitat' serves only as the column whose rows are counted.
gillSpacingDF = (
    df.groupby(['gill_spacing', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('gill_spacing', 'class'):
    gillSpacingDF[column] = gillSpacingDF[column].map(encodings[column])

ax = sns.barplot(data=gillSpacingDF, x='class', y='habitat', hue='gill_spacing')
ax.set_xlabel('Edibility')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Edibility')

In [None]:
# Number of mushrooms per (population, class) pair. 'habitat' serves only as
# the column whose rows are counted.
populationDF = (
    df.groupby(['population', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('population', 'class'):
    populationDF[column] = populationDF[column].map(encodings[column])

ax = sns.barplot(data=populationDF, x='population', y='habitat', hue='class')
ax.set_xlabel('Population')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Population')

In [None]:
# Same (population, class) counts as before, now grouped by class on the
# x axis. 'habitat' serves only as the column whose rows are counted.
populationDF = (
    df.groupby(['population', 'class'])['habitat']
    .count()
    .reset_index()
)
# Translate the integer codes back to the original values for the labels.
for column in ('population', 'class'):
    populationDF[column] = populationDF[column].map(encodings[column])

ax = sns.barplot(data=populationDF, x='class', y='habitat', hue='population')
ax.set_xlabel('Edibility')
ax.set_ylabel('Count')
ax.set_title('Count of Mushrooms vs Edibility')