**Removing extra columns**

In [1]:
#Loading the data into a dataframe & clean

import csv
import pandas as pd
import matplotlib.pyplot as plt

#Importing a limited section of the dataset for now for easier processing
df = pd.read_csv("poisonous_mushrooms.csv", nrows=10000)
# Source: https://www.kaggle.com/datasets/davinascimento/poisonous-mushrooms?resource=download
# this assumes that you have the csv downloaded and stored in the same directory as this file

def remapPoisonousClass(dataframe):
	# Map poisionous or not to an integer
	mapping = {'p': 1, 'e': 0}
	df_t = dataframe["class"].map(mapping)

	# Rename target feature to "poisionous" in the dataframe
	df_t.rename({"class": "poisonous"})
	dataframe["poisonous"] = df_t
	return dataframe

def dropExtraFeatures(dataframe, droppedCols):
	return dataframe.drop(droppedCols, axis=1)


def cleanNum(x):
	x = str(x)
	if(any(char.isdigit() for char in x)):
		return "unknown"
	else:
		return x

def cleanData(dataframe):
	categoryCols = ["cap-surface", "cap-color", "gill-attachment", "gill-color", "stem-color"]

	# Iterates through the columns, prints out counts of each data for each column
	for column in dataframe:
		df_series = dataframe[column].to_frame()
		print(df_series)
		# missing categorical values are replaced with the string "unknown"; numerical data is removed and replaced with unknown
		if (categoryCols.count(column) > 0):
			df_series = df_series.fillna(value="unknown")
			
			df_series = df_series.applymap(cleanNum)
			dataframe[column] = df_series.astype(str)
		# missing continous values are replaced with a mean
		else:  
			df_series = df_series.fillna(value=df_series.mean())
			dataframe[column] = df_series

	print(dataframe)

	return dataframe



currentlyIgnoring = ["cap-diameter", "stem-height", "season"]
droppedCols = ["id", "class", "cap-shape", "does-bruise-or-bleed", "gill-spacing", "stem-root", "stem-surface", "veil-type", 
	"veil-color", "ring-type", "spore-print-color", "habitat", "has-ring"]
droppedCols = droppedCols + currentlyIgnoring


print ("Remapping poisonous class...")

df = remapPoisonousClass(df)

for col in droppedCols:
	print("dropping " + col)
	df = df.drop(col, axis=1)

print("Remaining df:")
print(df)

print("Cleaning dataframe...")

df = cleanData(df)

Remapping poisonous class...
dropping id
dropping class
dropping cap-shape
dropping does-bruise-or-bleed
dropping gill-spacing
dropping stem-root
dropping stem-surface
dropping veil-type
dropping veil-color
dropping ring-type
dropping spore-print-color
dropping habitat
dropping has-ring
dropping cap-diameter
dropping stem-height
dropping season
Remaining df:
     cap-surface cap-color gill-attachment gill-color  stem-width stem-color  \
0              s         u               a          w       15.39          w   
1              h         o               a          n        6.48          o   
2              s         b               x          w        9.93          n   
3              y         g               s          g        6.53          w   
4              l         w               d          w        8.36          w   
...          ...       ...             ...        ...         ...        ...   
9995           g         y               d          n        2.58          n   

**Encoding features numerically**

Frankly, this data isn't a great fit for a logistic regression problem because most of the data is non-numeric. Our model will begin to correlate the encodings together. So this will (try) to mitigate that effect.

In [5]:

# not working yet
"""
prob_table = df['cap-color'].value_counts(normalize=True)
print(prob_table)







def countFeatureAndPoisonousCases(df, feature, printTable=False):
	features = [feature, "poisonous"]
	df_T = df[features]
	df_T = df_T.groupby(feature)["poisonous"].value_counts().sort_index()
	df_T = df_T.to_frame().reset_index()
	if printTable:
		print(df_T)
	#sns.barplot(x=df_T[feature], y=df_T["count"], hue=df_T["poisonous"])
	#plt.show()

countFeatureAndPoisonousCases(df, "cap-shape", printTable=True)

for attr in ["cap-shape", "cap-surface", "cap-color", "stem-color", "stem-surface", "gill-attachment","gill-color", "does-bruise-or-bleed", "season"]:
	countFeatureAndPoisonousCases(df, attr, printTable=True) """

cap-surface  cap-color  gill-attachment  gill-color  stem-width  stem-color  poisonous
h            n          a                n           1.46        n           1            0.0009
                                                     1.51        n           1            0.0008
                                                     1.66        n           1            0.0007
k            n          unknown          n           1.48        n           1            0.0006
                                                     1.50        n           1            0.0006
                                                                                           ...  
             y          unknown          o           13.29       y           1            0.0001
                        p                w           4.77        n           1            0.0001
                                         n           7.10        n           1            0.0001
                                        

KeyError: "['cap-shape'] not in index"