**Removing extra columns**

In [3]:
#Loading the data into a dataframe & clean

import csv
import pandas as pd
import matplotlib.pyplot as plt

#Importing a limited section of the dataset for now for easier processing
df = pd.read_csv("poisonous_mushrooms.csv", nrows=10000)
# Source: https://www.kaggle.com/datasets/davinascimento/poisonous-mushrooms?resource=download
# this assumes that you have the csv downloaded and stored in the same directory as this file

def remapPoisonousClass(dataframe):
	# Map poisionous or not to an integer
	mapping = {'p': 1, 'e': 0}
	df_t = dataframe["class"].map(mapping)

	# Rename target feature to "poisionous" in the dataframe
	df_t.rename({"class": "poisonous"})
	dataframe["poisonous"] = df_t
	return dataframe

def cleanNum(x):
	x = str(x)
	if(any(char.isdigit() for char in x)):
		return "unknown"
	else:
		return x

def cleanData(dataframe):
	# Iterates through the columns, prints out counts of each data for each column
	for column in dataframe:
		df_series = dataframe[column].to_frame()
		# missing categorical values are replaced with the string "unknown"; numerical data is removed and replaced with unknown
		if (not pd.api.types.is_numeric_dtype(df_series.dtypes[0])):
			df_series = df_series.fillna(value="unknown")
			
			df_series = df_series.applymap(cleanNum)
			dataframe[column] = df_series.astype(str)
		# missing continous values are replaced with a mean
		else:  
			df_series = df_series.fillna(value=df_series.mean())
			dataframe[column] = df_series

	print(dataframe)

	return dataframe


savedCols = ["poisonous"]
savedFeatures = ["cap-surface", "cap-color", "gill-attachment", "gill-color", "stem-width", "stem-color"]
savedCols = savedCols + savedFeatures

print(pd.api.types.is_numeric_dtype(df["class"].dtype))
#currentlyIgnoring = ["cap-diameter", "stem-height", "season"]
#droppedCols = ["id", "class", "cap-shape", "does-bruise-or-bleed", "gill-spacing", "stem-root", "stem-surface", "veil-type", 
#	"veil-color", "ring-type", "spore-print-color", "habitat", "has-ring"]

print ("Remapping poisonous class...")

df = remapPoisonousClass(df)

# Remove any columns we're choosing to ignore
for col in df.columns.to_list():
	if (col in savedCols):
		continue
	else:
		print("dropping " + col)
		df = df.drop(col, axis=1)

print("Remaining df:")
print(df)

print("Cleaning dataframe...")

df = cleanData(df)

False
Remapping poisonous class...
dropping id
dropping class
dropping cap-diameter
dropping cap-shape
dropping does-bruise-or-bleed
dropping gill-spacing
dropping stem-height
dropping stem-root
dropping stem-surface
dropping veil-type
dropping veil-color
dropping has-ring
dropping ring-type
dropping spore-print-color
dropping habitat
dropping season
Remaining df:
     cap-surface cap-color gill-attachment gill-color  stem-width stem-color  \
0              s         u               a          w       15.39          w   
1              h         o               a          n        6.48          o   
2              s         b               x          w        9.93          n   
3              y         g               s          g        6.53          w   
4              l         w               d          w        8.36          w   
...          ...       ...             ...        ...         ...        ...   
9995           g         y               d          n        2.58        

**One hot encoding**

This next phase sets up one hot encoding for our data for the logisitic model. Its way over complicated, and isn't finished as of yet. The goal was to pick which values to pick by multiplying the frequency with which the specific sample occurs to absolute value of the poison rate - 0.5 (in other words, trying to see how important this observation has for determining whether a mushroom is poisonous or not)

Probably will revert to taking the most common one...

In [37]:
class FeaturePair():
	def __init__(self, feature, float):
		self.featureName = feature
		self.float = float
	
	def featureName(self):
		return self.featureName
	
	def value(self):
		return self.float
	
	def __str__(self):
		return self.featureName
	
	def __repr__(self):
		return self.__str__()
	
	#@staticmethod
	#def sort(lhs:FeaturePair, rhs:FeaturePair):
	#	return lhs.value < rhs.value
def findObservationFrequency(dFrame:pd.DataFrame, feature):
	ser = dFrame[feature].value_counts(ascending=False)
	
	featureValues = dFrame[feature].unique()
	featurePairs = []

	for val in featureValues:
		num = ser.loc[val]
		featurePairs.append(FeaturePair(val, num))
	
	return featurePairs


def encodeFeatureIntoNgroups(dFrame:pd.DataFrame, feature, numEncodings:int, prints=False):
	numfeatureVals = dFrame[feature].nunique()
	if(numEncodings <= numfeatureVals):
		return dFrame[feature].unique()
	
	# If there are enough different values to do so, pick the "best" n of the values to use
	features = [feature, "poisonous"]
	df_T = df[features]
	df_T = df_T.groupby(feature)["poisonous"].value_counts(normalize=True).sort_index()
	df_T = df_T.to_frame().reset_index()
	if (prints):
		print(df_T.head())

	featureVals = dFrame[feature].unique()
	
	if (prints):
		print(featureVals)

	ediblePercentsFeaturePairs = []

	for val in featureVals:
		if(prints):
			print(val)
		valRows = df_T.loc[(df_T[feature] == val) & (df_T["poisonous"] == 0)]
		valRows.reset_index()
		ediblePercent = 0
		if(prints):
			print(valRows)
		if (valRows.shape[0] == 0):
			ediblePercent = 1
		elif (valRows.shape[0] == 1):
			ediblePercent = valRows["proportion"].iloc[0]
		else:
			print("Found several rows for value "+val+"")
			ediblePercent = valRows["proportion"].iloc[0]

		absEdiblePercent = abs(ediblePercent - 0.5)
		pair = FeaturePair(val, absEdiblePercent)
		ediblePercentsFeaturePairs.append(pair)

	
	if (prints):
		print(sortedFeaturePairs)
	
	obsFreqFeaturePairs = findObservationFrequency(dFrame, feature)

	edibleFPs = sorted(ediblePercentsFeaturePairs.append(pair), key=lambda entry: entry.featureName())
	freqFPs = sorted(obsFreqFeaturePairs, key=lambda entry: entry.featureName())

	print("Probability rates:")
	print(edibleFPs)
	print("Frequencies:")
	print(freqFPs)

	adjustFPs = []

	for i in range(numfeatureVals):
		entryName = features[i]
		entryValue = freqFPs[i].value() * edibleFPs[i].value()
		adjustFPs.append(FeaturePair(entryName, entryValue))
		

	sortedFeaturePairs = sorted(adjustFPs, key=lambda entry: entry.value())
	
	while len(sortedFeaturePairs) > numEncodings:
		sortedFeaturePairs.pop()





df_one_hot = pd.get_dummies(df["cap-color"], dtype=int)
print(df_one_hot.head())
print(df["cap-color"].value_counts(ascending=False))
results = encodeFeatureIntoNgroups(df, "cap-color", 5, prints=True)
print(results)




"""
def countFeatureAndPoisonousCases(df, feature, printTable=False):
	features = [feature, "poisonous"]
	df_T = df[features]
	df_T = df_T.groupby(feature)["poisonous"].value_counts().sort_index()
	df_T = df_T.to_frame().reset_index()
	if printTable:
		print(df_T)
	#sns.barplot(x=df_T[feature], y=df_T["count"], hue=df_T["poisonous"])
	#plt.show()

countFeatureAndPoisonousCases(df, "cap-shape", printTable=True)

for attr in ["cap-shape", "cap-surface", "cap-color", "stem-color", "stem-surface", "gill-attachment","gill-color", "does-bruise-or-bleed", "season"]:
	countFeatureAndPoisonousCases(df, attr, printTable=True)"""

   b  e  g  k  l  n  o  p  r  u  w  y
0  0  0  0  0  0  0  0  0  0  1  0  0
1  0  0  0  0  0  0  1  0  0  0  0  0
2  1  0  0  0  0  0  0  0  0  0  0  0
3  0  0  1  0  0  0  0  0  0  0  0  0
4  0  0  0  0  0  0  0  0  0  0  1  0
cap-color
n    4371
y    1223
w    1198
g     723
e     646
o     543
p     311
u     249
r     232
k     212
b     181
l     111
Name: count, dtype: int64
['u' 'o' 'b' 'g' 'w' 'n' 'e' 'y' 'r' 'p' 'k' 'l']


'\ndef countFeatureAndPoisonousCases(df, feature, printTable=False):\n\tfeatures = [feature, "poisonous"]\n\tdf_T = df[features]\n\tdf_T = df_T.groupby(feature)["poisonous"].value_counts().sort_index()\n\tdf_T = df_T.to_frame().reset_index()\n\tif printTable:\n\t\tprint(df_T)\n\t#sns.barplot(x=df_T[feature], y=df_T["count"], hue=df_T["poisonous"])\n\t#plt.show()\n\ncountFeatureAndPoisonousCases(df, "cap-shape", printTable=True)\n\nfor attr in ["cap-shape", "cap-surface", "cap-color", "stem-color", "stem-surface", "gill-attachment","gill-color", "does-bruise-or-bleed", "season"]:\n\tcountFeatureAndPoisonousCases(df, attr, printTable=True)'