In [352]:
import numpy as np
import pandas as pd

In [353]:
# Load the mushroom dataset from the CSV file in the working directory.
MUSHROOMS_CSV = "mushrooms.csv"
data = pd.read_csv(MUSHROOMS_CSV)

#Preprocessing

In [354]:
# Separate the target column from the predictor columns.
target_column = "class"
y = data[target_column]
x = data.drop(columns=[target_column])

In [355]:
# Feature columns that are label-encoded in the preprocessing loop.
# The order is unchanged; entries are only grouped by the part of the
# mushroom they describe to make the list easier to scan.
categorical_columns = [
    # cap
    "cap-shape", "cap-surface", "cap-color",
    # general appearance
    "bruises", "odor",
    # gills
    "gill-attachment", "gill-spacing", "gill-size", "gill-color",
    # stalk
    "stalk-shape", "stalk-root",
    "stalk-surface-above-ring", "stalk-surface-below-ring",
    "stalk-color-above-ring", "stalk-color-below-ring",
    # veil and ring
    "veil-type", "veil-color",
    "ring-number", "ring-type",
    # spores and environment
    "spore-print-color", "population", "habitat",
]


In [356]:
from sklearn.preprocessing import LabelEncoder
# Replace each categorical feature with integer codes, fitting a fresh
# LabelEncoder per column. After the loop, `le` stays bound to the encoder
# fitted on the last column in `categorical_columns`.
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(x[col])
    x[col] = le.transform(x[col])


In [357]:
# Encode the target with its own encoder rather than re-fitting the `le`
# variable left over from the feature-encoding loop. The encoded values are
# the same, but this cell no longer depends on a leaked loop variable, and
# `target_encoder.classes_` / `inverse_transform` remain available for
# mapping the integers back to the original class labels.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [358]:
# Display the encoded target to confirm the class labels are now integers.
y

array([1, 0, 0, ..., 0, 1, 0])

In [359]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=97,
)

#Using the default hyperparameters

In [360]:
from sklearn import tree
# Decision tree with default hyperparameters. Only random_state is pinned:
# scikit-learn permutes the candidate features at every split even with
# splitter="best", so an unseeded tree may differ from run to run.
tr = tree.DecisionTreeClassifier(random_state=97)


In [361]:
from sklearn.metrics import accuracy_score

In [362]:
# Fit the tree on the training split, then report accuracy on both splits.
tr.fit(x_train, y_train)
y_pred = tr.predict(x_test)
y_train_pred = tr.predict(x_train)
# accuracy_score is declared as (y_true, y_pred); pass the ground truth first
# in both calls so the two lines agree with each other and with the API.
print("Accuracy score of the test data is " + str(accuracy_score(y_test, y_pred)))
print("Accuracy score of the training data is " + str(accuracy_score(y_train, y_train_pred)))

Accuracy score of the test data is 1.0
Accuracy score of the training data is 1.0


In [363]:
#we see that the default hyperparameters give 100% accuracy on both the training data and the test data

#Using other Hyperparameters

In [364]:
# Deliberately constrained tree: shallow depth, entropy criterion, random
# split selection and at least two samples per leaf.
# splitter="random" draws split thresholds at random, so random_state is
# fixed to make the reported accuracy repeatable between runs.
tr = tree.DecisionTreeClassifier(
    max_depth=3,
    criterion="entropy",
    splitter="random",
    min_samples_leaf=2,
    random_state=97,
)

In [365]:
# Fit the constrained tree and report accuracy on the test and training splits.
tr.fit(x_train, y_train)
y_pred = tr.predict(x_test)
y_train_pred = tr.predict(x_train)
# accuracy_score is declared as (y_true, y_pred); pass the ground truth first
# in both calls so the two lines agree with each other and with the API.
print("Accuracy score of the test data " + str(accuracy_score(y_test, y_pred)))
print("Accuracy score of the training data " + str(accuracy_score(y_train, y_train_pred)))

Accuracy score of the test data 0.9635647464303299
Accuracy score of the training data 0.9668472017068768


In [366]:
#choosing different hyperparams reduces the accuracy of the model; for maximum accuracy the splitter should be "best" and max_depth should be left unlimited
#also, min_samples_leaf should be 1 for this data

#Random Forest

In [367]:
# Load the weather dataset used for the model comparison below.
WEATHER_CSV = "weather_forecast_data.csv"
data_e = pd.read_csv(WEATHER_CSV)

In [368]:
# Preview the first five rows of the weather data.
data_e.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,rain
1,27.879734,46.489704,5.952484,4.990053,992.61419,no rain
2,25.069084,83.072843,1.371992,14.855784,1007.23162,no rain
3,23.62208,74.367758,7.050551,67.255282,982.632013,rain
4,20.59137,96.858822,4.643921,47.676444,980.825142,no rain


In [369]:
# Separate the "Rain" target from the predictor columns.
# NOTE(review): the recorded preview of data_e lists an "Unnamed: 0" column.
# If that is a real CSV column (e.g. a saved row index) rather than a display
# artifact, it is kept as a feature here -- confirm and drop it if so.
rain_column = "Rain"
y = data_e[rain_column]
x = data_e.drop(columns=[rain_column])

In [370]:
# Encode the "Rain" labels with a dedicated encoder instead of re-fitting the
# `le` variable left over from the mushroom preprocessing loop. The encoded
# values are the same, but this cell no longer depends on state leaked from
# an unrelated section, and `rain_encoder.classes_` keeps the mapping back to
# the original labels.
rain_encoder = LabelEncoder()
y = rain_encoder.fit_transform(y)

In [371]:
# Hold out 25% of the weather rows for testing, with a fixed seed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=97,
)

In [372]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
# x_train and x_test are overwritten with the scaled arrays.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [373]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression


# The three classifiers compared on the weather data.
# random_state is fixed on the stochastic models so the comparison is
# repeatable: SVC uses it for the internal shuffling done when
# probability=True, and RandomForestClassifier for bootstrapping and
# feature sampling.
svc = SVC(probability=True, random_state=97)
log = LogisticRegression()
rfc = RandomForestClassifier(random_state=97)



In [374]:


# Fit each classifier on the scaled training data and print its test accuracy.
for model in (svc, log, rfc):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # accuracy_score is declared as (y_true, y_pred): ground truth goes first.
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score of model {model} is {score}")

Accuracy Score of model SVC(probability=True) is 0.976
Accuracy Score of model LogisticRegression() is 0.9296
Accuracy Score of model RandomForestClassifier() is 0.9984


In [375]:
#from the above output we see that Random Forest is more accurate than both Logistic Regression and SVC

#Using different pipelines

In [376]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

def _build_pipeline(scaler, model):
    """Return an impute -> scale -> classify pipeline.

    Parameters
    ----------
    scaler : transformer
        Scaling step placed after mean imputation (e.g. ``MinMaxScaler()``).
    model : estimator
        Final classifier of the pipeline.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``'imputer'``, ``'scaler'`` and
        ``'model'``.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', scaler),
        ('model', model),
    ])


# Seed shared by every tree-based model so the five pipelines give
# repeatable results (same value as the train/test split seed).
PIPELINE_SEED = 97

# Five scaler/model combinations to compare.
p1 = _build_pipeline(MinMaxScaler(), DecisionTreeClassifier(random_state=PIPELINE_SEED))
p2 = _build_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(min_samples_leaf=3, random_state=PIPELINE_SEED),
)
p3 = _build_pipeline(MinMaxScaler(), RandomForestClassifier(random_state=PIPELINE_SEED))
p4 = _build_pipeline(StandardScaler(), RandomForestClassifier(random_state=PIPELINE_SEED))
p5 = _build_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=50, criterion='entropy', random_state=PIPELINE_SEED),
)

# NOTE(review): x_train / x_test were already standardised by the earlier
# StandardScaler cell, so each pipeline's scaler runs on pre-scaled data.
# Feed the pipelines the unscaled split if they are meant to own scaling.
for pipeline in (p1, p2, p3, p4, p5):
    pipeline.fit(x_train, y_train)

In [378]:
# Print the test accuracy of every pipeline.
# Fixes the "Accurace" typo in the printed text and passes the arguments to
# accuracy_score in its declared (y_true, y_pred) order.
pipelines_by_ordinal = {
    "first": p1,
    "second": p2,
    "third": p3,
    "fourth": p4,
    "fifth": p5,
}
for ordinal, pipeline in pipelines_by_ordinal.items():
    score = accuracy_score(y_test, pipeline.predict(x_test))
    print("Accuracy of the " + ordinal + " pipeline: " + str(score))

Accurace of the first pipeline: 0.9984
Accurace of the second pipeline: 0.9984
Accurace of the third pipeline: 0.9984
Accurace of the fourth pipeline: 0.9984
Accurace of the fifth pipeline: 0.9984
