In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv('covid_toy.csv', usecols=['age','fever', 'has_covid'])

In [4]:
df.sample()

Unnamed: 0,age,fever,has_covid
31,83,103.0,No


In [7]:
df.isnull().sum()

age          0
fever        0
has_covid    0
dtype: int64

In [6]:
df.fillna(df['fever'].mean(),inplace=True)

In [20]:
x = df.iloc[:,0:2]
y = df.iloc[:,2:]

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3, random_state=42)

In [24]:
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

accuracy_score(y_test, y_pred)

0.4666666666666667

In [28]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y, cv=20,scoring='accuracy'))

np.float64(0.53)

In [32]:
kbins_age = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile')
kbins_fever = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='quantile')

In [33]:
trf = ColumnTransformer([
    ('bins_age',kbins_age, [0]),
    ('bins_fever', kbins_fever, [1])
])

In [34]:
x_train_trf = trf.fit_transform(x_train)
x_test_trf = trf.transform(x_test)

In [35]:
clf = DecisionTreeClassifier()
clf.fit(x_train_trf, y_train)

y_pred2 = clf.predict(x_test_trf)

accuracy_score(y_test, y_pred2)

0.5333333333333333

In [37]:
x_trf = trf.fit_transform(x)

In [38]:
np.mean(cross_val_score(DecisionTreeClassifier(), x_trf, y, cv=10, scoring='accuracy'))

np.float64(0.47000000000000003)