In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
# `import matplotlib as plt` alone does not load the pyplot submodule; import it
# explicitly so that `plt.pyplot.*` resolves on a freshly started kernel.
import matplotlib.pyplot

from preprocess_dental import preprocess_dental_data
from data_preprocess import preprocess

# Build the base feature table with the project's `preprocess` helper.
# NOTE(review): presumably returns a pandas DataFrame prepared for modelling
# (it is merged and `.describe()`d below) — confirm in data_preprocess.
data = preprocess(for_modelling=True)
# Summary statistics as a quick sanity check of the loaded features.
data.describe()

In [None]:
# Target columns used by the rest of the notebook; 'TOTAL_CARIES_COUNT' is
# currently commented out of the list.
labels = ['TOTAL_SIMPLE_01','SERIOUS_01']#,'TOTAL_CARIES_COUNT']
# Load the dental table through the project helper and keep only the target columns.
# NOTE(review): the meaning of usage='01', drop_all_na, skipna and set_index is defined
# in preprocess_dental — confirm that set_index=True produces an index compatible with
# `data`, because the following merge joins the two frames on their indexes.
dental_data = preprocess_dental_data(usage='01', drop_all_na=True, skipna=False, set_index=True)[labels]
# Summary statistics of the target columns.
dental_data.describe()

In [None]:
# Attach the target columns to the features with an index-on-index inner join,
# so only rows whose index appears in both tables are kept.
data = dental_data.merge(data, how="inner", left_index=True, right_index=True)
data.describe()

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Separate the merged frame into targets (Y) and features (data).
# Note: this cell rebinds `data`, so it cannot be re-run without re-running the merge first.
Y = data.loc[:, labels]
data = data.drop(labels, axis="columns")
data.describe()

In [None]:
# Summary statistics of the target columns that were split off from the features.
Y.describe()

In [None]:
# Partition the feature columns by dtype: pandas categoricals vs. everything else.
categorical_cols = [
    col for col in data.columns
    if isinstance(data[col].dtype, pd.api.types.CategoricalDtype)
]
numerical_cols = [col for col in data.columns if col not in categorical_cols]

num_data = data[numerical_cols]

# Standardise the non-categorical columns, then project them onto three principal
# components. Despite its name, `scaler` holds the standardised array returned by
# fit_transform, not the StandardScaler object; likewise `pca` is the projected array.
scaler = StandardScaler().fit_transform(num_data)
pca = PCA(n_components=3).fit_transform(scaler)

In [None]:
# First two principal components, coloured by the TOTAL_SIMPLE_01 target.
# Uses an explicit figure/axes pair and labels everything so the plot can be
# read on its own; the colour bar documents what the point colours encode.
fig, ax = plt.pyplot.subplots()
points = ax.scatter(pca[:, 0], pca[:, 1], c=Y['TOTAL_SIMPLE_01'])
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_title('PCA of numerical features, coloured by TOTAL_SIMPLE_01')
fig.colorbar(points, ax=ax, label='TOTAL_SIMPLE_01')
plt.pyplot.show()

In [None]:
# All three principal components in a 3D scatter, coloured by the SERIOUS_01 target.
fig = plt.pyplot.figure()
ax = fig.add_subplot(projection='3d')
points = ax.scatter(pca[:, 0], pca[:, 1], pca[:, 2], c=Y['SERIOUS_01'])
# Label the axes and the colour encoding so the figure stands alone.
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
ax.set_title('PCA of numerical features, coloured by SERIOUS_01')
fig.colorbar(points, ax=ax, label='SERIOUS_01')
plt.pyplot.show()

In [None]:
# Fit a dense one-hot encoder on the categorical feature columns.
cat_data = data[categorical_cols]
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the current name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
# `fit` returns the estimator itself, so `ohe` remains the fitted encoder.
ohe = ohe.fit(cat_data)


In [None]:
# Display the fitted OneHotEncoder (`.fit` returns the estimator, so this shows its repr).
ohe

In [None]:
# Assemble the modelling table: target + PCA components + one-hot encoded categoricals.
# `ohe` is the fitted encoder, so the categorical columns must be passed through
# `transform` to obtain the encoded matrix (a DataFrame cannot be built from the
# estimator object itself). Prefixes keep the PCA and one-hot column labels distinct
# instead of both being 0, 1, 2, ...
# The target's index is reset so all three parts align positionally on a RangeIndex.
data = pd.concat(
    [
        Y['SERIOUS_01'].reset_index(drop=True),
        pd.DataFrame(pca).add_prefix('pc_'),
        pd.DataFrame(ohe.transform(cat_data)).add_prefix('ohe_'),
    ],
    axis=1,
)
data.describe()

In [None]:
from sklearn.model_selection import train_test_split
# Separate the binary target from the feature matrix.
y = data['SERIOUS_01']
x = data.drop(columns=['SERIOUS_01'])
# Hold out 20% for evaluation. The split is seeded (same seed as the random forest
# below) so that the scores reported in later cells are reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Shallow random forest baseline; `fit` returns the estimator, so construction and
# training are chained. The final expression is the mean accuracy on the held-out set.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, Y_train)
clf.score(X_test, Y_test)

In [None]:
from sklearn.metrics import f1_score
# F1 score of the random forest on the held-out set.
f1_score(y_true=Y_test, y_pred=clf.predict(X_test))

In [None]:
# Per-feature importances of the fitted random forest, in the column order of X_train.
clf.feature_importances_

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings; mean accuracy on the held-out set.
svc = SVC().fit(X_train, Y_train)
svc.score(X_test, Y_test)

In [None]:
# Same classifier with a polynomial kernel, for comparison with the default above.
svc_2 = SVC(kernel='poly').fit(X_train, Y_train)
svc_2.score(X_test, Y_test)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# Quadratic discriminant analysis baseline; mean accuracy on the held-out set.
qda = QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
qda.score(X_test, Y_test)

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over the shallow random forest defined earlier in the notebook.
# scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 removed the old
# keyword; try the current name first and fall back for older installations.
# The booster is seeded so the reported score is reproducible.
try:
    ada = AdaBoostClassifier(estimator=clf, random_state=0)
except TypeError:
    ada = AdaBoostClassifier(base_estimator=clf, random_state=0)
ada.fit(X_train, Y_train)
ada.score(X_test, Y_test)