In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from utils import plot_kfold_subplot
from pycm import ConfusionMatrix

# Reproducibility: a single seed for NumPy's global RNG; the same constant is
# reused as `random_state` by later cells.
SEED = 199510
np.random.seed(SEED)

# Default figure size for plots that do not set their own.
plt.rcParams['figure.figsize'] = (15, 15)

# Integer label -> class name for the 7-class labelling (index == label).
map_7_classes = dict(enumerate([
    'normal_superficiel',
    'normal_intermediate',
    'normal_columnar',
    'light_dysplastic',
    'moderate_dysplastic',
    'severe_dysplastic',
    'carcinoma_in_situ',
]))

# Integer label -> class name for the binary labelling.
map_2_classes = dict(enumerate(['normal', 'anormal']))


In [None]:
# Full table as stored on disk; `df` keeps every column, labels included.
df = pd.read_csv('smear2005/dataset.csv')

# Feature table: the same rows with the ID column and all class columns removed.
dataset = df.drop(
    columns=[
        "Class",
        "ID",
        "Class_cat_7",
        "Class_num_7",
        "Class_cat_2",
        "Class_num_2",
    ]
)

In [None]:
# Number of folds shared by both splitters in this illustration.
KFOLDS = 5

# Rows ordered by their numeric 7-class label before being handed to the splitters.
df_kfold = df.sort_values(by="Class_num_7")

# Plain K-fold and stratified K-fold with the same fold count (neither shuffles).
kf = KFold(n_splits=KFOLDS)
skf = StratifiedKFold(n_splits=KFOLDS)

# `plot_kfold_subplot` comes from the project's `utils` module; it is given both
# splitters plus the ID and 7-class category columns of the sorted frame.
plot_kfold_subplot(
    kf, skf,
    df_kfold["ID"], df_kfold["Class_cat_7"],
    n_splits=KFOLDS, lw=20, figsize=(15, 10),
)


In [None]:
# Binary task: features come from `dataset`, targets from the 2-class column.
X = dataset.values
y = df["Class_cat_2"].values

# Hold out 20% of the rows for validation, stratified on the target.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)

In [None]:
# Candidate classifiers as (short name, unfitted estimator) pairs.
# Estimators that accept a `random_state` and use internal randomness are seeded
# explicitly so their scores do not depend on the global NumPy RNG state (and
# therefore on the order in which cells were executed).
models = [
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr', random_state=SEED)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=SEED)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('MLP1', MLPClassifier(random_state=SEED)),
    ('MLP2', MLPClassifier(hidden_layer_sizes=(100, 100), random_state=SEED)),
]

# One seeded, shuffled 10-fold stratified splitter shared by every model; with an
# integer random_state it yields the same folds on each use, so building it once
# outside the loop is equivalent to rebuilding it per model.
kfold = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)

# Evaluate each model in turn, collecting per-fold accuracies for later plotting.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean()} +/- ({cv_results.std()})")
	

In [None]:
# Box plot of the cross-validation accuracies, one box per model.
fig, ax = plt.subplots(figsize=(15, 10))
ax.boxplot(results)
# Name the boxes through the tick labels instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate; boxes sit at positions 1..N.
ax.set_xticks(range(1, len(names) + 1))
ax.set_xticklabels(names)
ax.set_title('Comparativa de algoritmos')
plt.show()

In [None]:
# Fit LDA on the training split, then predict the held-out validation rows.
model = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [None]:
# True vs. predicted labels for the validation rows; any value that matches a
# key of `map_2_classes` is replaced by the corresponding class name.
df_pred = pd.DataFrame({"Real": Y_validation, "Pred": predictions}).replace(map_2_classes)

# Keep only the rows where prediction and truth disagree, and report how many.
df_error = df_pred.loc[df_pred["Real"].ne(df_pred["Pred"])]
print(len(df_error))

In [None]:
# Number of misclassified validation rows per predicted class.
df_error.groupby(by="Pred").size()

In [None]:
# Confusion matrix (pycm) of the validation labels against the predictions;
# printing it emits pycm's text report.
cm = ConfusionMatrix(
    actual_vector=Y_validation,
    predict_vector=predictions,
)
print(cm)

In [None]:
# Draw the confusion matrix; number_label=True presumably writes the value into
# each cell (confirm against pycm docs). The trailing `;` hides the return repr.
cm.plot(number_label=True);

In [None]:
# Same plot with normalized=True — presumably the normalized confusion matrix
# (confirm against pycm docs). The trailing `;` hides the return repr.
cm.plot(number_label=True, normalized=True);

In [None]:
# 7-class task: same features, targets from the 7-class column.
X = dataset.values
y = df["Class_cat_7"].values

# Hold out 20% of the rows for validation, stratified on the target. The split
# is seeded with the notebook-wide SEED (instead of a stray literal) so it
# follows the same reproducibility setting as the rest of the notebook.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=SEED,
    stratify=y,
)

In [None]:
# Candidate classifiers as (short name, unfitted estimator) pairs.
# Estimators that accept a `random_state` and use internal randomness are seeded
# explicitly so their scores do not depend on the global NumPy RNG state (and
# therefore on the order in which cells were executed).
models = [
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr', random_state=SEED)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=SEED)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
    ('MLP1', MLPClassifier(random_state=SEED)),
    ('MLP2', MLPClassifier(hidden_layer_sizes=(100, 100), random_state=SEED)),
]

# One seeded, shuffled 10-fold stratified splitter shared by every model; with an
# integer random_state it yields the same folds on each use, so building it once
# outside the loop is equivalent to rebuilding it per model.
kfold = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)

# Evaluate each model in turn, collecting per-fold accuracies for later plotting.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean()} +/- ({cv_results.std()})")
	

In [None]:
# Box plot of the cross-validation accuracies, one box per model.
fig, ax = plt.subplots(figsize=(15, 10))
ax.boxplot(results)
# Name the boxes through the tick labels instead of boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate; boxes sit at positions 1..N.
ax.set_xticks(range(1, len(names) + 1))
ax.set_xticklabels(names)
ax.set_title('Comparativa de algoritmos')
plt.show()

In [None]:
# Fit LDA on the training split, then predict the held-out validation rows.
model = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [None]:
# True vs. predicted labels for the validation rows of the 7-class task.
df_pred = pd.DataFrame({"Real": Y_validation, "Pred": predictions})

# Translate integer labels to class names with the 7-class map. Using the
# 2-class map here would rename labels 0 and 1 to the binary names and leave
# labels 2-6 untouched.
df_pred = df_pred.replace(map_7_classes)

# Keep only the rows where prediction and truth disagree, and report how many.
df_error = df_pred[df_pred["Real"] != df_pred["Pred"]]
print(len(df_error.index))

In [None]:
# Number of misclassified validation rows per predicted class.
df_error.groupby(by="Pred").size()

In [None]:
# Confusion matrix (pycm) of the validation labels against the predictions;
# printing it emits pycm's text report.
cm = ConfusionMatrix(
    actual_vector=Y_validation,
    predict_vector=predictions,
)
print(cm)

In [None]:
# Draw the confusion matrix; number_label=True presumably writes the value into
# each cell (confirm against pycm docs). The trailing `;` hides the return repr.
cm.plot(number_label=True);

In [None]:
# Same plot with normalized=True — presumably the normalized confusion matrix
# (confirm against pycm docs). The trailing `;` hides the return repr.
cm.plot(number_label=True, normalized=True);