In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [1]:
# Read the full table from the fixed Drive path.
df = pd.read_csv("/content/drive/MyDrive/alzheimer/alzheimer_gs.csv")

# Features: every column except the last two, minus the 'Unnamed: 0' column.
data = df.iloc[:, :-2].drop(columns='Unnamed: 0')

# Target columns are picked up by name.
label = df['label']
experiment_label = df['ds']

In [None]:
# Count the distinct values in each target column.
len_label = np.unique(label).size
len_experiment = np.unique(experiment_label).size

# One "class" per (label, experiment) combination.
class_count = len_label * len_experiment

In [2]:
# MY TOOLS

class reduction:
  """Dimensionality-reduction helpers that share one data set and output size.

  Every method builds a fresh reducer with ``dimension`` components, fits it
  on ``train_data`` and returns the transformed data.
  """

  def __init__(self,
               train_data,
               dimension):
    """Store the data to reduce and the number of output components."""
    self.train_data = train_data
    self.dimension = dimension

  def pca(self):
    """Return ``train_data`` projected with scikit-learn ``PCA``."""
    from sklearn.decomposition import PCA

    pca = PCA(n_components=self.dimension)
    reducted_data = pca.fit_transform(self.train_data)

    return reducted_data

  def kernel_pca(self,
                 kernel):
    """Return ``train_data`` projected with ``KernelPCA`` using ``kernel``."""
    from sklearn.decomposition import KernelPCA

    k_pca = KernelPCA(n_components=self.dimension, kernel=kernel)
    reducted_data = k_pca.fit_transform(self.train_data)

    return reducted_data

  def svd(self):
    """Return ``train_data`` projected with ``TruncatedSVD``."""
    from sklearn.decomposition import TruncatedSVD

    svd = TruncatedSVD(n_components=self.dimension)
    reducted_data = svd.fit_transform(self.train_data)

    return reducted_data

  def umap(self,
           neighbors,
           metric):
    """Return the UMAP embedding of ``train_data``.

    ``neighbors`` is forwarded as ``n_neighbors`` and ``metric`` as ``metric``.
    If the ``umap`` package cannot be imported it is installed with pip into
    the interpreter that is running this code, then imported again.
    """
    try:
      import umap as umap_lib
    except ImportError:
      # Plain-Python replacement for the IPython-only ``!pip install``:
      # ``sys.executable -m pip`` installs into the running kernel's
      # environment rather than whatever ``pip`` happens to be on PATH.
      import importlib
      import subprocess
      import sys

      subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'umap-learn'])
      importlib.invalidate_caches()  # make the fresh install importable
      import umap as umap_lib

    # A distinct local name keeps the imported module from being shadowed.
    reducer = umap_lib.UMAP(n_neighbors=neighbors, n_components=self.dimension, metric=metric)
    embedding = reducer.fit_transform(self.train_data)

    return embedding



class metrics:
  """Scoring helpers built on ``sklearn.metrics``.

  ``data`` is passed as the first argument of every scikit-learn scorer: the
  feature matrix for ``silhouette`` and the true labels for all other
  methods. ``predicted_labels`` is passed as the second argument, except in
  ``roc_auc`` which takes its scores as a parameter instead.
  """

  def __init__(self,
               data,
               predicted_labels):
    """Store the reference data and the predicted labels."""
    self.data = data
    self.predicted_labels = predicted_labels

  def silhouette(self):
    """Return the euclidean silhouette score of ``predicted_labels`` on ``data``."""
    from sklearn.metrics import silhouette_score

    acc_score = silhouette_score(self.data, self.predicted_labels, metric='euclidean')

    return acc_score

  def f1_score(self):
    """Return the F1 score with scikit-learn's default settings."""
    from sklearn.metrics import f1_score

    acc_score = f1_score(self.data, self.predicted_labels)

    return acc_score

  def roc_auc(self,
              predicted_prob,
              multiclass):
    """Return the ROC AUC of ``predicted_prob`` against ``data``.

    When ``multiclass`` is truthy the score is computed one-vs-rest.
    """
    from sklearn.metrics import roc_auc_score

    if multiclass:
      # The scikit-learn keyword is ``multi_class``; ``multiclass`` raises
      # a TypeError.
      acc_score = roc_auc_score(self.data, predicted_prob, multi_class='ovr')

    else:
      acc_score = roc_auc_score(self.data, predicted_prob)

    return acc_score

  def simple_acc(self):
    """Return the plain accuracy of ``predicted_labels`` against ``data``."""
    from sklearn.metrics import accuracy_score

    acc_score = accuracy_score(self.data, self.predicted_labels)

    return acc_score

  def confusion_matrix(self):
    """Return the confusion matrix of ``predicted_labels`` against ``data``."""
    from sklearn.metrics import confusion_matrix

    # Use the instance's own labels, not whatever a global ``data`` holds.
    cm = confusion_matrix(self.data, self.predicted_labels)

    return cm



class algorithms:
  """Wrappers that fit a model and bundle it with its labels and score.

  Each model method returns a dict with three keys:

  * ``'model'``  - the fitted estimator,
  * ``'labels'`` - cluster assignments (clustering) or predictions on
    ``test_data`` (classifiers),
  * ``'acc'``    - silhouette score (clustering) or accuracy on
    ``test_labels`` (classifiers).

  The dimensionality-reduction methods delegate to the ``reduction`` class
  and return the transformed ``train_data``.
  """

  def __init__(self,
               train_data,
               train_labels=None,
               test_data=None,
               test_labels=None):
    """Store the data splits; only ``train_data`` is needed for
    clustering and dimensionality reduction."""
    self.random_state = 42  # one seed shared by every seedable estimator
    self.train_data = train_data
    self.train_labels = train_labels
    self.test_data = test_data
    self.test_labels = test_labels

  def _silhouette_or_nan(self, cluster_labels):
    """Silhouette score of ``cluster_labels``, or NaN when it is undefined.

    scikit-learn raises ValueError unless the number of distinct labels is
    between 2 and n_samples - 1; a clustering that collapses to one label
    (for example DBSCAN marking everything as noise) is reported as NaN
    instead of aborting the run.
    """
    n_labels = np.unique(cluster_labels).size
    if n_labels < 2 or n_labels >= len(cluster_labels):
      return float('nan')

    return metrics(self.train_data, cluster_labels).silhouette()

  def _fit_and_score(self, classifier):
    """Fit ``classifier`` on the train split and score it on the test split."""
    classifier.fit(self.train_data, self.train_labels)
    predictions = classifier.predict(self.test_data)
    acc_score = metrics(self.test_labels, predictions).simple_acc()

    return {'model': classifier, 'labels': predictions, 'acc': acc_score}

  def kmeans(self,
             class_count):
    """Cluster ``train_data`` into ``class_count`` groups with KMeans."""
    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=class_count, random_state=self.random_state, n_init=20)
    kmeans.fit(self.train_data)

    return {
        'model': kmeans,
        'labels': kmeans.labels_,
        'acc': self._silhouette_or_nan(kmeans.labels_),
    }

  def dbscan(self,
             eps,
             min_samples):
    """Cluster ``train_data`` with DBSCAN using ``eps`` and ``min_samples``.

    Note: the label array (including DBSCAN's noise label) is scored as-is.
    """
    from sklearn.cluster import DBSCAN

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan.fit(self.train_data)

    return {
        'model': dbscan,
        'labels': dbscan.labels_,
        'acc': self._silhouette_or_nan(dbscan.labels_),
    }

  def svm(self,
          kernel,
          c):
    """Train an ``SVC`` with the given ``kernel`` and regularisation ``c``."""
    from sklearn.svm import SVC

    return self._fit_and_score(SVC(kernel=kernel, C=c, random_state=self.random_state))

  def dtc(self):
    """Train a ``DecisionTreeClassifier`` seeded with ``random_state``."""
    from sklearn.tree import DecisionTreeClassifier

    # Seeded like the other estimators so repeated runs give the same tree.
    return self._fit_and_score(DecisionTreeClassifier(random_state=self.random_state))

  def rf(self,
         n_estimators,
         max_depth,
         max_features):
    """Train a ``RandomForestClassifier`` seeded with ``random_state``."""
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    max_features=max_features,
                                    random_state=self.random_state)

    return self._fit_and_score(forest)

  def mlp(self,
          layers,
          activation_func,
          loss,
          optimizer,
          metric,
          epochs):
    """Train a Keras MLP with one sigmoid output unit.

    ``layers`` gives the hidden-layer widths, either as a dict (its values
    are used in insertion order) or as a plain sequence. ``metric`` is
    forwarded to ``compile`` as ``metrics``. Predicted probabilities above
    0.5 become label 1, the rest label 0; ``'labels'`` is a 1-D array.
    """
    try:
      from tensorflow.keras.models import Sequential
      from tensorflow.keras.layers import Dense
    except ImportError:
      # Plain-Python replacement for the IPython-only ``!pip install``;
      # installs into the interpreter that is running this code.
      import importlib
      import subprocess
      import sys

      subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow'])
      importlib.invalidate_caches()
      from tensorflow.keras.models import Sequential
      from tensorflow.keras.layers import Dense

    layer_sizes = list(layers.values()) if isinstance(layers, dict) else list(layers)

    mlp = Sequential()
    for position, units in enumerate(layer_sizes):
      if position == 0:
        # Only the first layer needs to be told the input width.
        mlp.add(Dense(units, input_dim=self.train_data.shape[1], activation=activation_func))
      else:
        mlp.add(Dense(units, activation=activation_func))

    mlp.add(Dense(1, activation='sigmoid'))
    mlp.compile(loss=loss, optimizer=optimizer, metrics=metric)
    mlp.fit(self.train_data, self.train_labels, epochs=epochs)

    threshold = 0.5

    probabilities = mlp.predict(self.test_data)
    # Flatten the (n_samples, 1) output so the labels line up with
    # ``test_labels``; the original stored the flattened array under a
    # misspelt name and returned the 2-D one.
    predictions = np.where(probabilities > threshold, 1, 0).reshape(-1)

    acc_score = metrics(self.test_labels, predictions).simple_acc()

    return {'model': mlp, 'labels': predictions, 'acc': acc_score}

  def pca(self,
          dimension):
    """Return ``train_data`` reduced to ``dimension`` components with PCA."""
    return reduction(self.train_data, dimension).pca()

  def kernel_pca(self,
                 dimension,
                 kernel):
    """Return ``train_data`` reduced with kernel PCA using ``kernel``."""
    return reduction(self.train_data, dimension).kernel_pca(kernel)

  def svd(self,
          dimension):
    """Return ``train_data`` reduced with truncated SVD."""
    return reduction(self.train_data, dimension).svd()

  def umap(self,
           neighbors,
           dimension,
           metric):
    """Return the UMAP embedding of ``train_data``."""
    return reduction(self.train_data, dimension).umap(neighbors, metric)

In [None]:
# Project the feature matrix onto two principal components.
reducer = algorithms(train_data=data)
PCA_data = reducer.pca(dimension=2)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the PCA-projected samples, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    PCA_data,
    label,
    test_size=0.2,
    random_state=42,
)

algorithm = algorithms(
    train_data=X_train,
    train_labels=y_train,
    test_data=X_test,
    test_labels=y_test,
)
results = {}

# Hidden-layer widths handed to the MLP, keyed by layer position.
layers = {'1': 50, '2': 50, '3': 50}

In [None]:
# RBF-kernel SVM with a fixed C value.
svm = algorithm.svm(kernel='rbf', c=2.1493544617380334)

In [None]:
# Decision tree trained and scored on the same split.
dtc = algorithm.dtc()

In [None]:
# Random forest: 526 shallow trees (depth 3), sqrt feature sampling.
rf = algorithm.rf(
    n_estimators=526,
    max_depth=3,
    max_features='sqrt',
)

In [3]:
# Keras MLP using the hidden-layer widths in `layers`, trained for 100 epochs.
mlp = algorithm.mlp(
    layers=layers,
    activation_func='relu',
    loss='binary_crossentropy',
    optimizer='adam',
    metric=['accuracy'],
    epochs=100,
)

In [4]:
# Compare the test accuracy of the four classifiers in one bar chart.
algorithm_names = ['SVM', 'DTC', 'RF', 'MLP']
scores = [svm['acc'], dtc['acc'], rf['acc'], mlp['acc']]

colors = ['#cd853f', '#c1cdc1', '#4a708b', '#ff6a6a']

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(algorithm_names, scores, color=colors)

# Round to the nearest percent. int(score*100) truncates, so a float such as
# 0.29 (stored as 28.999...) was labelled "28%".
percentage_scores = [f"{score:.0%}" for score in scores]

# Write each percentage just above its bar.
for i, v in enumerate(scores):
    ax.text(i, v+0.01, percentage_scores[i], ha='center', fontsize=12)

ax.set_ylabel('Accuracy')
ax.set_title('Classification Algorithm Comparison')
plt.show()

In [5]:
# 15-dimensional UMAP embedding (80 neighbours, euclidean metric).
reducer = algorithms(train_data=data)
embedding = reducer.umap(neighbors=80, dimension=15, metric='euclidean')

In [None]:
# Cluster the UMAP embedding with KMeans and DBSCAN.
cluster = algorithms(train_data=embedding)
labels = {}
samples_count = len(embedding)

# DBSCAN's min_samples: sample count divided by class count, truncated.
min_samples = int(samples_count / class_count)
kmeans = cluster.kmeans(class_count=2)
dbscan = cluster.dbscan(eps=5, min_samples=min_samples)

In [6]:
# One scatter plot per clustering, drawn on the first two embedding
# coordinates and coloured by cluster label.
cluster_plots = (
    (kmeans, f"UMAP KMEANS Score : {kmeans['acc']}"),
    (dbscan, f"UMAP DBSCAN Score : {dbscan['acc']}"),
)

for clustering, plot_title in cluster_plots:
    plt.scatter(embedding[:, 0], embedding[:, 1], c=clustering['labels'], s=50, cmap='plasma')
    plt.title(plot_title)
    plt.show()

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector as SFS
import seaborn as sns

# Predictions from the feature-selection experiments are collected here.
results = []

# Split the unreduced feature matrix: 20% held out, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)

# Three independent decision trees and one RBF SVM for the cells below.
dtc_model1, dtc_model2, dtc_model3 = (DecisionTreeClassifier() for _ in range(3))
svm_model = SVC(kernel='rbf')

In [7]:
# Recursive feature elimination with a decision tree, keeping 500 features.
# Author's recorded note: data reduced with UMAP scored 0.66 accuracy.

rfe = RFE(
    estimator=dtc_model1,
    n_features_to_select=500,
)
X_rfe = rfe.fit(X_train, y_train).transform(X_train)
rfe_pred = rfe.predict(X_test)
results.append(rfe_pred)

# Accuracy of the RFE-wrapped tree on the held-out split.
score = metrics(y_test, rfe_pred)
acc = score.simple_acc()

print(acc)

In [8]:
# Decision tree on the data without any dimension reduction.
# Author's recorded note: this scored 0.60 accuracy.

dtc_model2.fit(X_train, y_train)
dtc_pred = dtc_model2.predict(X_test)

# Rank the features from most to least important.
feature_importances = dtc_model2.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_importances = feature_importances[sorted_indices]

# Keep at most the 20 best features and drop those with zero importance.
# The values are sorted in descending order, so the non-zero ones form a
# prefix of the first 20.
count = int(np.count_nonzero(sorted_importances[:20]))
importances = sorted_importances[:count].tolist()

for val in importances:
  print(val)

# Take the names of the top `count` features, in the same order as
# `importances`. The previous slice `[::-20][-count:]` picked every 20th
# index in reverse, so the names did not match the plotted values.
selected_cols = X_train.columns[sorted_indices[:count]]

final_df = pd.DataFrame({'Features':selected_cols, 'Importances': importances})

score = metrics(y_test, dtc_pred)
acc = score.simple_acc()


plt.figure(figsize=(20,6))
plt.title(f"DTC Score : {acc}")
sns.barplot(x='Features', y='Importances', data=final_df)
plt.xticks(rotation=45)
plt.show()

In [9]:
# Retrain a decision tree on only the columns in `selected_cols` (the most
# important features of the previously trained model).
# Author's recorded note: this scored 0.566 accuracy.

# Select by column label on both frames. The previous version computed the
# test positions into a misspelt, unused variable and indexed X_test with
# the train positions.
X_train_selected_cols = X_train.loc[:, selected_cols]
X_test_selected_cols = X_test.loc[:, selected_cols]

dtc_model3.fit(X_train_selected_cols, y_train)
dtc_pred = dtc_model3.predict(X_test_selected_cols)

score = metrics(y_test, dtc_pred)
acc = score.simple_acc()
print(acc)

In [None]:
# Forward sequential feature selection with the SVM, keeping 500 features.
sfs_forward = SFS(svm_model, n_features_to_select=500, direction='forward', scoring='accuracy')
sfs_forward.fit(X_train, y_train)

# Integer positions of the selected columns.
selected_features = sfs_forward.get_support(indices=True)

# X_train / X_test are DataFrames, so `X[:, idx]` is not valid indexing.
# Let the fitted selector pick the columns instead.
svm_model.fit(sfs_forward.transform(X_train), y_train)
forward_selection_pred = svm_model.predict(sfs_forward.transform(X_test))
results.append(forward_selection_pred)

In [None]:
# Backward sequential feature selection with the SVM, keeping 3 features.
sfs_backward = SFS(svm_model, n_features_to_select=3, direction='backward', scoring='accuracy')
sfs_backward.fit(X_train, y_train)

# Read the support from the backward selector. The previous version read it
# from `sfs_forward`, so this cell re-evaluated the forward selection.
selected_features = sfs_backward.get_support(indices=True)

# X_train / X_test are DataFrames, so `X[:, idx]` is not valid indexing.
# Let the fitted selector pick the columns instead.
svm_model.fit(sfs_backward.transform(X_train), y_train)
backward_selection_pred = svm_model.predict(sfs_backward.transform(X_test))
results.append(backward_selection_pred)

In [10]:
# Print the held-out accuracy of every stored prediction array, in order.
for predicted in results:
  score = metrics(y_test, predicted)
  acc = score.simple_acc()
  print(acc)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform, randint
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier

# result = {}
# result2 = {}

# svm_model = SVC()
# rf_model = RandomForestClassifier()

# params_grid_svm = {'kernel':('linear', 'rbf', 'poly'), 'C':[1, 5]}
# params_random_svm = {'C': uniform(loc=0, scale=5),
#               'kernel': ['linear', 'rbf', 'poly']}

# params_grid_rf = {
#     'n_estimators': [100, 300, 500],
#     'max_features': ['sqrt', 'log2'],
#     'max_depth': [10, 15, 20]
# }
# for i in range(10):
#   params_random_rf = {
#       'n_estimators': randint(150, 1000),
#   }
#   rnd = RandomizedSearchCV(rf_model, param_distributions=params_random_rf, n_iter=15)
#   rnd.fit(X_train, y_train)

#   result[i] = rnd.best_params_
#   result2[i] = rnd.best_score_

# params_grid_mlp = {'hidden_layer_sizes': [(10,), (50,), (100,)],
#                    'activation': ['logistic', 'tanh', 'relu'],
#                    }
# params_random_mlp = {'hidden_layer_sizes': [(10,), (50,), (100,)],
#                      'activation': ['logistic', 'tanh', 'relu'],}


# grd = GridSearchCV(svm_model, params_grid_svm)
# grd.fit(X_train, y_train)

# grd = GridSearchCV(rf_model, params_grid_rf)
# grd.fit(X_train, y_train)

# rnd = RandomizedSearchCV(svm_model, param_distributions=params_random_svm, n_iter=10)
# rnd.fit(X_train, y_train)

# rnd = RandomizedSearchCV(rf_model, param_distributions=params_random_rf, n_iter=15)
# rnd.fit(X_train, y_train)

In [None]:
# print("Best params with Grid Search: ", grd.best_params_)
# print("Best score with Grid Search: ", grd.best_score_)

# print("Best params with Random Search: ", rnd.best_params_)
# print("Best score with Random Search: ", rnd.best_score_)

# for i in result:
#   print(result[i])
#   print(result2[i])