In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [81]:
df = pd.read_csv("../datasets/train_radiomics_hipocamp.csv")

In [74]:
# Contagem total de valores nulos no dataset
total_nulls = df.isnull().sum().sum()

# Contagem de colunas com valores nulos
columns_with_nulls_count = (df.isnull().sum() > 0).sum()

# Exibição
print(f"Total de valores nulos: {total_nulls}")
print(f"Colunas com valores nulos: {columns_with_nulls_count}")


Total de valores nulos: 0
Colunas com valores nulos: 0


In [43]:
# Definir os limites das faixas etárias com base nos quartis
bins = [55.3, 71.3, 75.0, 79.9, 91.0]
labels = [1, 2, 3, 4]  # Valores ordinais para cada faixa

# Criar a nova feature numérica com as faixas etárias
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=False, right=False)
df = df.drop(['Age'], axis=1)     

In [18]:
df['Age_Group'].value_counts()

Age_Group
2.0    77
3.0    76
0.0    76
1.0    75
Name: count, dtype: int64

In [40]:
df = df.loc[df['Transition'] != 'CN-MCI']

In [10]:
df['Transition'].value_counts()

Transition
CN-CN      96
MCI-MCI    71
MCI-AD     68
AD-AD      60
CN-MCI     10
Name: count, dtype: int64

In [26]:
df['Transition'] = df['Transition'].map({
    'CN-CN': 0,
    'CN-MCI': 1,
    'MCI-MCI': 2,
    'MCI-AD': 3,
    'AD-AD': 4
})

In [82]:
X = df.drop(['Transition'], axis=1)     
y = df['Transition'].to_frame()         

In [84]:
df_numerico = X.select_dtypes(include=[np.number])

In [85]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler
scaler = MinMaxScaler()
df_numerico = pd.DataFrame(scaler.fit_transform(df_numerico), columns=df_numerico.columns, index=df_numerico.index) #normalização

In [87]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=2022)
df_numerico, y = smote.fit_resample(df_numerico, y)

In [47]:
from sklearn.ensemble import RandomForestClassifier

# Treinar um modelo Random Forest
rf = RandomForestClassifier(
    n_estimators=500,               # Mais árvores para maior estabilidade
    max_depth=15,                   # Controla a complexidade, evitando overfitting
    min_samples_split=5,            # Divide nós apenas se houver 5 ou mais amostras
    min_samples_leaf=2,             # Cada folha deve ter pelo menos 2 amostras
    max_features='sqrt',            # Usa a raiz quadrada do número de features
    class_weight='balanced_subsample',  # Dá peso maior às classes minoritárias em cada árvore
    bootstrap=True,                 # Amostragem com reposição para robustez
    random_state=2022,                # Reprodutibilidade
)
rf.fit(df_numerico, y)

  return fit_method(estimator, *args, **kwargs)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Treinar um modelo Random Forest
model = RandomForestClassifier(random_state=2022)
model.fit(df_numerico, y)

# Obter as importâncias das features
feature_importances = pd.DataFrame({
    'Feature': df_numerico.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Mostrar as 10 features mais importantes
#print(feature_importances.head(10))

# Selecionar as n melhores features
n = 500
features_to_keep = feature_importances.iloc[:n]['Feature']

# Filtrar o DataFrame original para manter somente essas features
df_numerico = df_numerico[features_to_keep]

  return fit_method(estimator, *args, **kwargs)


In [8]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=6, random_state=2022)
dt.fit(df_numerico, y)

In [88]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=2022, solver='newton-cg')
lr.fit(df_numerico, y)

  y = column_or_1d(y, warn=True)


In [9]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(
    solver='saga',
    penalty='elasticnet',
    l1_ratio=0.3,  # Balanço entre L1 e L2
    C=0.5,         # Regularização moderada
    random_state=2022
)
model.fit(df_numerico, y)

  y = column_or_1d(y, warn=True)


In [89]:
X_test = pd.read_csv("../datasets/test_radiomics_hipocamp.csv")
X_test = X_test.select_dtypes(include=[np.number])
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)
#X_test = X_test[features_to_keep]

In [49]:
# Definir os limites das faixas etárias com base nos quartis
bins = [55.3, 71.3, 75.0, 79.9, 91.0]
labels = [1, 2, 3, 4]  # Valores ordinais para cada faixa

# Criar a nova feature numérica com as faixas etárias
X_test['Age_Group'] = pd.cut(X_test['Age'], bins=bins, labels=False, right=False)
X_test = X_test.drop(['Age'], axis=1)     

In [44]:
from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=2022)
model.fit(df_numerico, y)

ValueError: could not convert string to float: 'CN-CN'

In [11]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=100, random_state=2022)
svm.fit(df_numerico, y)

  y = column_or_1d(y, warn=True)


In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

dt_model = DecisionTreeClassifier(max_depth=6, random_state=2022)
dt_model.fit(df_numerico, y)

sss = StratifiedShuffleSplit(n_splits=10, test_size=20, random_state=2022)
bg_model = BaggingClassifier(estimator=dt_model, bootstrap=True)
n_estimators = [10, 40, 60, 80, 100, 160]
parameters = {'n_estimators': n_estimators}
grid_bg = GridSearchCV(estimator= bg_model, param_grid= parameters, cv= sss)
grid_bg.fit(df_numerico, y)

model = grid_bg.best_estimator_
print(model)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=6,
                                                   random_state=2022),
                  n_estimators=160)


In [12]:
from sklearn.ensemble import StackingClassifier
estimators = [("dt", dt), ("svm", svm), ("rf", rf)]
st_model = StackingClassifier(estimators=estimators, final_estimator= LogisticRegression(random_state=2022))
st_model.fit(df_numerico, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model.fit(df_numerico, y)

In [90]:
predictions = lr.predict(X_test)
predictions

array(['MCI-AD', 'MCI-MCI', 'AD-AD', 'AD-AD', 'CN-CN', 'AD-AD', 'MCI-AD',
       'CN-CN', 'AD-AD', 'AD-AD', 'MCI-AD', 'AD-AD', 'MCI-AD', 'MCI-AD',
       'AD-AD', 'MCI-MCI', 'MCI-AD', 'AD-AD', 'AD-AD', 'AD-AD', 'CN-CN',
       'AD-AD', 'MCI-AD', 'MCI-MCI', 'MCI-AD', 'MCI-MCI', 'AD-AD',
       'AD-AD', 'CN-MCI', 'CN-MCI', 'CN-MCI', 'MCI-MCI', 'MCI-AD',
       'CN-MCI', 'MCI-AD', 'MCI-AD', 'MCI-AD', 'MCI-AD', 'MCI-MCI',
       'MCI-MCI', 'CN-MCI', 'CN-CN', 'AD-AD', 'CN-MCI', 'CN-MCI', 'AD-AD',
       'CN-CN', 'CN-CN', 'AD-AD', 'MCI-MCI', 'MCI-MCI', 'MCI-MCI',
       'CN-MCI', 'AD-AD', 'MCI-MCI', 'AD-AD', 'CN-CN', 'CN-CN', 'AD-AD',
       'CN-CN', 'CN-CN', 'MCI-AD', 'CN-MCI', 'MCI-AD', 'MCI-AD', 'CN-MCI',
       'AD-AD', 'CN-MCI', 'AD-AD', 'CN-MCI', 'CN-CN', 'MCI-AD', 'CN-CN',
       'AD-AD', 'AD-AD', 'MCI-MCI', 'AD-AD', 'MCI-AD', 'MCI-MCI',
       'CN-MCI', 'CN-MCI', 'MCI-AD', 'AD-AD', 'CN-CN', 'AD-AD', 'MCI-AD',
       'AD-AD', 'AD-AD', 'MCI-MCI', 'CN-MCI', 'AD-AD', 'MCI-MCI',
       'M

In [32]:
predictions = [round(pred) for pred in predictions]

reverse_mapping = {
    0: 'CN-CN',
    1: 'CN-MCI',
    2: 'MCI-MCI',
    3: 'MCI-AD',
    4: 'AD-AD'
}

predictions = [reverse_mapping[pred] for pred in predictions]
predictions

['MCI-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'AD-AD',
 'CN-MCI',
 'MCI-AD',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-AD',
 'MCI-AD',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'CN-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-AD',
 'CN-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'CN-MCI',
 'AD-AD',
 'CN-CN',
 'CN-MCI',
 'MCI-AD',
 'MCI-MCI',
 'CN-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'MCI-AD',
 'MCI-MCI',
 'MCI-MCI',
 'MCI-MCI',
 'CN-MCI',
 'CN-MCI',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-AD',
 'MCI-MCI',
 'MCI-AD',
 'M

In [91]:
data = pd.DataFrame({
    'RowId': np.arange(1, len(predictions) + 1), 
    'Result': predictions
})

data.to_csv('lr_SMOTE.csv', index=False)