In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings
warnings.filterwarnings("ignore")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import make_scorer, recall_score, precision_score, accuracy_score,f1_score,confusion_matrix
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
%matplotlib inline
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the path of every file found under the Kaggle input directory.
# The absolute path is used so the listing does not depend on the current
# working directory (which is /kaggle/working/ on Kaggle); the other cells of
# this notebook also address the data through /kaggle/input/.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load data

In [None]:
# Load the competition training set into `train_df` and show it.
# Kaggle dataset locations:
#   train: /kaggle/input/icr-identify-age-related-conditions/train.csv
#   test:  /kaggle/input/icr-identify-age-related-conditions/test.csv
# NOTE(review): the relative '../input/...' path below only resolves when the
# working directory sits next to the input directory — confirm.
train_df = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
display(train_df)


In [None]:
# First look at the training data: summary statistics, dtypes and missing values.
print("describe")
display(train_df.describe())
print("info")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would add a stray "None" to the output.
train_df.info()
# Show the number of missing values for every column that has at least one.
print("number of missing values")
# isna().sum() already yields integer counts, so no rounding is needed.
n_missing_values = train_df.isna().sum()
n_missing_values[n_missing_values > 0]

In [None]:
# Columns treated as having missing values
# (presumably copied from the missing-value counts printed earlier — verify).
col_with_miss_val = [
'BQ', 'CB', 'CC' ,'DU' ,'EL', 'FC' ,'FL' ,'FS' ,'GL'
]
# Sub-frame of train_df restricted to those columns.
df_missing_val = train_df[col_with_miss_val]
# Columns that are handled separately from the float64 features.
not_float_cols = ['Class','EJ']
# Names of every float64 column of train_df, in frame order.
float_cols = train_df.select_dtypes(include=['float64']).columns.tolist()



In [None]:
def plot_distribution(df, columns, max_cols=5, figsize=None):
  """Draw one histogram per requested column on a grid of subplots.

  The grid is computed from the number of columns, so any list of column
  names can be plotted. A 55-column list yields an 11x5 grid at 10x20 inches
  and a 2-column list a 1x2 grid at 10x4 inches.

  Args:
    df: Object indexable by column name (e.g. a DataFrame) holding the values.
    columns: Names of the columns to plot, in display order.
    max_cols: Maximum number of subplots per row.
    figsize: Optional (width, height) in inches. When omitted, the width is 10
      and the height scales with the number of rows, never dropping below 4.

  Returns:
    None. The figure is shown with plt.show().
  """
  columns = list(columns)
  if not columns:
    # Nothing to draw; avoid opening an empty figure.
    return

  cols = max(1, min(max_cols, len(columns)))
  rows = (len(columns) + cols - 1) // cols  # ceiling division
  if figsize is None:
    # 20 inches for 11 rows, with a 4 inch floor for very short grids.
    figsize = (10, max(4, 20 * rows / 11))

  plt.figure(figsize=figsize)
  for idx, feature in enumerate(columns):
    # Subplot positions are 1-based.
    ax = plt.subplot(rows, cols, idx + 1)
    ax.hist(df[feature])
    ax.set_title(feature)
    ax.set_ylabel("Count")

  plt.tight_layout()
  plt.show()


In [None]:
# Histograms of the float64 features, then of the columns in not_float_cols.
plot_distribution(train_df,columns=float_cols)
plot_distribution(train_df,columns=not_float_cols)


In [None]:
# Show the column labels of train_df.
train_df.columns

# Preprocessing

In [None]:
def data_imputation(df, target):
  """Fill missing values in `df` with IterativeImputer and re-attach labels.

  Args:
    df: Numeric feature frame to impute; it must contain a column named 'EL'.
    target: Class labels, stored in the result as column 'Class'
      (aligned on the index when a Series is given).

  Returns:
    A new DataFrame with the imputed features under their original column
    labels and row index, plus 'Class' and 'Id', with 'EJ' placed immediately
    before 'EL'.

  Note:
    'Id' and 'EJ' are read from the module-level `train_df`, so that frame must
    share its index with `df`.
  """
  imputer = IterativeImputer(max_iter=10, random_state=42)
  # Keep the caller's column labels and row index. Labelling the result with a
  # global column list and a fresh 0..n-1 index would mislabel other column
  # subsets and turn the aligned assignments below into NaN for any frame
  # whose index is not the default one.
  df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
  df_imputed['Class'] = target
  df_imputed['Id'] = train_df['Id']
  df_imputed['EJ'] = train_df['EJ']
  # Move EJ (appended last above) to the slot immediately before EL.
  position = df_imputed.columns.get_loc('EL')
  df_imputed.insert(position, 'EJ', df_imputed.pop('EJ'))
  return df_imputed

In [None]:
# Impute the float64 features and rebind `train_df` to the returned frame.
train_df = data_imputation(df= train_df.loc[:,float_cols], target =train_df['Class'] )
# check the number of missing values
train_df.info()

In [None]:
# Show the column labels of train_df after imputation.
train_df.columns

## Feature engineering

In [None]:
from imblearn.over_sampling import RandomOverSampler
def feature_engineering(df, oversample_ej=True):
  """Encode `EJ` as 0/1 and optionally balance the two `EJ` groups.

  Args:
    df: Frame containing 'Id', 'Class', 'EJ' (values 'A'/'B'), 'EL' and the
      remaining feature columns. The frame itself is not modified.
    oversample_ej: When True (the default), rows are duplicated with
      RandomOverSampler(random_state=42) using `EJ` as the resampling target.
      When False, the rows are returned as they are.

  Returns:
    A DataFrame without 'Id', where 'EJ' is an integer column placed
    immediately before 'EL' and 'Class' is the last column.

  NOTE(review): duplicating rows here happens before any train/test split, so
  copies of the same sample can land on both sides of a later split and
  inflate the evaluation scores. Pass oversample_ej=False and resample only
  the training portion to avoid that.
  """
  target = df['Class']

  # convert categorical cols to binary
  ej_encoded = df['EJ'].map({'A': 0, 'B': 1}).astype(int)

  engineered = df.drop(columns=['Id','Class','EJ'])
  engineered['Class'] = target
  engineered['EJ'] = ej_encoded
  print(engineered.shape)

  if oversample_ej:
    # EJ plays the role of the "target" for the sampler; Class travels along
    # with the features so duplicated rows keep their label.
    over_sampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = over_sampler.fit_resample(
        engineered.drop('EJ', axis=1), engineered['EJ'])
    print(X_resampled.shape)
    engineered = X_resampled
    engineered['EJ'] = y_resampled

  # Move EJ (currently the last column) to the slot immediately before EL.
  ej_column = engineered.pop('EJ')
  engineered.insert(engineered.columns.get_loc('EL'), 'EJ', ej_column)

  return engineered

In [None]:
# Run feature engineering on a deep copy of train_df and show the result.
train_copy_df = train_df.copy(deep=True)
df = feature_engineering(train_copy_df)
df


In [None]:
# Number of rows per Class value in the engineered frame.
df['Class'].value_counts()

In [None]:
# Number of rows per EJ value in the engineered frame.
df['EJ'].value_counts()

In [None]:
# Show the column labels of the engineered frame.
df.columns

In [None]:
# Split the engineered frame into train and test sets (test_size=0.2),
# stratified on Class. No separate validation set is created here.
# NOTE(review): `df` comes from feature_engineering, which duplicates rows
# before this split — the same sample may appear in both train and test.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Class']),df["Class"],test_size=0.2,random_state=42,stratify=df["Class"])
display(X_train.columns)
# Disabled validation split, kept for reference:
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

def rf_training(X_train, y_train):
  """Fit a random forest on a SMOTE-resampled copy of the training data.

  Args:
    X_train: Training features.
    y_train: Training labels.

  Returns:
    The fitted RandomForestClassifier.
  """
  # Handle class imbalance by oversampling the training set with SMOTE.
  X_balanced, y_balanced = SMOTE(random_state=42).fit_resample(X_train, y_train)

  # Fixed hyperparameters; no cross-validation is performed here.
  forest = RandomForestClassifier(
      n_estimators=300,
      max_depth=10,
      min_samples_split=2,
      min_samples_leaf=1,
      class_weight='balanced',
      random_state=42,
  )
  forest.fit(X_balanced, y_balanced)
  return forest
def rf_evaluation(model,X_test, y_test):
  """Print recall, precision, accuracy, F1 and the confusion matrix.

  Args:
    model: Fitted estimator exposing `predict`.
    X_test: Features to predict on.
    y_test: True labels for `X_test`.

  Returns:
    None. The metrics are written to stdout only.
  """
  predicted = model.predict(X_test)

  # Every metric is computed before anything is printed, so a failing metric
  # produces no partial report.
  report = [
      ("Test Recall:", recall_score(y_test, predicted)),
      ("Test Precision:", precision_score(y_test, predicted)),
      ("Test accuracy:", accuracy_score(y_test, predicted)),
      ("Test f1_score:", f1_score(y_test, predicted)),
      ("Test confusion_Matrix:\n", confusion_matrix(y_test, predicted)),
  ]
  for label, value in report:
    print(label, value)

In [None]:
# Train the baseline random forest and print its metrics on the held-out test split.
rf_classifier = rf_training(X_train , y_train)
rf_evaluation(rf_classifier,X_test, y_test)

# Hyperparameter tuning

In [None]:
# Create a Random Forest classifier.
# The seed is fixed on the estimator instead of being searched over: a seed is
# not a model hyperparameter, and putting it in the grid only fits the search
# to noise while multiplying the number of candidate fits.
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [200, 300],          # Number of trees in the forest
    'max_depth': [10],                   # Maximum depth of the tree
    # Minimum number of samples required to split an internal node.
    # scikit-learn requires an integer value to be at least 2, so 1 is not a
    # valid candidate (every fit using it fails and is scored as NaN).
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],       # Minimum number of samples required at a leaf node
    'criterion': ['gini', 'entropy'],
}

# Initialize GridSearchCV with the Random Forest model and hyperparameter grid;
# candidates are ranked by recall and the best one is refit on all the data.
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='recall', refit='recall')

# Oversample the training split with SMOTE before the search.
# NOTE(review): resampling before cross-validation puts synthetic samples in
# the validation folds, so the CV recall is optimistic; wrapping SMOTE and the
# classifier in an imblearn Pipeline would resample inside each fold instead.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit the GridSearchCV on the training data
grid_search.fit(X_train_resampled, y_train_resampled)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
rf_evaluation(best_model,X_test, y_test)

In [None]:
# Show the hyperparameter combination selected by the grid search.
best_params

# Test & Submission

In [None]:
# Load the test set and encode EJ with the same A -> 0 / B -> 1 mapping that
# feature_engineering applies to the training data.
# NOTE(review): unlike the training data, the test features are not imputed
# here — confirm the model tolerates missing values, or impute them first.
test_df =  pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
test_df['EJ'] = test_df['EJ'].map({'A': 0, 'B': 1}).astype(int)
# Drop the identifier so only the model inputs remain.
features = test_df.drop(['Id'], axis=1)
features

In [None]:
# Per-class probabilities for the test features from the grid-search model.
# NOTE: this rebinds `y_test_pred`, a name also used locally inside rf_evaluation.
y_test_pred = best_model.predict_proba(features)
y_test_pred

In [None]:
# Assemble the prediction table: the test Id followed by one probability
# column per class. Column 0 of the predict_proba output becomes class_0 and
# column 1 becomes class_1.
predictions = pd.DataFrame({
    'Id': test_df['Id'],
    'class_0': y_test_pred[:, 0],
    'class_1': y_test_pred[:, 1],
})

In [None]:
# Load the submission template and overwrite its probability columns.
# NOTE(review): the assignment aligns rows by index, not by Id — this assumes
# the template lists the rows in the same order as test.csv; confirm.
sample_submission = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")
sample_submission[['class_0', 'class_1']] = predictions[['class_0', 'class_1']]
sample_submission

In [None]:
# Write the submission file without the index column, then read it back to
# show what was saved.
sample_submission.to_csv('/kaggle/working/submission.csv', index=False)
pd.read_csv("/kaggle/working/submission.csv")