In [None]:
import os
import random

import numpy as np
import pandas as pd
from deap import base, creator, tools, algorithms
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Step 1: Load datasets and perform EDA on 'application_record' and 'credit_record'
def perform_eda(df, name):
    """Print a plain-text exploratory summary of ``df`` and return it unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is only read, never modified.
    name : str
        Label used in the report header.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, so the call can be chained
        directly onto the statement that loads the data.
    """
    print(f"\nEDA for {name}:")
    print(f"Dataset Shape: {df.shape}")
    print(f"Dataset Columns: {df.columns}")

    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() adds a stray "None" line.
    print("\nData Types and Non-Null Counts:")
    df.info()

    # Missing values per column
    print("\nMissing Values:")
    print(df.isnull().sum())

    # Basic statistics for numeric columns
    print("\nSummary Statistics:")
    print(df.describe())

    # Number of distinct values per column
    print("\nUnique Counts for Each Column:")
    print(df.nunique())

    # First and last rows
    print("\nFirst 5 rows (df.head()):")
    print(df.head())

    print("\nLast 5 rows (df.tail()):")
    print(df.tail())

    # Value distribution of the occupation column, including missing entries
    if 'OCCUPATION_TYPE' in df.columns:
        print("\nUnique values in 'OCCUPATION_TYPE':")
        print(df['OCCUPATION_TYPE'].value_counts(dropna=False))

    return df

# Load each raw CSV and print its EDA report; perform_eda hands the frame back
# unchanged, so the loaded frame is bound in the same statement.
df1 = perform_eda(pd.read_csv('application_record.csv'), 'application_record')
df2 = perform_eda(pd.read_csv('credit_record.csv'), 'credit_record')

# Step 2: Merge datasets (done below, once STATUS has been converted to a binary label)

def _print_status_classes(frame):
    """Print how many distinct STATUS values ``frame`` holds and what they are."""
    print('----------------------------------------------------------------------------\n')
    print("Number of classes:", frame["STATUS"].nunique())
    print("Actual classes:", frame["STATUS"].unique())


# Raw status codes as loaded from credit_record.csv
_print_status_classes(df2)

# Step 5: Map 'STATUS' to binary labels
# 'C' and 'X' become class 0; the digit codes '0'..'5' become class 1.
status_mapping = {
    **dict.fromkeys(['C', 'X'], 0),
    **dict.fromkeys(['0', '1', '2', '3', '4', '5'], 1),
}
df2["STATUS"] = df2["STATUS"].map(status_mapping)
_print_status_classes(df2)

# Inner join on ID: only IDs present in both tables are kept.
final_df = pd.merge(df1, df2, on="ID", how="inner")
_print_status_classes(final_df)

# Step 3: Encode categorical features
# One LabelEncoder instance is re-fitted per column; each column's integer codes
# are written to a new "<column>_Encoded" column.
labelEncoder = LabelEncoder()
dataToEncode = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
]

# Columns from the list that actually exist; these are the ones dropped afterwards.
_present_categoricals = [name for name in dataToEncode if name in final_df.columns]

for label in dataToEncode:
    if label not in final_df.columns:
        print(f"Column '{label}' not found in the dataset.")
        continue
    final_df[label + "_Encoded"] = labelEncoder.fit_transform(final_df[label])

# Drop original categorical columns after encoding
final_df.drop(columns=_present_categoricals, inplace=True)

# Step 4: Move 'STATUS' column to the end
# pop() removes the column; assigning it back appends it as the last column.
if 'STATUS' in final_df.columns:
    column_to_move = 'STATUS'
    last_column = final_df.pop(column_to_move)
    final_df[column_to_move] = last_column







# Target sanity check before cleaning
print("******************************************************************")
print("FINAL DF IS NAN",final_df["STATUS"].isna().sum())
print("FINAL DF IS NULL",final_df["STATUS"].isnull().sum())
print("******************************************************************")


# Step 6: Check and clean data
print(f"Initial number of duplicates: {final_df.duplicated().sum()}")
print(f"Initial number of missing values: {final_df.isnull().sum().sum()}")

# A row without a label cannot be used for supervised training. Drop it here
# rather than filling the target with a sentinel or an invented class later.
final_df = final_df.dropna(subset=["STATUS"])

# Drop duplicates
final_df = final_df.drop_duplicates()

# Fill remaining missing (feature) values with -1
final_df = final_df.fillna(-1)

print(f"After cleaning, number of duplicates: {final_df.duplicated().sum()}")
print(f"After cleaning, number of missing values: {final_df.isnull().sum().sum()}")


print("******************************************************************")
print("FINAL DF IS NAN",final_df["STATUS"].isna().sum())
print("FINAL DF IS NULL",final_df["STATUS"].isnull().sum())
print("******************************************************************")


# Identify garbage values and handle them.
# Only columns that can never be negative (counts and amounts) are checked.
# Applying the rule to every numeric column is destructive: a column whose values
# are all negative has no valid median, so it would be overwritten with NaN and
# later dropped by SimpleImputer.
# NOTE(review): DAYS_BIRTH, DAYS_EMPLOYED and MONTHS_BALANCE are presumably offsets
# counted backwards from the snapshot date, i.e. legitimately <= 0 -- confirm
# against the dataset's data dictionary.
NON_NEGATIVE_COLUMNS = ["CNT_CHILDREN", "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"]

for column in NON_NEGATIVE_COLUMNS:
    if column not in final_df.columns or not pd.api.types.is_numeric_dtype(final_df[column]):
        continue
    negative_mask = final_df[column] < 0
    if not negative_mask.any():
        continue
    median_value = final_df.loc[~negative_mask, column].median()  # Median of valid values
    if pd.isna(median_value):
        # No valid value to fall back on; leave the column alone instead of blanking it.
        print(f"Column '{column}' has no non-negative values; leaving it unchanged.")
        continue
    print(f"Found negative values in column '{column}'. Replacing with median.")
    final_df[column] = final_df[column].mask(negative_mask, median_value)

# Report the remaining raw categorical columns, if any. The raw columns are dropped
# after label encoding, so this normally prints nothing. The former step that
# rewrote OCCUPATION_TYPE against a placeholder whitelist
# ('high_skill', 'low_skill', 'other') was removed: it would have collapsed every
# value outside that whitelist to 'other'.
categorical_columns = [
    "CODE_GENDER", "FLAG_OWN_CAR", "FLAG_OWN_REALTY", "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE", "NAME_FAMILY_STATUS", "NAME_HOUSING_TYPE", "OCCUPATION_TYPE"
]

for col in categorical_columns:
    if col in final_df.columns:
        print(f"Unique values in column '{col}': {final_df[col].unique()}")

# After cleaning, print summary
print(f"After cleaning, number of records: {final_df.shape[0]}")

# Step 7: Save the cleaned dataset
final_df.to_csv("mergedDataset.csv", index=False)
print("Cleaned dataset saved to 'mergedDataset.csv'")

# Step 8: Build the feature matrix / target and impute any remaining gaps
# ID is a row identifier, not a predictor, and the feature names used in the next
# cell do not include it, so it is excluded from X.
X = final_df.drop(columns=["ID", "STATUS"])
# Rows with a missing label were dropped above, so the target is a clean integer
# column; no label is fabricated here.
y = final_df["STATUS"].astype(int)

print("***********************************************\n" , y.isnull().sum() )
print("Number of classes:", y.nunique())
print("Actual classes:", y.unique())

# Impute missing values in X (features). After fillna(-1) above this is a safety net.
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'
X_imputed = imputer.fit_transform(X)

# SimpleImputer drops columns that are entirely NaN by default; fail loudly if that
# happens so column labels can never silently shift.
assert X_imputed.shape[1] == X.shape[1], (
    f"Imputer returned {X_imputed.shape[1]} columns for {X.shape[1]} input features"
)

print(f"Missing values after imputation: {pd.isnull(X_imputed).sum()}")

# Step 9: Apply SMOTE to balance the dataset
# NOTE(review): resampling happens before the train/validation/test split in the
# next cell, so synthetic points built from held-out rows can land in the training
# set. Consider splitting first and resampling only the training portion.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_imputed, y)


print(X.columns)


EDA for application_record:
Dataset Shape: (438557, 18)
Dataset Columns: Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')

Data Types and Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAM



Missing values after imputation: 0
Index(['ID', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'CNT_FAM_MEMBERS', 'MONTHS_BALANCE', 'CODE_GENDER_Encoded',
       'FLAG_OWN_CAR_Encoded', 'FLAG_OWN_REALTY_Encoded',
       'NAME_INCOME_TYPE_Encoded', 'NAME_EDUCATION_TYPE_Encoded',
       'NAME_FAMILY_STATUS_Encoded', 'NAME_HOUSING_TYPE_Encoded',
       'OCCUPATION_TYPE_Encoded'],
      dtype='object')


In [None]:
# Step 10: Split data into Train (70%), Validation (15%), and Test (15%)
# The second split is taken from the remaining 85 %, hence 0.1765 (~ 0.15 / 0.85).
X_train_full, X_test, y_train_full, y_test = train_test_split(X_resampled, y_resampled, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1765, random_state=42)


# Column labels for the (unlabelled) arrays produced by imputation and resampling.
# Prefer the names reported by the fitted imputer: it knows exactly which input
# columns survived (SimpleImputer drops all-NaN columns by default), so labels
# cannot drift out of step with the data. The hard-coded list is only a fallback
# for scikit-learn versions without SimpleImputer.get_feature_names_out().
_fallback_columns = ['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
       'CNT_FAM_MEMBERS', 'MONTHS_BALANCE', 'CODE_GENDER_Encoded',
       'FLAG_OWN_CAR_Encoded', 'FLAG_OWN_REALTY_Encoded',
       'NAME_INCOME_TYPE_Encoded', 'NAME_EDUCATION_TYPE_Encoded',
       'NAME_FAMILY_STATUS_Encoded', 'NAME_HOUSING_TYPE_Encoded',
       'OCCUPATION_TYPE_Encoded']

if hasattr(imputer, "get_feature_names_out"):
    originalDfColumns = list(imputer.get_feature_names_out())
else:
    originalDfColumns = _fallback_columns

# A matching length is necessary (not sufficient) for the labels to be right.
assert len(originalDfColumns) == X_train.shape[1], (
    f"{len(originalDfColumns)} column names for {X_train.shape[1]} feature columns"
)


# Scale numerical data: statistics are fitted on the training split only and then
# applied unchanged to the validation and test splits.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=originalDfColumns)
X_val = pd.DataFrame(scaler.transform(X_val), columns=originalDfColumns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=originalDfColumns)

# Step 11: Genetic Algorithm for Feature Selection
# creator.create() adds the new class to the deap.creator module itself. Guard the
# calls so re-running this cell does not redefine the classes (DEAP emits a
# RuntimeWarning and overwrites them when they already exist).
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # single objective, maximised
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# An individual is a 0/1 list with one entry per column of X_train
# (1 = feature selected).
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=len(X_train.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evaluate(individual):
    """Score a feature mask by validation accuracy of a decision tree.

    ``individual`` is a sequence of 0/1 flags aligned with ``X_train.columns``.
    A tree is fitted on the flagged columns of the module-level ``X_train`` /
    ``y_train`` and scored on ``X_val`` / ``y_val``.

    Returns a one-element tuple, the shape DEAP expects for a single-objective
    fitness. A mask that selects no column scores 0.
    """
    chosen = [name for name, flag in zip(X_train.columns, individual) if flag == 1]
    if not chosen:
        return 0,

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train[chosen], y_train)
    val_accuracy = accuracy_score(y_val, tree.predict(X_val[chosen]))
    return (val_accuracy,)  # Fitness is validation accuracy

toolbox.register("evaluate", evaluate)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)

# Step 12: Optimize with Genetic Algorithm
# Seed both generators so the search is reproducible: the initial population is
# drawn with np.random, while DEAP's selection/crossover/mutation operators use
# Python's `random` module.
GA_SEED = 42
random.seed(GA_SEED)
np.random.seed(GA_SEED)

population = toolbox.population(n=20)
ngen, cxpb, mutpb = 10, 0.7, 0.3

# eaSimple replaces the whole population each generation (no elitism), so the best
# mask ever evaluated may be missing from the final generation. The hall of fame
# keeps a copy of the best individual seen across all generations.
hall_of_fame = tools.HallOfFame(1)
result = algorithms.eaSimple(
    population, toolbox, cxpb=cxpb, mutpb=mutpb, ngen=ngen,
    halloffame=hall_of_fame, verbose=True,
)

# Step 13: Extract the best features
best_individual = hall_of_fame[0]
selected_features = [feature for feature, select in zip(X_train.columns, best_individual) if select == 1]
print("\nSelected Features:", selected_features)

# Step 14: Hyperparameter Tuning with RandomizedSearchCV (performed in the next cell)




gen	nevals
0  	20    
1  	15    
2  	17    
3  	16    
4  	19    
5  	13    
6  	15    
7  	16    
8  	15    
9  	15    
10 	14    

Selected Features: ['CNT_CHILDREN', 'DAYS_BIRTH', 'FLAG_WORK_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'FLAG_OWN_REALTY_Encoded', 'NAME_INCOME_TYPE_Encoded', 'NAME_EDUCATION_TYPE_Encoded', 'NAME_HOUSING_TYPE_Encoded', 'OCCUPATION_TYPE_Encoded']

Tuning hyperparameters for KNN...




KNN Random Search Best Parameters: {'weights': 'distance', 'n_neighbors': 4, 'metric': 'euclidean'}
KNN Random Search Best Score: 0.75

KNN Test Accuracy with Best Model: 0.75
              precision    recall  f1-score   support

           0       0.71      0.85      0.77     71147
           1       0.81      0.65      0.72     71499

    accuracy                           0.75    142646
   macro avg       0.76      0.75      0.75    142646
weighted avg       0.76      0.75      0.75    142646



In [None]:
# Candidate hyperparameter values sampled by the randomized search, one grid per
# model family. Keyword names are the estimator constructor arguments.

# Decision tree: depth limit and minimum samples per split / leaf
param_dist_dt = dict(
    max_depth=[3, 5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# k-nearest neighbours: neighbourhood size, vote weighting, distance metric
param_dist_knn = dict(
    n_neighbors=[3, 4, 5, 9],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

# Multi-layer perceptron: architecture, activation, optimiser, learning-rate schedule
param_dist_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=["relu", "tanh"],
    solver=["adam", "sgd"],
    learning_rate=["constant", "adaptive"],
)

# Model name -> (unfitted estimator, hyperparameter candidates)
models = {
    "KNN": (KNeighborsClassifier(), param_dist_knn),
    "DecisionTree": (DecisionTreeClassifier(random_state=42), param_dist_dt),
    "MLP": (MLPClassifier(random_state=42, max_iter=300), param_dist_mlp)
}

# Keep every model's tuned estimator and scores. Without this, each iteration
# overwrites the previous result and comparing models later means repeating the
# whole search.
# NOTE(review): the recorded output of this cell ends in a KeyboardInterrupt during
# the KNN search; consider tuning on a subsample if the full search is too slow.
tuning_results = {}

for model_name, (model, param_dist) in models.items():
    print(f"\nTuning hyperparameters for {model_name}...")

    # 10 sampled configurations, 3-fold CV on the training split, GA-selected features only
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=3, random_state=42, scoring="accuracy", n_jobs=-1)
    random_search.fit(X_train[selected_features], y_train)

    print(f"{model_name} Random Search Best Parameters: {random_search.best_params_}")
    print(f"{model_name} Random Search Best Score: {random_search.best_score_:.2f}")

    # Evaluate on test set using the best RandomizedSearchCV model
    # (best_estimator_ is refitted on the full training split).
    best_model = random_search.best_estimator_
    y_test_pred = best_model.predict(X_test[selected_features])
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"\n{model_name} Test Accuracy with Best Model: {test_accuracy:.2f}")
    print(classification_report(y_test, y_test_pred))

    tuning_results[model_name] = {
        "best_estimator": best_model,
        "best_params": random_search.best_params_,
        "cv_accuracy": random_search.best_score_,
        "test_accuracy": test_accuracy,
    }


Tuning hyperparameters for KNN...


KeyboardInterrupt: 