# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Read the cleaned dataset from disk
data = pd.read_csv('../cleaned_data/cleaned_data.csv')

# Show summary statistics first, then the first few rows
for preview in (data.describe(), data.head()):
    display(preview)


# Plot a histogram with a KDE overlay for every int/float feature except the
# target column 'Attrition_Flag'.
numeric_cols = data.drop(['Attrition_Flag'], axis=1).select_dtypes(include=['int','float']).columns

# Size the subplot grid from the number of features instead of assuming that
# there are at most 16: plt.subplot raises ValueError as soon as the subplot
# index exceeds rows * columns. A minimum of 4 rows is kept so the figure is
# laid out exactly as before whenever 16 or fewer features are present.
grid_cols = 4
grid_rows = max(4, (len(numeric_cols) + grid_cols - 1) // grid_cols)

# 4.5 units of height per row gives the original 18 x 18 figure for 4 rows
plt.figure(figsize=(18, 4.5 * grid_rows))
for i, col in enumerate(numeric_cols):
    ax = plt.subplot(grid_rows, grid_cols, i+1)  # One subplot per feature.
    sns.histplot(data=data, x=col, ax=ax, color='red', kde=True)
    ax.tick_params(axis='x', labelsize=14)
    ax.tick_params(axis='y', labelsize=14)
    ax.set_xlabel(col, fontsize=18)
    ax.set_ylabel('Count', fontsize=18)
# y slightly above 1 places the figure title just above the top row of subplots
plt.suptitle('Data distribution of continuous variables',fontsize=24, y=1.01)
plt.tight_layout()
plt.show()

# Heatmap of the pairwise correlations between the int/float columns
numeric_corr = data.select_dtypes(include=['int', 'float']).corr()

plt.figure(figsize=(15, 10))
sns.heatmap(
    numeric_corr,
    annot=True,               # write the coefficient inside every cell
    annot_kws={'size': 12},
    cmap='viridis',
    center=0,                 # centre the colour scale on zero correlation
)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=14)
plt.title("Feature correlation", fontsize=24, y=1.01)
plt.show()

# Count plots for the categorical features, with bars coloured by 'Attrition_Flag'
columns_to_visualize = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

plt.figure(figsize=(15, 12))
for position, column in enumerate(columns_to_visualize, start=1):
    # 2 x 3 grid; subplot positions are 1-based
    ax = plt.subplot(2, 3, position)
    sns.countplot(
        data=data,
        x=column,
        hue='Attrition_Flag',
        palette='viridis',
        legend=False,
        ax=ax,
    )
    ax.set_title(f'Distribution of {column} vs Attrition')
    # Slanted category labels so they do not run into each other
    plt.xticks(rotation=45)

# Re-pack the subplots so titles and tick labels do not overlap
plt.tight_layout()
plt.show()

# One-hot encode the categorical features (this is dummy encoding, not an
# ordinal mapping). The first level of every feature is dropped, and the
# source columns are replaced by their 0/1 integer indicator columns, which
# are appended after the remaining columns in the order listed here.
#
# Encoding all columns in a single get_dummies call prefixes each indicator
# with its source column name ("<column>_<level>"). Encoding each column
# separately without a prefix produced duplicate, ambiguous column names
# whenever two features shared a level name.
columns = ['Income_Category','Card_Category', 'Education_Level','Marital_Status']
data = pd.get_dummies(data, columns=columns, drop_first=True, dtype=int)

from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Build the target vector and the feature matrix as NumPy arrays
target_column = 'Attrition_Flag'
y = data[target_column].to_numpy()
X = data.drop(columns=target_column).to_numpy()

# Hold out a test split; only the seed is specified, so every other
# train_test_split setting is left at its default
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Initialize the base models.
# The target is handled as a binary 0/1 label elsewhere in this script, so the
# binary 'logloss' metric is used; 'mlogloss' is the multiclass variant.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Candidate models to compare: each base model on its own, plus a soft-voting
# ensemble built from the same two estimators.
models = [
    ('XGBoost', xgb_model),
    ('Random Forest', rf_model),
    ('Voting Classifier', VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model)], voting='soft'))
]

# Metrics gathered for every model, keyed by model name
model_comparison = {}

# For each candidate: fit on the training split, score on the test split,
# cross-validate on the training split, then print and record the metrics
for model_name, classifier in models:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    # Overall scores on the test split
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # 5-fold cross-validated accuracy on the training split
    cross_val_accuracy = cross_val_score(classifier, X_train, y_train, scoring="accuracy", cv=5)
    cv_accuracy = np.mean(cross_val_accuracy)
    cv_std = np.std(cross_val_accuracy)

    # Accuracy restricted to the test samples whose true label is 0 / 1
    is_class_0 = y_test == 0
    is_class_1 = y_test == 1
    accuracy_class_0 = accuracy_score(y_test[is_class_0], y_pred[is_class_0])
    accuracy_class_1 = accuracy_score(y_test[is_class_1], y_pred[is_class_1])

    # Report
    print(f"Model: {model_name}")
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print(f"Model F1-Score: {f1 * 100:.2f}%")
    print(f"Cross Val Accuracy: {cv_accuracy * 100:.2f}%")
    print(f"Cross Val Standard Deviation: {cv_std * 100:.2f}%")
    print(f"Accuracy for Class 0: {accuracy_class_0 * 100:.2f}%")
    print(f"Accuracy for Class 1: {accuracy_class_1 * 100:.2f}%")

    # Record the metrics; the key order here determines the column order of
    # any DataFrame built from model_comparison
    model_comparison[model_name] = {
        'accuracy': accuracy,
        'accuracy_class_0': accuracy_class_0,
        'accuracy_class_1': accuracy_class_1,
        'f1_score': f1,
        'cv_accuracy': cv_accuracy,
        'cv_std': cv_std
    }

    # Per-class precision / recall / F1 table, then a separator line
    print(classification_report(y_test, y_pred, zero_division=1))
    print("-" * 60)

# Model comparison

# Build the comparison table: one row per model, one column per metric.
# Columns are renamed by metric key rather than by position. The metrics are
# stored in the order accuracy, class-0, class-1, f1, cv accuracy, cv std, so
# assigning the display labels positionally would attach the wrong label to
# the first four metrics (and sort by accuracy instead of the F1 score).
metric_labels = {
    'f1_score': 'Model F1-Score',
    'accuracy': 'Model Accuracy',
    'accuracy_class_0': 'Model Accuracy-0',
    'accuracy_class_1': 'Model Accuracy-1',
    'cv_accuracy': 'CV Accuracy',
    'cv_std': 'CV std',
}
Model_com_df = pd.DataFrame(model_comparison).T.rename(columns=metric_labels)
# Present the columns in the intended display order
Model_com_df = Model_com_df[list(metric_labels.values())]
Model_com_df = Model_com_df.sort_values(by='Model F1-Score', ascending=False)
# Render every metric as a percentage with two decimals
Model_com_df.style.format("{:.2%}")


# Confusion matrix

from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes, normalize=False, cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as the true class and columns as
        the predicted class.
    classes : sequence
        Display names of the classes, used as the tick labels on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so each
        cell shows a fraction of that row.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap used for the image.

    The function only draws; it does not call ``plt.show()``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row that sums to zero would otherwise become NaN; dividing by 1
        # leaves such a row at 0.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Confusion matrix', fontsize=18)
    plt.colorbar()

    # Label the ticks with the class names instead of bare indices
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # The integer format only works for integer matrices; anything else
    # (normalised or float input) is shown with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than the midpoint get white text so the value stays readable
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center",
                 fontsize=16, color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=15)
    plt.xlabel('Predicted label', fontsize=15)
    # Adjust the layout last so the axis labels are taken into account
    plt.tight_layout()

# Row-normalised confusion matrix for the predictions currently held in
# y_pred, restricted to the two class ids below
class_ids = [0, 1]
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_ids)
plot_confusion_matrix(cm, ['y=0', 'y=1'], normalize=True)

# Comparison table: one row per model, one column per metric.
# Display labels are attached by metric key, not by position. The metric
# dictionaries are ordered accuracy, class-0, class-1, f1, cv accuracy, cv std,
# so positional labels would mislabel the first four columns and make the sort
# below use accuracy instead of the F1 score.
display_names = {
    'f1_score': 'Model F1-Score',
    'accuracy': 'Model Accuracy',
    'accuracy_class_0': 'Model Accuracy-0',
    'accuracy_class_1': 'Model Accuracy-1',
    'cv_accuracy': 'CV Accuracy',
    'cv_std': 'CV std',
}
Model_com_df = pd.DataFrame(model_comparison).T.rename(columns=display_names)
# Put the columns in the intended display order
Model_com_df = Model_com_df[list(display_names.values())]
Model_com_df = Model_com_df.sort_values(by='Model F1-Score', ascending=False)
# Show every metric as a percentage with two decimals
Model_com_df.style.format("{:.2%}")

