In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib

# Define the function to categorize ages


def categorize_age(age):
    if 30 <= age <= 40:
        return "30-40"
    elif 41 <= age <= 50:
        return "41-50"
    elif 51 <= age <= 60:
        return "51-60"
    elif 61 <= age <= 70:
        return "61-70"
    else:
        return "Out of Range"

# Define the function to categorize working years


def categorize_wrk_yrs(wrkyrs):
    if 5 <= wrkyrs <= 10:
        return "5-10"
    elif 11 <= wrkyrs <= 20:
        return "11-20"
    elif 21 <= wrkyrs <= 30:
        return "21-30"
    elif 31 <= wrkyrs <= 40:
        return "31-40"
    elif 41 <= wrkyrs <= 45:
        return "41-45"
    else:
        return "Out of Range"


# Path to your CSV file
file_path = 'data_simulation.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Verify the column presence before any processing
print("\nColumns in original DataFrame:")
print(df.columns)

# Define the number of quantiles
num_bins = 5
df['TFR_Category'], bins = pd.qcut(df['TFR'], q=num_bins, labels=[
                                   'Molto Basso', 'Basso', 'Medio', 'Alto', 'Molto Alto'], retbins=True)

# Apply the function to the DataFrame to create new columns
df['AGE_Category'] = df['AGE'].apply(categorize_age)
df['anni_lavorativi_Category'] = df['anni lavorativi'].apply(
    categorize_wrk_yrs)

# Verify the column presence after adding new categories
print("\nColumns after adding categories:")
print(df.columns)

# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME', 'COGNOME', 'TELEFONO', 'CELLULARE', 'EMAIL', 'COMUNE', 'CAP', 'INDIRIZZO', 'CODICE_FISCALE', 'IBAN', 'COMUNE_NASCITA', 'AGE', 'anni lavorativi', 'TFR',
        'NOME_AZIENDA', 'CODICE_FISCALE_AZIENDA', 'PARTITA_IVA_AZIENDA', 'DOCUMENTAZIONE_PENSIONATO', 'REGISTRAZIONE_TEL_PRIMO_CONTATTO', 'NOTE_LAVORAZIONE_CONTATTO'], inplace=True)

# Verify the column presence after dropping columns
print("\nColumns after dropping unnecessary columns:")
print(df.columns)

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Verify the column presence after encoding
print("\nColumns after encoding:")
print(df.columns)

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model to a binary file
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'encoders.pkl')

# Get feature importances
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances})
feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 15000,
    'TIPO DI OCCUPAZIONE': 'Architetto comunale',
    'PROVINCIA': 'Roma',
    'CONSENSO_DATI_PRIVACY': 1,
    'CONSENSO_DATI_MRKTG': 1,
    'CONSENSO_DATI_CESSIONE_TERZI': 0,
    'SESSO': 'F',
    'REGIONE': 'Lazio',
    'IMPORTO_STIPENDIO_PENSIONE': 2000,
    'AGE_Category': '41-50',
    'anni_lavorativi_Category': '21-30',
    'TFR_Category': "Basso",
    'TIPO_AZIENDA': 'Pubblica',
    'TEMPO_INDETERMINATO': 1,
    'PREVENTIVI_CONCORRENZA': 0,
    'TRATTENUTE_BUSTA_PAGA_PENSIONE': 0,
    'ALTRI_FINANZIAMENTI_PRESENTI': 1
}

# Load the trained model and encoders
clf_loaded = joblib.load('random_forest_model.pkl')
encoders_loaded = joblib.load('encoders.pkl')

# Encode the new customer profile using the loaded LabelEncoders
for column in new_customer:
    if column in encoders_loaded:
        new_customer[column] = encoders_loaded[column].transform(
            [new_customer[column]])[0]

# Convert the new customer profile to a DataFrame
# Convert to DataFrame
new_customer_df = pd.DataFrame([new_customer])
# Ensure the new customer data has the same columns as the training data
new_customer_df = new_customer_df.reindex(columns=X.columns)
# Now you can make a prediction
predicted_motivation = clf.predict(new_customer_df)
predicted_motivation_label = encoders['MOTIVAZIONE_PRESTITO'].inverse_transform(
    predicted_motivation)

print(
    f"Predicted loan motivation for the new customer: {predicted_motivation_label[0]}")


# Predict the probabilities
predicted_probabilities = clf.predict_proba(new_customer_df)

# Convert the probabilities into a DataFrame
probabilities_df = pd.DataFrame(
    predicted_probabilities, columns=encoders['MOTIVAZIONE_PRESTITO'].classes_)

print(
    f"Predicted loan motivation for the new customer: {predicted_motivation_label[0]}")

print("Probabilities for each class:")
print(probabilities_df.transpose())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib

# Define the function to categorize ages


def categorize_age(age):
    if 30 <= age <= 40:
        return "30-40"
    elif 41 <= age <= 50:
        return "41-50"
    elif 51 <= age <= 60:
        return "51-60"
    elif 61 <= age <= 70:
        return "61-70"
    else:
        return "Out of Range"

# Define the function to categorize working years


def categorize_wrk_yrs(wrkyrs):
    if 5 <= wrkyrs <= 10:
        return "5-10"
    elif 11 <= wrkyrs <= 20:
        return "11-20"
    elif 21 <= wrkyrs <= 30:
        return "21-30"
    elif 31 <= wrkyrs <= 40:
        return "31-40"
    elif 41 <= wrkyrs <= 45:
        return "41-45"
    else:
        return "Out of Range"


# Path to your CSV file
file_path = 'data_simulation.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Define the number of quantiles
num_bins = 5
df['TFR_Category'], bins = pd.qcut(df['TFR'], q=num_bins, labels=[
                                   'Molto Basso', 'Basso', 'Medio', 'Alto', 'Molto Alto'], retbins=True)

# Apply the function to the DataFrame to create new columns
df['AGE_Category'] = df['AGE'].apply(categorize_age)
df['anni_lavorativi_Category'] = df['anni lavorativi'].apply(
    categorize_wrk_yrs)

# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME', 'COGNOME', 'TELEFONO', 'CELLULARE', 'EMAIL', 'COMUNE', 'CAP', 'INDIRIZZO', 'CODICE_FISCALE', 'IBAN', 'COMUNE_NASCITA', 'AGE', 'anni lavorativi', 'TFR',
        'NOME_AZIENDA', 'CODICE_FISCALE_AZIENDA', 'PARTITA_IVA_AZIENDA', 'DOCUMENTAZIONE_PENSIONATO', 'REGISTRAZIONE_TEL_PRIMO_CONTATTO', 'NOTE_LAVORAZIONE_CONTATTO'], inplace=True)


# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le


# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model to a binary file
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'encoders.pkl')

# Load the trained model and encoders
clf_loaded = joblib.load('random_forest_model.pkl')
encoders_loaded = joblib.load('encoders.pkl')

# Get feature importances
importances = clf.feature_importances_

# Convert the importances into a DataFrame
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})

# Sort the DataFrame by importance
feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\n")
print(feature_importances)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\n")
print(classification_report(y_test, y_pred))


# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 15000,
    #'TIPO DI OCCUPAZIONE': 'Architetto comunale',
    'TIPO DI OCCUPAZIONE': 'Dentista',
    'PROVINCIA': 'Roma',
    'CONSENSO_DATI_PRIVACY': 1,
    'CONSENSO_DATI_MRKTG': 1,
    'CONSENSO_DATI_CESSIONE_TERZI': 0,
    'SESSO': 'F',
    'REGIONE': 'Lazio',
    'IMPORTO_STIPENDIO_PENSIONE': 2000,
    'AGE_Category': '41-50',
    'anni_lavorativi_Category': '21-30',
    'TFR_Category': "Basso",
    'TIPO_AZIENDA': 'Pubblica',
    'TEMPO_INDETERMINATO': 1,
    'PREVENTIVI_CONCORRENZA': 0,
    'TRATTENUTE_BUSTA_PAGA_PENSIONE': 0,
    'ALTRI_FINANZIAMENTI_PRESENTI': 1
}

# Encode the new customer profile using the loaded LabelEncoders
for column in new_customer:
    if column in encoders_loaded:
        new_customer[column] = encoders_loaded[column].transform(
            [new_customer[column]])[0]

# Convert the new customer profile to a DataFrame
new_customer_df = pd.DataFrame([new_customer])

# Ensure the new customer dataframe has the same columns as the training set
missing_cols = set(X.columns) - set(new_customer_df.columns)
for col in missing_cols:
    new_customer_df[col] = 0
new_customer_df = new_customer_df[X.columns]

# Predict the MOTIVAZIONE_PRESTITO for the new customer using the loaded model
predicted_motivation = clf_loaded.predict(new_customer_df)
predicted_motivation_label = encoders['MOTIVAZIONE_PRESTITO'].inverse_transform(
    predicted_motivation)

print(f"Predicted loan motivation for the new customer: {predicted_motivation_label[0]}")

# Predict the probabilities
predicted_probabilities = clf_loaded.predict_proba(new_customer_df)

# Convert the probabilities into a DataFrame
probabilities_df = pd.DataFrame(
    predicted_probabilities, columns=encoders['MOTIVAZIONE_PRESTITO'].classes_)

print("Probabilities for each class:")
print(probabilities_df.transpose())

# To retrain the model with new data, combine the new data with the existing training data
X_train = pd.concat([X_train, new_customer_df])
y_train = pd.concat([y_train, pd.Series(
    encoders['MOTIVAZIONE_PRESTITO'].transform(predicted_motivation_label))])

# Retrain the model with the updated training data
clf_loaded.fit(X_train, y_train)

# Save the retrained model
joblib.dump(clf_loaded, 'random_forest_model_retrained.pkl')

print("\nModel retrained and saved successfully.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib

# Define the function to categorize ages


def categorize_age(age):
    if 30 <= age <= 40:
        return "30-40"
    elif 41 <= age <= 50:
        return "41-50"
    elif 51 <= age <= 60:
        return "51-60"
    elif 61 <= age <= 70:
        return "61-70"
    else:
        return "Out of Range"

# Define the function to categorize working years


def categorize_wrk_yrs(wrkyrs):
    if 5 <= wrkyrs <= 10:
        return "5-10"
    elif 11 <= wrkyrs <= 20:
        return "11-20"
    elif 21 <= wrkyrs <= 30:
        return "21-30"
    elif 31 <= wrkyrs <= 40:
        return "31-40"
    elif 41 <= wrkyrs <= 45:
        return "41-45"
    else:
        return "Out of Range"


# Path to your CSV file
file_path = 'data_simulation.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Verify the column presence before any processing
print("\nColumns in original DataFrame:")
print(df.columns)

# Define the number of quantiles
num_bins = 5
df['TFR_Category'], bins = pd.qcut(df['TFR'], q=num_bins, labels=[
                                   'Molto Basso', 'Basso', 'Medio', 'Alto', 'Molto Alto'], retbins=True)

# Apply the function to the DataFrame to create new columns
df['AGE_Category'] = df['AGE'].apply(categorize_age)
df['anni_lavorativi_Category'] = df['anni lavorativi'].apply(
    categorize_wrk_yrs)

# Verify the column presence after adding new categories
print("\nColumns after adding categories:")
print(df.columns)

# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME', 'COGNOME', 'TELEFONO', 'CELLULARE', 'EMAIL', 'COMUNE', 'CAP', 'INDIRIZZO', 'CODICE_FISCALE', 'IBAN', 'COMUNE_NASCITA', 'AGE', 'anni lavorativi', 'TFR',
        'NOME_AZIENDA', 'CODICE_FISCALE_AZIENDA', 'PARTITA_IVA_AZIENDA', 'DOCUMENTAZIONE_PENSIONATO', 'REGISTRAZIONE_TEL_PRIMO_CONTATTO', 'NOTE_LAVORAZIONE_CONTATTO'], inplace=True)

# Verify the column presence after dropping columns
print("\nColumns after dropping unnecessary columns:")
print(df.columns)

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Verify the column presence after encoding
print("\nColumns after encoding:")
print(df.columns)

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Save the trained model to a binary file
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'encoders.pkl')

# Get feature importances
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances})
feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Load the trained model and encoders
clf_loaded = joblib.load('random_forest_model.pkl')
encoders_loaded = joblib.load('encoders.pkl')

# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 15000,
    # Intentionally using an unseen label
    'TIPO DI OCCUPAZIONE': 'Architetto comunale',
    'PROVINCIA': 'Roma',
    'CONSENSO_DATI_PRIVACY': 1,
    'CONSENSO_DATI_MRKTG': 1,
    'CONSENSO_DATI_CESSIONE_TERZI': 0,
    'SESSO': 'F',
    'REGIONE': 'Lazio',
    'IMPORTO_STIPENDIO_PENSIONE': 2000,
    'AGE_Category': '41-50',
    'anni_lavorativi_Category': '21-30',
    'TFR_Category': "Basso",
    'TIPO_AZIENDA': 'Pubblica',
    'TEMPO_INDETERMINATO': 1,
    'PREVENTIVI_CONCORRENZA': 0,
    'TRATTENUTE_BUSTA_PAGA_PENSIONE': 0,
    'ALTRI_FINANZIAMENTI_PRESENTI': 1
}

try:
    # Encode the new customer profile using the loaded LabelEncoders
    for column in new_customer:
        if column in encoders_loaded:
            new_customer[column] = encoders_loaded[column].transform(
                [new_customer[column]])[0]

    # Convert the new customer profile to a DataFrame
    new_customer_df = pd.DataFrame([new_customer])

    # Ensure the new customer dataframe has the same columns as the training set
    missing_cols = set(X.columns) - set(new_customer_df.columns)
    for col in missing_cols:
        new_customer_df[col] = 0
    new_customer_df = new_customer_df[X.columns]

    # Predict the MOTIVAZIONE_PRESTITO for the new customer using the loaded model
    predicted_motivation = clf_loaded.predict(new_customer_df)
    predicted_motivation_label = encoders['MOTIVAZIONE_PRESTITO'].inverse_transform(
        predicted_motivation)

    print(
        f"Predicted loan motivation for the new customer: {predicted_motivation_label[0]}")

    # Predict the probabilities
    predicted_probabilities = clf_loaded.predict_proba(new_customer_df)

    # Convert the probabilities into a DataFrame
    probabilities_df = pd.DataFrame(
        predicted_probabilities, columns=encoders['MOTIVAZIONE_PRESTITO'].classes_)

    print("Probabilities for each class:")
    print(probabilities_df.transpose())

except ValueError as e:
    if "unseen labels" in str(e):
        print(f"Error: {e}")

        # Add the new data to the existing training data
        for column in new_customer:
            if column in encoders_loaded:
                encoders_loaded[column].fit(
                    pd.concat([df[column], pd.Series([new_customer[column]])], axis=0))
                new_customer[column] = encoders_loaded[column].transform(
                    [new_customer[column]])[0]

        # Convert the new customer profile to a DataFrame
        new_customer_df = pd.DataFrame([new_customer])

        # Ensure the new customer dataframe has the same columns as the training set
        missing_cols = set(X.columns) - set(new_customer_df.columns)
        for col in missing_cols:
            new_customer_df[col] = 0
        new_customer_df = new_customer_df[X.columns]

        # Combine the new data with the existing training data
        X_train = pd.concat([X_train, new_customer_df])
        y_train = pd.concat([y_train, pd.Series(
            encoders['MOTIVAZIONE_PRESTITO'].transform(predicted_motivation_label))])

        # Retrain the model with the updated training data
        clf_loaded.fit(X_train, y_train)

        # Save the retrained model
        joblib.dump(clf_loaded, 'random_forest_model_retrained.pkl')
        joblib.dump(encoders_loaded, 'encoders_retrained.pkl')

        print("\nModel retrained and saved successfully.")
    else:
        raise e

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib


# Path to your CSV file
#file_path = 'data_simulation_new1.csv'
file_path = 'data_simulation_new1.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Verify the column presence before any processing
print("\nColumns in original DataFrame:")
print(df.columns)


# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME',
                 'COGNOME',
                 'TELEFONO',
                 'CELLULARE',
                 'EMAIL',
                 'CAP',
                 'INDIRIZZO',
                 'CODICE_FISCALE',
                 'IBAN',
                 'COMUNE_NASCITA',
                 'DATA_NASCITA',
                 'IMPORTO_STIPENDIO_PENSIONE',
                 'TFR',
                 'DATA_ ASSUNZIONE_PENSIONAMENTO',
                 'NOME_AZIENDA',
                 'CODICE_FISCALE_AZIENDA',
                 'PARTITA_IVA_AZIENDA',
                 'TEMPO_INDETERMINATO',
                 'PREVENTIVI_CONCORRENZA',
                 'TRATTENUTE_BUSTA_PAGA_PENSIONE',
                 'ALTRI_FINANZIAMENTI_ PRESENTI',
                 'DOCUMENTAZIONE_PENSIONATO',
                 'REGISTRAZIONE_TEL_PRIMO_CONTATTO',
                 'NOTE_LAVORAZIONE_CONTATTO'
                 ], inplace=True)

df.info()


Columns in original DataFrame:
Index(['NOME', 'COGNOME', 'IMPORTO_RICHIESTO', 'TELEFONO', 'CELLULARE',
       'TIPO DI OCCUPAZIONE', 'PROVINCIA', 'CONSENSO_DATI_PRIVACY',
       'CONSENSO_DATI_MRKTG', 'CONSENSO_DATI_CESSIONE_TERZI', 'SESSO', 'EMAIL',
       'REGIONE', 'COMUNE', 'CAP', 'INDIRIZZO', 'CODICE_FISCALE', 'IBAN',
       'COMUNE_NASCITA', 'DATA_NASCITA', 'AGE', 'anni lavorativi',
       'MOTIVAZIONE_PRESTITO', 'IMPORTO_STIPENDIO_PENSIONE', 'TFR',
       'DATA_ ASSUNZIONE_PENSIONAMENTO', 'NOME_AZIENDA', 'TIPO_AZIENDA',
       'CODICE_FISCALE_AZIENDA', 'PARTITA_IVA_AZIENDA', 'TEMPO_INDETERMINATO',
       'PREVENTIVI_CONCORRENZA', 'TRATTENUTE_BUSTA_PAGA_PENSIONE',
       'ALTRI_FINANZIAMENTI_ PRESENTI', 'DOCUMENTAZIONE_PENSIONATO',
       'REGISTRAZIONE_TEL_PRIMO_CONTATTO', 'NOTE_LAVORAZIONE_CONTATTO'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column                        Non-Null Cou

In [63]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Assuming df is your DataFrame and it's already loaded
# df = pd.read_csv('your_data.csv')  # If you need to load data

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Get the best model
best_clf = grid_search.best_estimator_

# Save the best model to a binary file
joblib.dump(best_clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'random_forest_encoders.pkl')

# Get feature importances
importances = best_clf.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances})
feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)

# Predict on the test set using the best model
y_pred = best_clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
# Use zero_division=0 to handle the warning
print(classification_report(y_test, y_pred, zero_division=0))

# Print the best parameters found by GridSearchCV
print("\nBest parameters found by GridSearchCV:")
print(grid_search.best_params_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


  _data = np.array(data, dtype=dtype, copy=copy,



Feature Importances:
                           feature  importance
12  DATA_ ASSUNZIONE_PENSIONAMENTO    0.147062
1              TIPO DI OCCUPAZIONE    0.101831
0                IMPORTO_RICHIESTO    0.092700
9                              AGE    0.089458
10                 anni lavorativi    0.089428
2                        PROVINCIA    0.088884
8                           COMUNE    0.085980
7                          REGIONE    0.072024
11      IMPORTO_STIPENDIO_PENSIONE    0.037149
16  TRATTENUTE_BUSTA_PAGA_PENSIONE    0.023529
5     CONSENSO_DATI_CESSIONE_TERZI    0.022606
6                            SESSO    0.022330
15          PREVENTIVI_CONCORRENZA    0.022277
14             TEMPO_INDETERMINATO    0.021961
3            CONSENSO_DATI_PRIVACY    0.021837
4              CONSENSO_DATI_MRKTG    0.021055
13                    TIPO_AZIENDA    0.020065
17   ALTRI_FINANZIAMENTI_ PRESENTI    0.019823

Accuracy: 0.08222222222222222

Classification Report:
              precision    rec

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import joblib


# Path to your CSV file
# file_path = 'data_simulation_new1.csv'
file_path = 'data_simulation_new1.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME',
                 'COGNOME',
                 'TELEFONO',
                 'CELLULARE',
                 'EMAIL',
                 'CAP',
                 'INDIRIZZO',
                 'CODICE_FISCALE',
                 'IBAN',
                 'COMUNE_NASCITA',
                 'DATA_NASCITA',
                 'IMPORTO_STIPENDIO_PENSIONE',
                 'TFR',
                 'DATA_ ASSUNZIONE_PENSIONAMENTO',
                 'NOME_AZIENDA',
                 'CODICE_FISCALE_AZIENDA',
                 'PARTITA_IVA_AZIENDA',
                 'TEMPO_INDETERMINATO',
                 'PREVENTIVI_CONCORRENZA',
                 'TRATTENUTE_BUSTA_PAGA_PENSIONE',
                 'ALTRI_FINANZIAMENTI_ PRESENTI',
                 'DOCUMENTAZIONE_PENSIONATO',
                 'REGISTRAZIONE_TEL_PRIMO_CONTATTO',
                 'NOTE_LAVORAZIONE_CONTATTO'
                 ], inplace=True)


y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Initialize and train the RandomForestClassifier

#clf = RandomForestClassifier(n_estimators=200, random_state=42)
#clf = RandomForestClassifier('bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2,'min_samples_split': 10, 'n_estimators': 200)
clf = RandomForestClassifier(bootstrap=True,max_depth=10,min_samples_leaf=1,min_samples_split=10,n_estimators=500)
clf.fit(X_train, y_train)

# Save the trained model to a binary file
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'random_forest_encoders.pkl')

# Get feature importances
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances})

feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)


# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")

#print(classification_report(y_test, y_pred))
# Use zero_division=0 to handle the warning
print(classification_report(y_test, y_pred, zero_division=0))


Feature Importances:
                         feature  importance
1            TIPO DI OCCUPAZIONE    0.136880
0              IMPORTO_RICHIESTO    0.131289
9                            AGE    0.121200
10               anni lavorativi    0.120140
2                      PROVINCIA    0.116375
8                         COMUNE    0.114260
7                        REGIONE    0.096076
4            CONSENSO_DATI_MRKTG    0.034740
5   CONSENSO_DATI_CESSIONE_TERZI    0.033901
3          CONSENSO_DATI_PRIVACY    0.033276
6                          SESSO    0.032746
11                  TIPO_AZIENDA    0.029118

Accuracy: 0.08666666666666667

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        94
           1       0.00      0.00      0.00        95
           2       0.00      0.00      0.00       108
           3       0.09      0.61      0.15       179
           4       0.10      0.02      0.03       123
           5 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib


# Path to your CSV file
# file_path = 'data_simulation_new1.csv'
file_path = 'data_simulation_new1.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME',
                 'COGNOME',
                 'TELEFONO',
                 'CELLULARE',
                 'EMAIL',
                 'CAP',
                 'INDIRIZZO',
                 'CODICE_FISCALE',
                 'IBAN',
                 'COMUNE_NASCITA',
                 'DATA_NASCITA',
                 'IMPORTO_STIPENDIO_PENSIONE',
                 'TFR',
                 'DATA_ ASSUNZIONE_PENSIONAMENTO',
                 'NOME_AZIENDA',
                 'CODICE_FISCALE_AZIENDA',
                 'PARTITA_IVA_AZIENDA',
                 'TEMPO_INDETERMINATO',
                 'PREVENTIVI_CONCORRENZA',
                 'TRATTENUTE_BUSTA_PAGA_PENSIONE',
                 'ALTRI_FINANZIAMENTI_ PRESENTI',
                 'DOCUMENTAZIONE_PENSIONATO',
                 'REGISTRAZIONE_TEL_PRIMO_CONTATTO',
                 'NOTE_LAVORAZIONE_CONTATTO'
                 ], inplace=True)

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])


# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 11000,
    'TIPO DI OCCUPAZIONE': 'Interior Designer',
    'PROVINCIA': 'Treviso',
    'CONSENSO_DATI_PRIVACY': 'si',
    'CONSENSO_DATI_MRKTG': 'no',
    'CONSENSO_DATI_CESSIONE_TERZI': 'no',
    'SESSO': 'M',
    'REGIONE': 'Veneto',
    'COMUNE': 'Treviso',
    'AGE': 62,
    'anni lavorativi': 37,
    'TIPO_AZIENDA': 'Privata'
}


# Load the trained model and encoders
clf_loaded = joblib.load('random_forest_model.pkl')
encoders_loaded = joblib.load('random_forest_encoders.pkl')



# Encode the new customer profile using the loaded LabelEncoders
for column in new_customer:
    if column in encoders_loaded:
        new_customer[column] = encoders_loaded[column].transform([new_customer[column]])[0]

# Convert the new customer profile to a DataFrame
new_customer_df = pd.DataFrame([new_customer])


# Ensure the new customer dataframe has the same columns as the training set
missing_cols = set(X.columns) - set(new_customer_df.columns)
for col in missing_cols:
    new_customer_df[col] = 0
new_customer_df = new_customer_df[X.columns]


# Predict the MOTIVAZIONE_PRESTITO for the new customer using the loaded model
prediction = clf_loaded.predict(new_customer_df)
print(f"\nPredicted MOTIVAZIONE_PRESTITO for new customer: {prediction[0]}")


# Predict the probabilities
predicted_probabilities = clf_loaded.predict_proba(new_customer_df)


# Convert the probabilities into a DataFrame
probabilities_df = pd.DataFrame(
    predicted_probabilities, columns=encoders_loaded['MOTIVAZIONE_PRESTITO'].classes_)


print(f"Predicted loan motivation for the new customer: {predicted_probabilities[0]}")


print("Probabilities for each class:")
print(probabilities_df.transpose())


Predicted MOTIVAZIONE_PRESTITO for new customer: 12
Predicted loan motivation for the new customer: [0.03869092 0.04843309 0.03381679 0.09947997 0.05309593 0.02999857
 0.06271113 0.03314626 0.06424165 0.03597952 0.0335014  0.05343591
 0.10140831 0.06663295 0.07240879 0.06714125 0.04402397 0.06185359]
Probabilities for each class:
                                  0
Corsi/Specializzazioni     0.038691
Risarcimenti               0.048433
acquisto arredamento casa  0.033817
acquisto auto/moto         0.099480
acquisto immobili          0.053096
anticipo prima casa        0.029999
consolidamento debiti      0.062711
investimenti               0.033146
liquidità                  0.064242
non specificata            0.035980
pagamenti imposte e tasse  0.033501
rinegoziazione             0.053436
ristrutturazione casa      0.101408
spese dentistiche          0.066633
spese medico sanitarie     0.072409
spese per cerimonie        0.067141
spese universitarie        0.044024
spese viaggi       