In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib


# Path to your CSV file
# file_path = 'data_simulation_new1.csv'
file_path = 'data_simulation_new1.csv'
# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Verify the column presence before any processing
print("\nColumns in original DataFrame:")
print(df.columns)


# Dropping columns that are not useful for the prediction
df.drop(columns=['NOME',
                 'COGNOME',
                 'TELEFONO',
                 'CELLULARE',
                 'EMAIL',
                 'CAP',
                 'INDIRIZZO',
                 'CODICE_FISCALE',
                 'IBAN',
                 'COMUNE_NASCITA',
                 'DATA_NASCITA',
                 'TFR',
                 'NOME_AZIENDA',
                 'CODICE_FISCALE_AZIENDA',
                 'PARTITA_IVA_AZIENDA',
                 'DOCUMENTAZIONE_PENSIONATO',
                 'REGISTRAZIONE_TEL_PRIMO_CONTATTO',
                 'NOTE_LAVORAZIONE_CONTATTO'
                 ], inplace=True)

df.info()


Columns in original DataFrame:
Index(['NOME', 'COGNOME', 'IMPORTO_RICHIESTO', 'TELEFONO', 'CELLULARE',
       'TIPO DI OCCUPAZIONE', 'PROVINCIA', 'CONSENSO_DATI_PRIVACY',
       'CONSENSO_DATI_MRKTG', 'CONSENSO_DATI_CESSIONE_TERZI', 'SESSO', 'EMAIL',
       'REGIONE', 'COMUNE', 'CAP', 'INDIRIZZO', 'CODICE_FISCALE', 'IBAN',
       'COMUNE_NASCITA', 'DATA_NASCITA', 'AGE', 'anni lavorativi',
       'MOTIVAZIONE_PRESTITO', 'IMPORTO_STIPENDIO_PENSIONE', 'TFR',
       'DATA_ ASSUNZIONE_PENSIONAMENTO', 'NOME_AZIENDA', 'TIPO_AZIENDA',
       'CODICE_FISCALE_AZIENDA', 'PARTITA_IVA_AZIENDA', 'TEMPO_INDETERMINATO',
       'PREVENTIVI_CONCORRENZA', 'TRATTENUTE_BUSTA_PAGA_PENSIONE',
       'ALTRI_FINANZIAMENTI_ PRESENTI', 'DOCUMENTAZIONE_PENSIONATO',
       'REGISTRAZIONE_TEL_PRIMO_CONTATTO', 'NOTE_LAVORAZIONE_CONTATTO'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 19 columns):
 #   Column                          Non-Null C

In [None]:
# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le


# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42)

# Initialize and train the RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=200, random_state=42)
# clf = RandomForestClassifier('bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2,'min_samples_split': 10, 'n_estimators': 200)
clf = RandomForestClassifier(bootstrap=True, max_depth=5,
                             min_samples_leaf=2, min_samples_split=5, n_estimators=200)

clf.fit(X_train, y_train)


# Save the trained model to a binary file
joblib.dump(clf, 'random_forest_model.pkl')
joblib.dump(encoders, 'encoders.pkl')

# Get feature importances
importances = clf.feature_importances_
feature_importances = pd.DataFrame(
    {'feature': X.columns, 'importance': importances})
feature_importances.sort_values(by='importance', ascending=False, inplace=True)

# Print the feature importances
print("\nFeature Importances:")
print(feature_importances)

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
# Use zero_division=0 to handle the warning
print(classification_report(y_test, y_pred, zero_division=0))

In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Assuming df is your DataFrame and it's already loaded
# df = pd.read_csv('your_data.csv')  # If you need to load data

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

# Initialize and train the MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(600,100), max_iter=600, random_state=42,activation='relu')
clf.fit(X_train, y_train)

# Save the trained model to a binary file
joblib.dump(clf, 'mlp_model.pkl')
joblib.dump(encoders, 'mlp_encoders.pkl')

# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.072

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        59
           1       0.00      0.00      0.00        68
           2       0.00      0.00      0.00        84
           3       0.07      1.00      0.13       108
           4       0.00      0.00      0.00        80
           5       0.00      0.00      0.00        84
           6       0.00      0.00      0.00        89
           7       0.00      0.00      0.00        50
           8       0.00      0.00      0.00        80
           9       0.00      0.00      0.00        61
          10       0.00      0.00      0.00        77
          11       0.00      0.00      0.00        78
          12       0.00      0.00      0.00       121
          13       0.00      0.00      0.00       111
          14       0.00      0.00      0.00       120
          15       0.00      0.00      0.00        94
          16       0.00      0.00      0

In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.utils import to_categorical

# Assuming df is your DataFrame and it's already loaded
# df = pd.read_csv('your_data.csv')  # If you need to load data

# Encode categorical variables using LabelEncoder
encoders = {}
for column in df.columns:
    if df[column].dtype == 'object' or df[column].dtype.name == 'category':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        encoders[column] = le

# Split the target variable
y = df['MOTIVAZIONE_PRESTITO']
X = df.drop(columns=['MOTIVAZIONE_PRESTITO'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.50, random_state=42)

# Reshape the data to fit a CNN
X_train = np.expand_dims(X_train, axis=2)
X_test = np.expand_dims(X_test, axis=2)

# Convert target variable to categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Initialize and train the CNN
model = Sequential()
model.add(Conv1D(64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Flatten())
model.add(Dense(100, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=1,validation_data=(X_test, y_test))

# Save the trained model and encoders
model.save('cnn_model.h5')
joblib.dump(encoders, 'cnn_encoders.pkl')

# Predict on the test set
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Evaluate the model
print(f"\nAccuracy: {accuracy_score(y_test_classes, y_pred_classes)}")
print("\nClassification Report:")
print(classification_report(y_test_classes, y_pred_classes, zero_division=0))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
  1/235 [..............................] - ETA: 20s

  saving_api.save_model(



Accuracy: 0.08786666666666666

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       314
           1       0.00      0.00      0.00       302
           2       0.00      0.00      0.00       338
           3       0.09      1.00      0.16       659
           4       0.00      0.00      0.00       456
           5       0.00      0.00      0.00       362
           6       0.00      0.00      0.00       479
           7       0.00      0.00      0.00       240
           8       0.00      0.00      0.00       415
           9       0.00      0.00      0.00       355
          10       0.00      0.00      0.00       360
          11       0.00      0.00      0.00       332
          12       0.00      0.00      0.00       586
          13       0.00      0.00      0.00       539
          14       0.00      0.00      0.00       543
          15       0.00      0.00      0.00       463
          16       0.00   

In [3]:


# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 19000,
    'TIPO DI OCCUPAZIONE': 'Insegnante',
    'PROVINCIA': 'Roma',
    'CONSENSO_DATI_PRIVACY': 'si',
    'CONSENSO_DATI_MRKTG': 'si',
    'CONSENSO_DATI_CESSIONE_TERZI': 'si',
    'SESSO': 'F',
    'REGIONE': 'Lazio',
    'IMPORTO_STIPENDIO_PENSIONE': 3000,
    # 'AGE_Category': '41-50',
    'anni_lavorativi_Category': '41',
    # 'TFR_Category': "Basso",
    'TIPO_AZIENDA': 'Privata',
    'TEMPO_INDETERMINATO': 'no',
    # 'PREVENTIVI_CONCORRENZA': 0,
    # 'TRATTENUTE_BUSTA_PAGA_PENSIONE': 0,
    'ALTRI_FINANZIAMENTI_PRESENTI': 'no'
}


# Load the trained model and encoders
clf_loaded = joblib.load('random_forest_model.pkl')
encoders_loaded = joblib.load('encoders.pkl')

# Encode the new customer profile using the loaded LabelEncoders
for column in new_customer:
    if column in encoders_loaded:
        new_customer[column] = encoders_loaded[column].transform(
            [new_customer[column]])[0]

# Convert the new customer profile to a DataFrame
new_customer_df = pd.DataFrame([new_customer])

# Ensure the new customer dataframe has the same columns as the training set
missing_cols = set(X.columns) - set(new_customer_df.columns)
for col in missing_cols:
    new_customer_df[col] = 0
new_customer_df = new_customer_df[X.columns]

# Predict the MOTIVAZIONE_PRESTITO for the new customer using the loaded model
prediction = clf_loaded.predict(new_customer_df)
print(f"\nPredicted MOTIVAZIONE_PRESTITO for new customer: {prediction[0]}")


# Predict the probabilities
predicted_probabilities = clf.predict_proba(new_customer_df)

# Convert the probabilities into a DataFrame
probabilities_df = pd.DataFrame(
    predicted_probabilities, columns=encoders['MOTIVAZIONE_PRESTITO'].classes_)

print(
    f"Predicted loan motivation for the new customer: {predicted_probabilities[0]}")

print("Probabilities for each class:")
print(probabilities_df.transpose())


Predicted MOTIVAZIONE_PRESTITO for new customer: 13
Predicted loan motivation for the new customer: [0.04138704 0.03289137 0.04580072 0.08365169 0.05190229 0.05124269
 0.07431814 0.02794732 0.04558194 0.04361251 0.03986902 0.05099939
 0.07932215 0.09944855 0.07660744 0.05589893 0.0442191  0.05529971]
Probabilities for each class:
                                  0
Corsi/Specializzazioni     0.041387
Risarcimenti               0.032891
acquisto arredamento casa  0.045801
acquisto auto/moto         0.083652
acquisto immobili          0.051902
anticipo prima casa        0.051243
consolidamento debiti      0.074318
investimenti               0.027947
liquidità                  0.045582
non specificata            0.043613
pagamenti imposte e tasse  0.039869
rinegoziazione             0.050999
ristrutturazione casa      0.079322
spese dentistiche          0.099449
spese medico sanitarie     0.076607
spese per cerimonie        0.055899
spese universitarie        0.044219
spese viaggi       

In [34]:
import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model

# Example new customer profile
new_customer = {
    'IMPORTO_RICHIESTO': 19000,
    'TIPO DI OCCUPAZIONE': 'Poliziotto',
    'PROVINCIA': 'Roma',
    'CONSENSO_DATI_PRIVACY': 'si',
    'CONSENSO_DATI_MRKTG': 'si',
    'CONSENSO_DATI_CESSIONE_TERZI': 'si',
    'SESSO': 'F',
    'REGIONE': 'Lazio',
    'IMPORTO_STIPENDIO_PENSIONE': 3000,
    # 'AGE_Category': '41-50',
    'anni_lavorativi_Category': '41',
    # 'TFR_Category': "Basso",
    'TIPO_AZIENDA': 'Privata',
    'TEMPO_INDETERMINATO': 'no',
    # 'PREVENTIVI_CONCORRENZA': 0,
    # 'TRATTENUTE_BUSTA_PAGA_PENSIONE': 0,
    'ALTRI_FINANZIAMENTI_PRESENTI': 'no'
}

# Load the trained CNN model and encoders
clf_loaded = load_model('cnn_model.h5')
encoders_loaded = joblib.load('cnn_encoders.pkl')

# Encode the new customer profile using the loaded LabelEncoders
for column in new_customer:
    if column in encoders_loaded:
        new_customer[column] = encoders_loaded[column].transform([new_customer[column]])[0]

# Convert the new customer profile to a DataFrame
new_customer_df = pd.DataFrame([new_customer])

# Ensure the new customer dataframe has the same columns as the training set
missing_cols = set(X.columns) - set(new_customer_df.columns)
for col in missing_cols:
    new_customer_df[col] = 0
new_customer_df = new_customer_df[X.columns]

# Reshape the new customer data to match the input shape of the CNN
new_customer_array = np.expand_dims(new_customer_df.values, axis=2).astype('float32')

# Predict the MOTIVAZIONE_PRESTITO for the new customer using the loaded model
prediction = clf_loaded.predict(new_customer_array)
predicted_class = np.argmax(prediction, axis=1)[0]

print(f"\nPredicted MOTIVAZIONE_PRESTITO for new customer: {predicted_class}")

# Predict the probabilities
predicted_probabilities = clf_loaded.predict(new_customer_array)

# Convert the probabilities into a DataFrame
probabilities_df = pd.DataFrame(
    predicted_probabilities, columns=encoders_loaded['MOTIVAZIONE_PRESTITO'].classes_)

print(f"Predicted loan motivation probabilities for the new customer:")
print(probabilities_df.transpose())

ValueError: could not convert string to float: 'Poliziotto'