### Import Libraries
First, we need to import the necessary libraries.

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score

### Load and Preview the Data
Load the dataset and take a quick look at the first few rows, the data info, and descriptive statistics.

In [19]:
# Load and preview the data
file_path = './loan_rfm.csv'  # Update path as needed
df = pd.read_csv(file_path)
# print("Data preview:\n", df.head())
# print("\nData info:")
# print(df.info())
print("\nDescriptive statistics:")
print(df.describe())


Descriptive statistics:
       customer_id_int      Recency    Frequency     Monetary  recency_score  \
count      1000.000000  1000.000000  1000.000000   1000.00000     1000.00000   
mean        500.500000    35.354000    10.000000  24877.57597        3.02700   
std         288.819436    33.996742     3.076925   9002.90459        1.41855   
min           1.000000     1.000000     1.000000    451.58000        1.00000   
25%         250.750000    10.750000     8.000000  18597.53000        2.00000   
50%         500.500000    24.000000    10.000000  24264.98000        3.00000   
75%         750.250000    51.000000    12.000000  30731.06000        4.00000   
max        1000.000000   260.000000    24.000000  53896.74000        5.00000   

       frequency_score  monetary_score           R            F            M  \
count      1000.000000     1000.000000  1000.00000  1000.000000  1000.000000   
mean          3.000000        3.000000     3.02700     2.842000     3.000000   
std           

### Define the Target Variable
Create a binary target variable based on the `RFM_Score`.

In [20]:
# Define target variable
df['target'] = df['RFM_Score'].apply(lambda x: 1 if x >= 10 else 0)
print("\nTarget variable distribution:")
print(df['target'].value_counts())


Target variable distribution:
target
0    559
1    441
Name: count, dtype: int64


### Define Features and Target
Select the features and the target variable for the model.

In [21]:
# Define features and target
X = df[['Recency', 'Frequency', 'Monetary', 'recency_score', 'frequency_score', 'monetary_score']]
y = df['target']


### Split Data into Training and Test Sets
Split the data into training and test sets.

In [22]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Scale the Features
Standardize the features to have a mean of 0 and a standard deviation of 1.

In [23]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Hyperparameter Tuning with GridSearchCV
Use GridSearchCV to find the best parameters for the logistic regression model.

In [24]:
# Initialize and train the model with improved parameters using GridSearchCV for hyperparameter tuning
param_grid = {
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced', None],
    'C': [0.1, 1.0, 10.0]
}
grid_search = GridSearchCV(LogisticRegression(max_iter=500), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_
print("\nBest model parameters:", grid_search.best_params_)


Best model parameters: {'C': 10.0, 'class_weight': None, 'solver': 'lbfgs'}


### Model Evaluation
Evaluate the model using accuracy and precision scores, and display the classification report and confusion matrix.

In [25]:
# Make predictions and evaluate
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model Precision: {precision * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 99.67%
Model Precision: 99.25%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       167
           1       0.99      1.00      1.00       133

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [26]:
# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Confusion Matrix:
[[166   1]
 [  0 133]]


### Add Probability Column
Add a column with the probability values of the logistic regression, rounded to two decimal places.

In [27]:
# Add a column with the probability values of the logistic regression (rounded to two decimals)
df['probability'] = best_model.predict_proba(scaler.transform(X))[:, 1]
df['probability'] = df['probability'].round(2)
print("\nData with probability values:\n", df.head())


Data with probability values:
    customer_id_int  Recency  Frequency  Monetary  recency_score  \
0                1       48         10  14692.31              2   
1                2        7         12  30247.14              5   
2                3      137          7  21466.88              1   
3                4        6         17  42961.48              5   
4                5       34          9  24022.56              2   

   frequency_score  monetary_score  R  F  M RFM_Segment  RFM_Score  target  \
0                3               1  2  3  1   2.03.01.0          6       0   
1                4               4  5  4  4   5.04.04.0         13       1   
2                1               2  1  1  2   1.01.02.0          4       0   
3                5               5  5  5  5   5.05.05.0         15       1   
4                2               3  2  2  3   2.02.03.0          7       0   

   probability  
0          0.0  
1          1.0  
2          0.0  
3          1.0  
4          

### Save the Updated DataFrame
Save the updated DataFrame to a new CSV file.

In [28]:
# Save the updated dataframe to a new CSV file
df.to_csv('loan_rfm_with_probabilities.csv', index=False)
print("\nUpdated data saved to 'loan_rfm_with_probabilities.csv'.")


Updated data saved to 'loan_rfm_with_probabilities.csv'.
