## app_API

In [1]:
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from scipy.stats import chi2_contingency


%matplotlib inline
sns.set_theme(color_codes=True)

warnings.filterwarnings('ignore')

In [2]:
path = os.getcwd()
chemin_repertoire = os.path.join(path, '../Data')

# chemin fichier
chemin_data = os.path.join(chemin_repertoire, 'Credit_cleaned.csv')

df = pd.read_csv(chemin_data)
df.head()

Unnamed: 0,Durations,Credit history,Credit_Purpose,Credit amount,Installment_Rate_Percent,Other debtors / guarantors,Residence_Years,Property,Age in years,Other_Inst_Plans,...,Num_Existing_Credits,Job,Liable_People,Telephone,Foreign worker,Checking_Account_Status,Gender,Status,Years_Employments,Saving_Status
0,6,critical/other existing credit,radio/tv,1169,4,none,4,real estate,67,none,...,2,skilled,1,yes,yes,good,male,single,7 or more years,no savings
1,48,existing paid,radio/tv,5951,2,none,2,real estate,22,none,...,1,skilled,1,none,yes,bad,female,div/dep/mar,1-3 years,less than 100
2,12,critical/other existing credit,education,2096,2,none,3,real estate,49,none,...,1,unskilled_resident,2,none,yes,good,male,single,4-6 years,less than 100
3,42,existing paid,furniture/equipment,7882,2,guarantor,4,life insurance,45,none,...,1,skilled,2,none,yes,good,male,single,4-6 years,less than 100
4,24,delayed previously,new car,4870,3,none,4,no known property,53,none,...,2,skilled,2,none,yes,bad,male,single,1-3 years,less than 100


In [3]:
df.columns

Index(['Durations', 'Credit history', 'Credit_Purpose', 'Credit amount',
       'Installment_Rate_Percent', 'Other debtors / guarantors',
       'Residence_Years', 'Property', 'Age in years', 'Other_Inst_Plans',
       'Housing', 'Num_Existing_Credits', 'Job', 'Liable_People', 'Telephone',
       'Foreign worker', 'Checking_Account_Status', 'Gender', 'Status',
       'Years_Employments', 'Saving_Status'],
      dtype='object')

Selection de quelques variables pertinentes 

In [4]:
df = df[['Credit_Purpose', 'Credit history', 'Credit amount', 'Checking_Account_Status', 'Status',  'Saving_Status']]
df

Unnamed: 0,Credit_Purpose,Credit history,Credit amount,Checking_Account_Status,Status,Saving_Status
0,radio/tv,critical/other existing credit,1169,good,single,no savings
1,radio/tv,existing paid,5951,bad,div/dep/mar,less than 100
2,education,critical/other existing credit,2096,good,single,less than 100
3,furniture/equipment,existing paid,7882,good,single,less than 100
4,new car,delayed previously,4870,bad,single,less than 100
...,...,...,...,...,...,...
995,furniture/equipment,existing paid,1736,good,div/dep/mar,less than 100
996,used car,existing paid,3857,good,div/sep,less than 100
997,radio/tv,existing paid,804,good,single,less than 100
998,radio/tv,existing paid,1845,bad,single,less than 100


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Credit_Purpose           1000 non-null   object
 1   Credit history           1000 non-null   object
 2   Credit amount            1000 non-null   int64 
 3   Checking_Account_Status  1000 non-null   object
 4   Status                   1000 non-null   object
 5   Saving_Status            1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [6]:
for col in df.select_dtypes(include=['object', 'category']).columns:
    print(f"Modalités de {col} :")
    print(df[col].value_counts())  # Ajoute normalize=True pour voir les proportions
    print("-" * 40)

Modalités de Credit_Purpose :
Credit_Purpose
radio/tv               280
new car                234
furniture/equipment    181
used car               103
business                97
education               50
repairs                 22
domestic appliance      12
other                   12
retraining               9
Name: count, dtype: int64
----------------------------------------
Modalités de Credit history :
Credit history
existing paid                     530
critical/other existing credit    293
delayed previously                 88
all paid                           49
no credits/all paid                40
Name: count, dtype: int64
----------------------------------------
Modalités de Checking_Account_Status :
Checking_Account_Status
good    700
bad     300
Name: count, dtype: int64
----------------------------------------
Modalités de Status :
Status
single         548
div/dep/mar    310
mar/wid         92
div/sep         50
Name: count, dtype: int64
-------------------------------

In [30]:
df.columns

Index(['Credit_Purpose', 'Credit_history', 'Credit_amount', 'Account_status',
       'Status', 'Saving_Status'],
      dtype='object')

In [7]:
df = df.rename(columns= {
    'Credit_Purpose' : 'Credit_Purpose',
    'Credit history' : 'Credit_history',
    'Checking_Account_Status' : 'Account_status',
    'Credit amount': 'Credit_amount'
    
})

In [8]:
df.columns = df.columns.str.lower()

#df.columns = df.columns.str.lower().str.replace(" ", "_")


In [9]:
df.head(2)

Unnamed: 0,credit_purpose,credit_history,credit_amount,account_status,status,saving_status
0,radio/tv,critical/other existing credit,1169,good,single,no savings
1,radio/tv,existing paid,5951,bad,div/dep/mar,less than 100


In [10]:
# Regroupement pour Credit_Purpose
df['credit_purpose_grouped'] = df['credit_purpose'].replace({
    'radio/tv': 'electronics',
    'domestic appliance': 'electronics',
    'furniture/equipment': 'household',
    'repairs': 'household',
    'new car': 'car',
    'used car': 'car',
    'education': 'education',
    'retraining': 'education',
    'business': 'business',
    'other': 'other'
})


# Regroupement pour Credit history
df['credit_history_grouped'] = df['credit_history'].replace({
    'no credits/all paid': 'good',
    'all paid': 'good',
    'existing paid': 'average',
    'delayed previously': 'poor',
    'critical/other existing credit': 'critical'
})

# Regroupement pour Status
df['status_grouped'] = df['status'].replace({
    'single': 'single',
    'div/dep/mar': 'not_single',
    'mar/wid': 'not_single',
    'div/sep': 'not_single'
})

# Regroupement pour Saving_Status
df['saving_status_grouped'] = df['saving_status'].replace({
    'no savings': 'low',
    'less than 100': 'low',
    '100-500': 'medium',
    '500-1000': 'high',
    'more than 1000': 'high'
})


# Suppression des anciennes colonnes après regroupement
df = df.drop(columns= ['credit_purpose', 'credit_history', 'status', 'saving_status'])

# Vérification rapide
print("Nouvelles colonnes ajoutées :", ['credit_purpose_grouped','credit_history_grouped', 'status_grouped', 'saving_status_grouped'])


Nouvelles colonnes ajoutées : ['credit_purpose_grouped', 'credit_history_grouped', 'status_grouped', 'saving_status_grouped']


In [11]:
df.columns

Index(['credit_amount', 'account_status', 'credit_purpose_grouped',
       'credit_history_grouped', 'status_grouped', 'saving_status_grouped'],
      dtype='object')

In [12]:
df

Unnamed: 0,credit_amount,account_status,credit_purpose_grouped,credit_history_grouped,status_grouped,saving_status_grouped
0,1169,good,electronics,critical,single,low
1,5951,bad,electronics,average,not_single,low
2,2096,good,education,critical,single,low
3,7882,good,household,average,single,low
4,4870,bad,car,poor,single,low
...,...,...,...,...,...,...
995,1736,good,household,average,not_single,low
996,3857,good,car,average,not_single,low
997,804,good,electronics,average,single,low
998,1845,bad,electronics,average,single,low


In [13]:
df = df.rename(columns= {
    'credit_purpose_grouped' : 'credit_purpose',
    'credit_history_grouped' : 'credit_history',
    'status_grouped': 'status',
    'saving_status_grouped' : 'saving_status'
    
})

In [14]:
df.columns

Index(['credit_amount', 'account_status', 'credit_purpose', 'credit_history',
       'status', 'saving_status'],
      dtype='object')

In [19]:
#df.to_csv('df_api.csv', index=False)

### Modélisation 

In [40]:
df_prep = df.copy()
df_prep.head(5)

Unnamed: 0,credit_amount,account_status,credit_purpose,credit_history,status,saving_status
0,1169,good,electronics,critical,single,low
1,5951,bad,electronics,average,not_single,low
2,2096,good,education,critical,single,low
3,7882,good,household,average,single,low
4,4870,bad,car,poor,single,low


In [42]:
df_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   credit_amount   1000 non-null   int64 
 1   account_status  1000 non-null   object
 2   credit_purpose  1000 non-null   object
 3   credit_history  1000 non-null   object
 4   status          1000 non-null   object
 5   saving_status   1000 non-null   object
dtypes: int64(1), object(5)
memory usage: 47.0+ KB


In [43]:
for col in df_prep.select_dtypes(include=['object', 'category']).columns:
    print(f"Modalités de {col} :")
    print(df_prep[col].value_counts())  # Ajoute normalize=True pour voir les proportions
    print("-" * 40)

Modalités de account_status :
account_status
good    700
bad     300
Name: count, dtype: int64
----------------------------------------
Modalités de credit_purpose :
credit_purpose
car            337
electronics    292
household      203
business        97
education       59
other           12
Name: count, dtype: int64
----------------------------------------
Modalités de credit_history :
credit_history
average     530
critical    293
good         89
poor         88
Name: count, dtype: int64
----------------------------------------
Modalités de status :
status
single        548
not_single    452
Name: count, dtype: int64
----------------------------------------
Modalités de saving_status :
saving_status
low       786
high      111
medium    103
Name: count, dtype: int64
----------------------------------------


In [None]:
#cat =df_prep.select_dtypes(include=['object', 'category']).columns
#num= df_prep.select_dtypes(include=['int', 'float']).columns

In [47]:
cat_features = ['credit_purpose', 'credit_history', 'status','saving_status']
num_features = ['credit_amount']

# Normalisation des variables numériques
scaler = StandardScaler()
df_prep[num_features] = scaler.fit_transform(df_prep[num_features])

# Encodage One-Hot des variables catégorielles
df_prep = pd.get_dummies(df_prep, columns=cat_features, drop_first=True)

In [48]:
# Variable cible
y = df_prep['account_status'].apply(lambda x: 1 if x == 'bad' else 0)

# Variables explicatives
X = df_prep.drop(columns=['account_status'])

In [None]:
#y.value_counts()

account_status
0    700
1    300
Name: count, dtype: int64

In [51]:
df_prep.head(2)

Unnamed: 0,credit_amount,account_status,credit_purpose_car,credit_purpose_education,credit_purpose_electronics,credit_purpose_household,credit_purpose_other,credit_history_critical,credit_history_good,credit_history_poor,status_single,saving_status_low,saving_status_medium
0,-0.745131,good,False,False,True,False,False,True,False,False,True,True,False
1,0.949817,bad,False,False,True,False,False,False,False,False,False,True,False


In [53]:
X.columns

Index(['credit_amount', 'credit_purpose_car', 'credit_purpose_education',
       'credit_purpose_electronics', 'credit_purpose_household',
       'credit_purpose_other', 'credit_history_critical',
       'credit_history_good', 'credit_history_poor', 'status_single',
       'saving_status_low', 'saving_status_medium'],
      dtype='object')

In [54]:
print('taille de X :',  X.shape)
print('taille de y :',  y.shape)

taille de X : (1000, 12)
taille de y : (1000,)


In [None]:
selected_features = [
    'credit_amount',
    'credit_purpose_car',
    'credit_history_critical',
    'status_single',
    'saving_status_low',
    'credit_purpose_other'
]

In [59]:
y.value_counts()

account_status
0    700
1    300
Name: count, dtype: int64

In [57]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_sel = X[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('LogisticRegression acc:',accuracy )
print(classification_report(y_test, y_pred))


LogisticRegression acc: 0.715
              precision    recall  f1-score   support

           0       0.72      0.99      0.83       141
           1       0.67      0.07      0.12        59

    accuracy                           0.71       200
   macro avg       0.69      0.53      0.48       200
weighted avg       0.70      0.71      0.62       200



In [60]:
import joblib

joblib.dump(model, "log_reg.pkl")
joblib.dump(scaler, "scaler1.pkl")
joblib.dump(X_train.columns.tolist(), "features_columns1.pkl")


['features_columns1.pkl']

In [65]:
b = joblib.load("features_columns1.pkl")


In [66]:
b

['credit_amount',
 'credit_purpose_car',
 'credit_history_critical',
 'status_single',
 'saving_status_low',
 'credit_purpose_other']