In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import pickle 
import os 
import sys
import warnings

from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier      
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 

sys.path.append(os.path.abspath(".."))
from cleaning_preprocess_ML.cleaning import datacleaning
from cleaning_preprocess_ML.cleaning import preprocess_data


In [2]:
path = os.getcwd()
chemin_repertoire = os.path.join(path, '../dataset')

# chemin des fichiers
chemin= os.path.join(chemin_repertoire, 'train_data.csv')

In [3]:
df = pd.read_csv(chemin)
df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df_cleaned = datacleaning(df)

  num_data_train.fillna(method='bfill', inplace=True)


In [5]:
df_cleaned.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status
0,5849,0.0,128.0,360.0,1.0,Male,No,0,Graduate,No,Urban,Y
1,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural,N
2,3000,0.0,66.0,360.0,1.0,Male,Yes,0,Graduate,Yes,Urban,Y
3,2583,2358.0,120.0,360.0,1.0,Male,Yes,0,Not Graduate,No,Urban,Y
4,6000,0.0,141.0,360.0,1.0,Male,No,0,Graduate,No,Urban,Y


In [6]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ApplicantIncome    614 non-null    int64  
 1   CoapplicantIncome  614 non-null    float64
 2   LoanAmount         614 non-null    float64
 3   Loan_Amount_Term   614 non-null    float64
 4   Credit_History     614 non-null    float64
 5   Gender             614 non-null    object 
 6   Married            614 non-null    object 
 7   Dependents         614 non-null    object 
 8   Education          614 non-null    object 
 9   Self_Employed      614 non-null    object 
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [7]:
type(df_cleaned)

pandas.core.frame.DataFrame

#### Feature Engineering

In [8]:
y = df_cleaned['Loan_Status']
y.head(5)

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

In [10]:
X = df_cleaned.drop(columns=['Loan_Status'])  # Sans inplace=True
X.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,5849,0.0,128.0,360.0,1.0,Male,No,0,Graduate,No,Urban
1,4583,1508.0,128.0,360.0,1.0,Male,Yes,1,Graduate,No,Rural
2,3000,0.0,66.0,360.0,1.0,Male,Yes,0,Graduate,Yes,Urban
3,2583,2358.0,120.0,360.0,1.0,Male,Yes,0,Not Graduate,No,Urban
4,6000,0.0,141.0,360.0,1.0,Male,No,0,Graduate,No,Urban


In [None]:
#def preprocess_data(dataframe): 

    #for col in dataframe.select_dtypes(include=['object']).columns:
        #le = LabelEncoder()
        #dataframe[col] = le.fit_transform(dataframe[col].astype(str))

    #return dataframe  # Retourne uniquement le DataFrame


In [11]:
print("Avant prétraitement :\n", X.shape)
X = preprocess_data(X)
print("\nAprès prétraitement :\n", X.shape)

Avant prétraitement :
 (614, 11)

Après prétraitement :
 (614, 11)


In [12]:
X.head(5)

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender,Married,Dependents,Education,Self_Employed,Property_Area
0,5849,0.0,128.0,360.0,1.0,1,0,0,0,0,2
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,1,2
3,2583,2358.0,120.0,360.0,1.0,1,1,0,1,0,2
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,2


In [13]:
print(type(X))

<class 'pandas.core.frame.DataFrame'>


In [14]:
y_value= {'Y':1, 'N':0}
y = y.map(y_value)

In [None]:
# division des données en données d'entrainement et données de test
# Vérifions que X et y ont bien la même taille
print("X shape:", X.shape)  # (614, n_features)
print("y shape:", y.shape)  # (614,)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train, test in sss.split(X, y):
    X_train, X_test = X.iloc[train], X.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]  

# Vérification des dimensions après split
print('X_train taille :', X_train.shape)
print('X_test taille :', X_test.shape)
print('y_train taille :', y_train.shape)    
print('y_test taille :', y_test.shape)


X shape: (614, 11)
y shape: (614,)
X_train taille : (491, 11)
X_test taille : (123, 11)
y_train taille : (491,)
y_test taille : (123,)


In [None]:
## on va appliquer trois algorithmes de classification : Logistic Regression, KNN et Decision Tree

In [182]:
#from sklearn.linear_model import LogisticRegression
#from sklearn.neighbors import KNeighborsClassifier      
#from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score 

In [19]:
models = {
    "LogisticRegression": LogisticRegression(random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=1, random_state=42)
}
# la fonction de precision 

def accuracy(y_true, y_pred, retu=False):
    accuracy = accuracy_score(y_true, y_pred)
    if retu:
        return accuracy
    else:
        print(f"La précision du modèle est: {accuracy}")
    return accuracy_score(y_true, y_pred)

# fonction d'application des modèles
def train_test_eval(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        print(name,":")
        model.fit(X_train, y_train)
        accuracy(y_test, model.predict(X_test))
        print("-"*30)

In [184]:
train_test_eval(models, X_train, y_train, X_test, y_test)

LogisticRegression :
La précision du modèle est: 0.8536585365853658
------------------------------
KNeighborsClassifier :
La précision du modèle est: 0.6504065040650406
------------------------------
DecisionTreeClassifier :
La précision du modèle est: 0.8455284552845529
------------------------------


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Application 

Pour Application 
On va choire quelques features important à la variable cible `Loan_Status`
 
'Credit_History', 'Gender', 'Married','ApplicantIncome', 'CoapplicantIncome'.

In [187]:
#X.columns
X_2 = X[['Credit_History', 'Gender', 'Married','ApplicantIncome', 'CoapplicantIncome']]

In [189]:
# division des données en données d'entrainement et données de test
# Vérifions que X et y ont bien la même taille
print("X shape:", X_2.shape)  # (614, n_features)
print("y shape:", y.shape)  # (614,)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train, test in sss.split(X_2, y):
    X_train, X_test = X_2.iloc[train], X_2.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]  

# Vérification des dimensions après split
print('X_train taille :', X_train.shape)
print('X_test taille :', X_test.shape)
print('y_train taille :', y_train.shape)    
print('y_test taille :', y_test.shape)

X shape: (614, 5)
y shape: (614,)
X_train taille : (491, 5)
X_test taille : (123, 5)
y_train taille : (491,)
y_test taille : (123,)


In [190]:
train_test_eval(models, X_train, y_train, X_test, y_test)

LogisticRegression :
La précision du modèle est: 0.8536585365853658
------------------------------
KNeighborsClassifier :
La précision du modèle est: 0.6585365853658537
------------------------------
DecisionTreeClassifier :
La précision du modèle est: 0.8455284552845529
------------------------------


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
X_3 = X[['Credit_History',  'Married', 'CoapplicantIncome']]

In [21]:
# division des données en données d'entrainement et données de test
# Vérifions que X et y ont bien la même taille
print("X shape:", X_3.shape)  # (614, n_features)
print("y shape:", y.shape)  # (614,)

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train, test in sss.split(X_3, y):
    X_train, X_test = X_3.iloc[train], X_3.iloc[test]
    y_train, y_test = y.iloc[train], y.iloc[test]  

# Vérification des dimensions après split
print('X_train taille :', X_train.shape)
print('X_test taille :', X_test.shape)
print('y_train taille :', y_train.shape)    
print('y_test taille :', y_test.shape)

X shape: (614, 3)
y shape: (614,)
X_train taille : (491, 3)
X_test taille : (123, 3)
y_train taille : (491,)
y_test taille : (123,)


In [22]:
train_test_eval(models, X_train, y_train, X_test, y_test)

LogisticRegression :
La précision du modèle est: 0.8536585365853658
------------------------------
KNeighborsClassifier :
La précision du modèle est: 0.6991869918699187
------------------------------
DecisionTreeClassifier :
La précision du modèle est: 0.8455284552845529
------------------------------


Nous allons garder la regression logistic 

In [23]:
# application du modèle Logistic Regression
Classifier = LogisticRegression()
Classifier.fit(X_3, y)

In [24]:
# enregistrement du modèle
pickle.dump(Classifier, open('model.pkl1', 'wb'))