In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the raw dataset from "german.csv" (path relative to the working directory).
data = pd.read_csv("german.csv")

In [None]:
# Work on a copy so the frame loaded into `data` is left untouched.
df = data.copy()

In [None]:
# Display the working frame.
df

In [None]:
# (number of rows, number of columns) of the working frame.
df.shape

# TARGET

In [None]:
# Relative frequency of each value of the target column 'Credit'.
df['Credit'].value_counts(normalize=True)

In [None]:
# Recode the target column: 1 -> 'Good', 2 -> 'Bad'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected Series: the in-place form acts on
# an intermediate object and leaves `df` unchanged under pandas Copy-on-Write.
# Re-running this cell is harmless (neither 1 nor 2 is left to replace).
df['Credit'] = df['Credit'].replace([1, 2], ['Good', 'Bad'])

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Print the distinct values of every object-typed column, each under a
# header made of the column name padded with dashes to 50 characters.
object_columns = df.select_dtypes('object').columns
for col in object_columns:
    print(f'{col:-<50}\n{df[col].unique()}')

# VARIABLES QUALITATIVES 


In [None]:
# For every object-typed column: print its relative frequencies, then draw a
# count plot on an explicit 5x5 figure and save it as "<column>_qual".
for col in df.select_dtypes('object'):
    print(f'{col:-<50}\n{df[col].value_counts(normalize=True)}')
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(data=df, x=df[col], ax=ax)
    fig.savefig("{}_qual".format(col))
    

# VARIABLES QUANTITATIVES 

In [None]:
# Histogram of every int64 column, saved as "<column>_histo".
# sns.displot is a figure-level function: it always builds its own figure, so
# the plt.figure(...) call that used to precede it only left an extra, empty
# figure open on each iteration. The 5x5 size is requested through
# height/aspect instead. plt.savefig still targets the figure displot created,
# because that figure is the current one.
for col in df.select_dtypes('int64'):
    sns.displot(data=df, x=df[col], height=5, aspect=1)
    plt.savefig("{}_histo".format(col))
    

In [None]:
# Box plot of every int64 column on an explicit 5x5 figure, saved as
# "<column>_box".
for col in df.select_dtypes('int64'):
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(data=df, x=df[col], ax=ax)
    fig.savefig("{}_box".format(col))
    

# VARIABLES QUANTITATIVES ET TARGET 

In [None]:
# Box plot of every int64 column split by the 'Credit' target (one box per
# target value, coloured by target), saved as "<column>_quan_target".
for col in df.select_dtypes('int64'):
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(data=df, x=df[col], y="Credit", hue='Credit', ax=ax)
    fig.savefig("{}_quan_target".format(col))

In [None]:
# Count plot of every object-typed column with bars split by the 'Credit'
# target, saved as "<column>_qual_target".
for col in df.select_dtypes('object'):
    fig, ax = plt.subplots(figsize=(5, 5))
    sns.countplot(data=df, x=df[col], hue='Credit', ax=ax)
    fig.savefig("{}_qual_target".format(col))

# ACM ACP

In [None]:
import prince 
from sklearn.compose import make_column_selector, make_column_transformer

In [None]:
# Options shared by both factor analyses.
shared_options = dict(n_iter=3, random_state=0, engine='auto')

# Multiple correspondence analysis with 2 components.
mca = prince.MCA(n_components=2, copy=True, **shared_options)

# Principal component analysis with 6 components; both rescaling options
# (mean and standard deviation) are switched on.
pca = prince.PCA(
    n_components=6,
    rescale_with_mean=True,
    rescale_with_std=True,
    **shared_options,
)

In [None]:
# Column selectors: everything that is not numeric vs. everything numeric.
categorical_features = make_column_selector(dtype_exclude=np.number)
numerical_features = make_column_selector(dtype_include=np.number)

# Call each selector on the frame to get the matching column names, then
# slice the frame with them.
X_MCA = df[categorical_features(df)]
X_PCA = df[numerical_features(df)]

y = df["Credit"]

In [None]:
# Fit each analysis on its own subset of columns; the names are rebound to
# whatever fit() returns (presumably the fitted estimator itself — confirm
# against the installed prince version).
mca = mca.fit(X_MCA)
pca = pca.fit(X_PCA)

In [None]:
# MCA map: only the column (category) points are drawn, without labels;
# row points and row labels are switched off.
mca_plot_options = dict(
    ax=None,
    figsize=(15, 6),
    show_row_points=False,
    row_points_size=3,
    show_row_labels=False,
    show_column_points=True,
    column_points_size=30,
    show_column_labels=False,
    legend_n_cols=1,
)
mca.plot_coordinates(X=X_MCA, **mca_plot_options)

In [None]:
# PCA row map on components 2 (x axis) and 3 (y axis), points coloured by the
# 'Credit' target, no ellipses; the figure is saved to disk.
pca_plot_options = dict(
    ax=None,
    x_component=2,
    y_component=3,
    figsize=(10, 10),
    color_labels=df["Credit"],
    ellipse_outline=False,
    ellipse_fill=False,
    show_points=True,
)
ax = pca.plot_row_coordinates(X=X_PCA, **pca_plot_options)
pca_figure = ax.get_figure()
pca_figure.savefig('pca_row_coordinates_23.png')

In [None]:
# Correlations reported by the fitted PCA for the numeric columns.
pca.column_correlations(X_PCA)

In [None]:
# Clustered heat map of the correlations between the numeric columns.
# - Only numeric columns are correlated: calling corr() on a frame that still
#   holds object columns raises on recent pandas versions.
# - sns.clustermap is figure-level and builds its own figure, so no
#   plt.figure(...) is created beforehand (it was left empty and unused).
# - The file name no longer depends on `col`, a loop variable left over from
#   an earlier cell that made the name vary with execution history.
numeric_corr = df.select_dtypes('number').corr()
sns.clustermap(numeric_corr)
plt.savefig("matrice_corr")

# PRE-PROCESSING

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline



In [None]:
# Recode the target for modelling: 'Good' -> 0, 'Bad' -> 1.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the selected Series, which leaves `df`
# unchanged under pandas Copy-on-Write.
# astype('int64') pins the dtype: recent pandas versions no longer downcast
# the object-typed result of replace() automatically, and an object-typed
# target is not a valid label array for the classifiers used later.
df['Credit'] = df['Credit'].replace(['Good', 'Bad'], [0, 1]).astype('int64')

In [None]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split
# reproducible.
trainset, testset = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Class counts of the target inside the training set.
trainset['Credit'].value_counts()

In [None]:
# Shapes of the training and test frames.
trainset.shape, testset.shape

In [None]:
# Display the training frame.
trainset

In [None]:
def preprocessing(df, transformer=None):
    """Split a frame into an encoded feature matrix and the target column.

    The 'Credit' column is removed from the features and returned as the
    target. Columns matched by the notebook-level `categorical_features`
    selector are ordinal-encoded; every other column is passed through
    unchanged. (The previous version relied on the column transformer's
    default remainder='drop', which silently discarded all numeric
    predictors.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'Credit' column and the predictors.
    transformer : fitted transformer, optional
        An already fitted object exposing `transform`. When given, it is
        applied as is, which lets a test set be encoded with the encoder
        learned on the training set. When omitted, a new column transformer
        is fitted on `df` itself.

    Returns
    -------
    X : array
        Encoded features, ordinal-encoded columns first.
    y : pandas.Series
        The 'Credit' column of `df`.
    """
    X = df.drop('Credit', axis=1)
    y = df['Credit']

    if transformer is None:
        # NOTE: fitting here means the category -> code mapping is learned
        # from `df` alone; pass the training-set transformer explicitly to
        # guarantee identical codes on another subset.
        transformer = make_column_transformer(
            (OrdinalEncoder(), categorical_features),
            remainder='passthrough',
        ).fit(X)

    return transformer.transform(X), y
    


In [None]:
# Preview the (features, target) pair produced for the training set.
preprocessing(trainset)

In [None]:
# Encoded training features and training target.
X_train, y_train = preprocessing(trainset)

In [None]:
# Display the training features and target.
X_train , y_train

In [None]:
# Encoded test features and test target.
# NOTE(review): this call fits its own encoder on the test set instead of
# reusing the one learned on the training set — confirm this is intended.
X_test, y_test = preprocessing(testset)

In [None]:
# Display the test features and target.
X_test, y_test

# MODELISATION ET EVALUATION

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve, GridSearchCV, RandomizedSearchCV

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [None]:
def evaluation(model, name, cv=4, scoring='recall', train_sizes=None):
    """Fit `model`, report its test-set performance and plot its learning curve.

    Uses the notebook-level `X_train`, `y_train`, `X_test` and `y_test`.

    Side effects:
      * `model` is fitted in place on the training data;
      * a heat map of the confusion matrix (each cell as a share of all test
        samples) is saved under the name "<name>_MC";
      * the raw confusion matrix and the classification report are printed;
      * the learning curve (mean train and validation score per training
        size) is saved under the name "<name>_courbe".

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict`.
    name : str
        Prefix used for the two saved figures.
    cv : int, default 4
        Number of cross-validation folds used for the learning curve.
    scoring : str, default 'recall'
        Metric used for the learning curve.
    train_sizes : array-like, optional
        Training-set fractions for the learning curve; defaults to
        np.linspace(0.1, 1, 5).
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1, 5)

    model.fit(X_train, y_train)
    ypred = model.predict(X_test)

    # Computed once and reused for both the heat map and the printed matrix.
    cf_matrix = confusion_matrix(y_test, ypred)

    fig_cm, ax_cm = plt.subplots(figsize=(5, 5))
    sns.heatmap(cf_matrix / np.sum(cf_matrix), annot=True, fmt='.2%',
                cmap='Blues', ax=ax_cm)
    fig_cm.savefig("{}_MC".format(name))

    print(cf_matrix)
    print(classification_report(y_test, ypred))

    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                               cv=cv, scoring=scoring,
                                               train_sizes=train_sizes)

    # One score per fold for each training size: average across the folds.
    fig_lc, ax_lc = plt.subplots(figsize=(5, 5))
    ax_lc.plot(N, train_score.mean(axis=1), label='train score')
    ax_lc.plot(N, val_score.mean(axis=1), label='validation score')
    ax_lc.legend()
    fig_lc.savefig("{}_courbe".format(name))
    
   
    
    
    
    
    

In [None]:
# Feature engineering shared by the tuned models: degree-2 polynomial
# expansion without the bias column, followed by univariate selection scored
# with f_classif.
polynomial_step = PolynomialFeatures(degree=2, include_bias=False)
selection_step = SelectKBest(score_func=f_classif)
preprocessor = make_pipeline(polynomial_step, selection_step)

In [None]:
# Support-vector pipeline: standardise, apply the shared feature engineering,
# then classify.
svm_classifier = SVC(random_state=0)
SVM = make_pipeline(StandardScaler(), preprocessor, svm_classifier)

# Random-forest pipeline: shared feature engineering, then classify (no
# scaling step).
forest_classifier = RandomForestClassifier(random_state=0)
RF = make_pipeline(preprocessor, forest_classifier)

In [None]:
# Display both pipelines.
SVM, RF

In [None]:
# Exhaustive search over the number of features kept by SelectKBest
# (10 to 15) for the SVM pipeline, scored on recall with 4-fold CV.
param_grid = {"pipeline__selectkbest__k": np.arange(10, 16)}
grid = GridSearchCV(estimator=SVM, param_grid=param_grid, scoring='recall', cv=4)

grid.fit(X_train, y_train)
    

In [None]:
# Best parameter combination found by the search above.
grid.best_params_

In [None]:
# Randomised search over the random-forest pipeline, scored on recall with
# 4-fold CV.
# random_state=0 (the seed used everywhere else in this notebook) fixes which
# parameter combinations are sampled; without it every run explored a
# different set of candidates and could report a different best estimator.
# NOTE(review): max_features goes up to 19 while SelectKBest keeps at most 15
# features — check how the installed scikit-learn treats max_features values
# larger than the number of selected features.
param_grid = {
    "randomforestclassifier__max_depth": np.arange(2,14),
    "randomforestclassifier__n_estimators":np.arange(2,150),
    "randomforestclassifier__max_features": np.arange(4,20), 
    "pipeline__selectkbest__k":np.arange(10,16),
}
grid = RandomizedSearchCV(RF, param_grid, cv=4, scoring='recall', random_state=0)

grid.fit(X_train, y_train)

In [None]:
# Pipeline refitted with the best parameters found by the search above.
grid.best_estimator_

In [None]:
# Final models, built with the same steps as the pipelines that were tuned
# above so that the hard-coded hyper-parameters apply to what they were
# selected for:
# - include_bias=False: the bias term is a constant column, which carries no
#   information for f_classif and was not part of the tuned pipelines;
# - StandardScaler in front of the SVM, as in the tuned `SVM` pipeline.
SVM_ = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(2, include_bias=False),
    SelectKBest(f_classif, k=10),
    SVC(random_state=0)
)

RF_ = make_pipeline(
    PolynomialFeatures(2, include_bias=False),
    SelectKBest(f_classif, k=11),
    RandomForestClassifier(random_state=0, max_depth=8, max_features=5, n_estimators=67)
)

# (label, estimator) pairs consumed by the evaluation loop.
models = [('RF', RF_), ('SVM', SVM_)]
models

In [None]:
# Evaluate every (label, estimator) pair; the label is printed first and also
# prefixes the figures saved by evaluation().
for name, model in models:
    print(name)
    evaluation(model=model, name=name)