# ÖDEV 1: PCA yardımı ile Classification,

Bu ödevde "Credit Risk Prediction" veri setini kullanacağız. Amacımız, verinin boyut sayısını düşürerek olabildiğince yüksek accuracy değerini alabilmek. Aşağıda verinin okunma ve temizlenme kısmını hazırlayıp vereceğim. Devamında ise yapmanız gerekenler:

1. PCA kullanarak verinin boyutunu düşürmek
    * Önce explained varience ratio değerini inceleyerek veriyi kaç boyuta düşürebileceğini kontrol et.
    * Daha sonra farklı boyutlarda denemeler yaparak boyutu düşürülmüş verileri elde et.
2. Classification modellerini dene
    * Logistic Regression
    * Random Forest
    * ve eğer istersen herhangi bir modelle daha

İsteğe bağlı olarak, verinin boyutunu düşürmek için diğer yöntemleri de kullanıp en yüksek accuracy değerini almayı deneyebilirsin.

In [606]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [607]:
df: pd.DataFrame = pd.read_csv('./credit_risk_dataset.csv')

In [608]:
print(df.isnull().sum())

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64


In [609]:
# Null değerleri sütun ortalaması ile dolduruyoruz
df["person_emp_length"].fillna(df["person_emp_length"].median(), inplace=True)
df["loan_int_rate"].fillna(df["loan_int_rate"].median(), inplace=True)

In [610]:
df.duplicated().sum()

165

In [611]:
df.drop_duplicates(inplace=True)

In [612]:
df.shape

(32416, 12)

In [613]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,32416.0,27.747008,6.3541,20.0,23.0,26.0,30.0,144.0
person_income,32416.0,66091.640826,62015.580269,4000.0,38542.0,55000.0,79218.0,6000000.0
person_emp_length,32416.0,4.76888,4.090411,0.0,2.0,4.0,7.0,123.0
loan_amnt,32416.0,9593.845632,6322.730241,500.0,5000.0,8000.0,12250.0,35000.0
loan_int_rate,32416.0,11.014662,3.08305,5.42,8.49,10.99,13.11,23.22
loan_status,32416.0,0.218688,0.413363,0.0,0.0,0.0,0.0,1.0
loan_percent_income,32416.0,0.17025,0.106812,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,32416.0,5.811297,4.05903,2.0,3.0,4.0,8.0,30.0


In [614]:
# Outlier temizliği
df = df[df['person_age']<=100]
df = df[df['person_emp_length'] <= 60]
df = df[df['person_income']<=4e6]

In [615]:
# Kategorik verileri alıyoruz ve one hot encoding haline getiriyoruz
cat_cols = pd.DataFrame(df[df.select_dtypes(include=['object']).columns])
cat_cols.columns

Index(['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], dtype='object')

In [616]:
encoded_cat_cols = pd.get_dummies(cat_cols)
df.drop(df.select_dtypes(include=['object']).columns, axis=1,inplace=True)
df = pd.concat([df,encoded_cat_cols], axis=1)

In [617]:
X = df.drop('loan_status', axis=1).values
y = df['loan_status'].values

In [618]:
error = []
for i in range(1,26):
    pca = PCA(n_components=i)
    pca.fit_transform(X)
    error.append(1 - np.sum(pca.explained_variance_ratio_))
best_pca_dim = error.index(min(error))
print(f"Best dimension number is {best_pca_dim}", )
pca = PCA(n_components = best_pca_dim)
X_pca_best = pd.DataFrame(pca.fit_transform(X))

Best dimension number is 21


In [619]:
error

[0.012827621799866629,
 2.8997998335889008e-08,
 1.0507185677077757e-08,
 5.1205306661472605e-09,
 1.7522995277019504e-09,
 7.603012663892628e-10,
 6.150032705320996e-10,
 4.884235238478141e-10,
 4.1968417630044996e-10,
 3.516545943327287e-10,
 2.867506232462347e-10,
 2.2465795890269646e-10,
 1.65465641188689e-10,
 1.1492240492572137e-10,
 7.314471250907673e-11,
 3.604572196280742e-11,
 1.4632850486862026e-11,
 6.702638444266995e-12,
 3.962830064097034e-12,
 2.3248070135650778e-12,
 7.781553179597722e-13,
 -2.220446049250313e-16,
 -2.220446049250313e-16,
 -2.220446049250313e-16,
 -2.220446049250313e-16]

In [620]:
pca = PCA(n_components=0.99)
X_pca = pca.fit_transform(X)
print(f"{pca.components_.shape[0]} dimension is explain data more than %99 variation")

2 dimension is explain data more than %99 variation


In [621]:
# Verileri train ve test olarak ikiye ayırıyoruz

from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(1, test_size=0.1)
train_idx, test_idx = next(split.split(X_pca_best, y))
x_train = X[train_idx]
x_test = X[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]

In [622]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(x_train)
X_test_scaled = scaler.transform(x_test)

## Logistic Regression

In [623]:
log_reg = LogisticRegression(random_state = 42)
log_reg.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [624]:
print(confusion_matrix(y_test, log_reg.predict(X_test_scaled)))

[[2425  107]
 [ 312  397]]


In [625]:
print(classification_report(y_test, log_reg.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.89      0.96      0.92      2532
           1       0.79      0.56      0.65       709

    accuracy                           0.87      3241
   macro avg       0.84      0.76      0.79      3241
weighted avg       0.86      0.87      0.86      3241



## Random Forest

In [626]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(X_train_scaled, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [627]:
print(confusion_matrix(y_test, rf.predict(X_test_scaled)))

[[2525    7]
 [ 200  509]]


In [628]:
print(classification_report(y_test, rf.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      2532
           1       0.99      0.72      0.83       709

    accuracy                           0.94      3241
   macro avg       0.96      0.86      0.90      3241
weighted avg       0.94      0.94      0.93      3241



## Xgboost

In [635]:
#! pip install xgboost

In [636]:
import xgboost

In [637]:
xgb = xgboost.XGBClassifier()
xgb.fit(x_train,y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [638]:
print(confusion_matrix(y_test, xgb.predict(X_test_scaled)))

[[1604  928]
 [ 188  521]]


In [639]:
print(classification_report(y_test, xgb.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.90      0.63      0.74      2532
           1       0.36      0.73      0.48       709

    accuracy                           0.66      3241
   macro avg       0.63      0.68      0.61      3241
weighted avg       0.78      0.66      0.69      3241

