# Credit Score Classification: EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from dataprep.datasets import load_dataset
from dataprep.eda import create_report

In [None]:
# Turn the Google Drive "share" link into a direct-download link and read the CSV.
url = "https://drive.google.com/file/d/188k4zYkpSu3S1xD_r6ztc33RCLINl8-k/view?usp=sharing"
url = f"https://drive.google.com/uc?id={url.split('/')[-2]}"
df = pd.read_csv(url)

In [None]:
df.head(20)

In [None]:
df['Credit_Score'].value_counts()

In [None]:
df['Customer_ID'].value_counts()

In [None]:
df['ID'].value_counts()

In [None]:
df['SSN'].value_counts()

In [None]:
create_report(df).show()

In [None]:
# Correlate the numeric columns only: the raw frame still contains text
# columns, and DataFrame.corr() raises on those in pandas >= 2.0.
corr = df.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(12, 10))
# Hide the upper triangle (it mirrors the lower one).
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr, annot=True, mask=mask, cmap=cmap, ax=ax)

# Task 3: Preprocessing

- Remove unnecessary columns
- Convert categorical features to numerical
- Missing values: fill them per customer (relate rows through the customer ID) or with a KNN imputer
- Remove outliers and noise
- Correct skewness
- Revisit the `Type_of_Loan` column


In [None]:
df.head(20)

In [None]:
#df["Type_of_Loan"].value_counts()

In [None]:
# Re-read the raw CSV (same Google Drive file as in the EDA section).
url = "https://drive.google.com/file/d/188k4zYkpSu3S1xD_r6ztc33RCLINl8-k/view?usp=sharing"
url = f"https://drive.google.com/uc?id={url.split('/')[-2]}"
df = pd.read_csv(url)

In [None]:
#df["Changed_Credit_Limit"].value_counts

In [None]:
df.columns

In [None]:
# Call the method so the cell shows the counts, not the bound method object.
df["Payment_of_Min_Amount"].value_counts()

In [None]:
def preprocessing(df):
    """Clean the raw credit-score table and return a new, mostly numeric frame.

    The input frame is left untouched. Steps, in order:

    * ``Type_of_Loan`` (a comma-separated list per row) is split into one
      column per list position, labelled ``0, 1, 2, ...``. Every loan type is
      replaced by an integer code ``1..n`` assigned in alphabetical order, so
      the codes are the same on every run; ``0`` marks an unused position.
      Missing values count as the type ``'Not Specified'``.
    * ``ID``, ``Name`` and ``SSN`` are dropped.
    * Numeric columns stored as text are stripped of their placeholder
      characters and converted to numbers.
    * ``Credit_History_Age`` keeps only the number before its first space.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the columns read below; the cleaned columns are
        expected to be text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned columns followed by the loan-slot columns.
    """
    # ---- Type_of_Loan ---------------------------------------------------
    # "A, B, and C" -> "A, B, C"; missing -> 'Not Specified'.
    loans = (
        df["Type_of_Loan"]
        .str.replace(" and ", " ", regex=False)
        .fillna("Not Specified")
    )
    # Sorting makes the type -> code mapping deterministic (set order is not).
    loan_types = sorted(set(loans.str.split(", ").explode()))
    loan_codes = {name: code for code, name in enumerate(loan_types, start=1)}

    # One column per list position; rows with fewer loans get 0 in the rest.
    loan_slots = loans.str.split(", ", expand=True)
    loan_slots = (
        loan_slots.apply(lambda slot: slot.map(loan_codes))
        .fillna(0)
        .astype("uint8")
    )

    df = pd.concat(
        [df.drop(columns=["Type_of_Loan"]), loan_slots], axis=1, join="inner"
    )

    # ---- remaining columns ----------------------------------------------
    df = df.drop(columns=["ID", "Name", "SSN"])

    # Integer-like columns: keep digits only (this also drops a leading '-').
    df["Age"] = df["Age"].str.replace(r"\D+", "", regex=True).astype("int")
    df["Num_of_Loan"] = (
        df["Num_of_Loan"].str.replace(r"\D+", "", regex=True).astype("int")
    )
    df["Num_of_Delayed_Payment"] = (
        df["Num_of_Delayed_Payment"]
        .str.replace(r"\D+", "", regex=True)
        .astype("float")
    )

    # Income is fractional: strip the junk but keep the decimal point.
    df["Annual_Income"] = pd.to_numeric(
        df["Annual_Income"].str.replace(r"[^0-9.\-]", "", regex=True)
    )

    # '_' stands for a missing value and is replaced by -4.66; the sign is
    # removed BEFORE the unsigned cast so negatives do not wrap around.
    df["Changed_Credit_Limit"] = (
        pd.to_numeric(
            df["Changed_Credit_Limit"].str.replace("_", "-4.66", regex=False)
        )
        .abs()
        .astype("uint8")
    )

    df["Outstanding_Debt"] = pd.to_numeric(
        df["Outstanding_Debt"].str.replace("_", "", regex=False)
    )
    df["Payment_of_Min_Amount"] = df["Payment_of_Min_Amount"].str.replace(
        "NM", "No", regex=False
    )
    df["Amount_invested_monthly"] = pd.to_numeric(
        df["Amount_invested_monthly"].str.replace("_", "", regex=False)
    )
    df["Monthly_Balance"] = pd.to_numeric(
        df["Monthly_Balance"].str.replace(
            "__-333333333333333333333333333__", "333.3333", regex=False
        )
    )

    # Keep the text before the first space; missing stays missing.
    history = df["Credit_History_Age"]
    leading = history.map(lambda text: str(text).split(" ")[0])
    df["Credit_History_Age"] = pd.to_numeric(
        leading.where(history.notna())
    ).astype("float64")

    # Drop the sign before the unsigned cast (a negative would wrap to 255).
    # NOTE(review): values above 255 still wrap in uint8 - confirm the range.
    df["Num_Bank_Accounts"] = df["Num_Bank_Accounts"].abs().astype("uint8")

    return df

In [None]:
df=preprocessing(df)

In [None]:
# Give the positional loan-slot columns 0..8 readable names Loan1..Loan9.
# The mapping deliberately is not called `dict`: that would shadow the builtin
# for every cell executed afterwards.
loan_column_names = {slot: f"Loan{slot + 1}" for slot in range(9)}

df.rename(columns=loan_column_names, inplace=True)

In [None]:
CatCols = ["Customer_ID", "Month", "Occupation", "Credit_Mix", "Payment_of_Min_Amount"
           , "Payment_Behaviour","Credit_Score"]

In [None]:
dfCat = df[CatCols]

In [None]:
numCols = ['Age',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Loan1', 'Loan2', 'Loan3', 'Loan4', 'Loan5', 'Loan6',
       'Loan7', 'Loan8', 'Loan9']

In [None]:
dfNum = df[numCols]

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
imputed = imputer.fit_transform(dfNum)
df_imputed = pd.DataFrame(imputed, columns=dfNum.columns)


In [None]:
df.info()

In [None]:
df.columns

In [None]:
df=pd.concat([df_imputed, dfCat], axis=1, join='inner')

In [None]:
df["Monthly_Balance"].min()

In [None]:
df["Credit_History_Age"].value_counts()

In [None]:
df.columns

In [None]:
df.head()

In [None]:
df.head()

In [None]:
df.to_csv('fixed.csv', index=False)

# Importing new dataset:


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#from dataprep.datasets import load_dataset
#from dataprep.eda import create_report

In [None]:
""""
url="https://drive.google.com/file/d/1w_nuEXv8zL8xI7zn2I1auPDw6q35uHmF/view?usp=sharing"
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
df = pd.read_csv(url)
"""

In [None]:
#df=pd.read_csv(r'C:\Users\Karim-\Desktop\Machinefy Internship\fixed.csv')

In [None]:
df.head()

In [None]:
df['Credit_Score'].value_counts()

Label encoding used for `Credit_Score`:

- Standard = 2
- Poor = 1
- Good = 0

# Outliers:

In [None]:
# Call the method so the cell shows the counts, not the bound method object.
df['Changed_Credit_Limit'].value_counts()

In [None]:
#!pip install plotly

In [None]:
import plotly.express as px

In [None]:
def outlier_correction(df, column_name):
    """Clip one column of ``df`` to its 1.5 * IQR fences, in place.

    Values below ``Q1 - 1.5 * IQR`` are raised to that bound and values above
    ``Q3 + 1.5 * IQR`` are lowered to that bound. Missing values are ignored
    when the quartiles are computed and are left missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is replaced on this object.
    column_name : hashable
        Label of a numeric column in ``df``.

    Returns
    -------
    None
    """
    # nanpercentile: a single NaN must not turn both bounds into NaN.
    q1, q3 = np.nanpercentile(df[column_name], [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    # Assign the clipped column back instead of writing through
    # df[col][mask], which may hit a temporary copy and change nothing.
    df[column_name] = df[column_name].clip(lower=lower_bound, upper=upper_bound)

In [None]:
def outliers_correction_and_scaling(df):
    """Clip outliers and robust-scale the numeric columns of ``df`` in place.

    1. ``Age`` is clipped to the fixed range 14..85.
    2. Each column in ``outlier_columns`` is clipped by ``outlier_correction``.
    3. Every numeric column is rescaled with ``RobustScaler`` and the scaled
       values are written back; non-numeric columns are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Age`` and the columns listed in ``outlier_columns``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.
    """
    from sklearn.preprocessing import RobustScaler

    lower_bound_age = 14
    upper_bound_age = 85
    df["Age"] = df["Age"].clip(lower=lower_bound_age, upper=upper_bound_age)

    outlier_columns = ['Annual_Income', 'Num_Bank_Accounts', 'Num_Credit_Card', 'Interest_Rate',
                       'Num_of_Loan', 'Num_of_Delayed_Payment',
                       'Changed_Credit_Limit', 'Num_Credit_Inquiries', 'Amount_invested_monthly',
                       "Total_EMI_per_month"]
    for column in outlier_columns:
        outlier_correction(df, column)

    # Scale numeric columns only: text columns cannot be scaled. A plain array
    # is passed because scikit-learn rejects frames whose column labels mix
    # strings and integers.
    numeric_columns = list(df.select_dtypes(include="number").columns)
    if numeric_columns:
        scaled = RobustScaler().fit_transform(
            df[numeric_columns].to_numpy(dtype="float64")
        )
        # transform() returns a new array, so write the result back.
        for position, column in enumerate(numeric_columns):
            df[column] = scaled[:, position]
    return df

In [None]:
df=outliers_correction_and_scaling(df)

In [None]:
numCols = ['Age',
       'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Loan1', 'Loan2', 'Loan3', 'Loan4']

In [None]:
dfNum = df[numCols]

In [None]:
from sklearn.preprocessing import RobustScaler

transformer = RobustScaler().fit(dfNum)
# transform() returns a new array and leaves dfNum untouched, so store the
# result, keeping the original column names and row index.
dfNum = pd.DataFrame(transformer.transform(dfNum), columns=dfNum.columns, index=dfNum.index)
dfNum.head()

In [None]:
# Put the categorical columns next to the numeric ones (axis=1). The default
# axis=0 would stack them underneath as extra rows.
df = pd.concat([dfNum, df[CatCols]], axis=1)

In [None]:
df.head()

In [None]:
df = pd.read_csv("/Users/omniaelmenshawy/Desktop//a7lafinal.csv")

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df = df[['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
       'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly',
       'Monthly_Balance', 'Loan1', 'Loan2', 'Loan3', 'Loan4', 'Customer_ID', 'Month', 'Occupation',
       'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour',
       'Credit_Score']]

# Modelling:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.utils import resample
import seaborn as sns

In [None]:
df.head()

In [None]:
df.columns

In [None]:
sns.set(rc={'figure.figsize':(30,20)})
# Correlate numeric columns only: DataFrame.corr() raises on text columns in
# pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, fmt='0.2g', cmap='coolwarm')

In [None]:
#! pip install shap
#! pip install datasets

In [None]:
import shap
from datasets import load_dataset

In [None]:
from sklearn.utils import shuffle

df_testing = df.copy(deep=True)

# Build a more balanced training set: at most the first 25,000 'Standard'
# rows, the first 20,000 'Poor' rows and every 'Good' row. The shuffles are
# seeded so the resulting frame is the same on every run.
df2 = shuffle(df_testing[df_testing['Credit_Score'] == "Standard"][:25000], random_state=44)

df1 = shuffle(df_testing[df_testing['Credit_Score'] == "Poor"][:20000], random_state=44)

df0 = shuffle(df_testing[df_testing['Credit_Score'] == "Good"], random_state=44)

df_final = pd.concat([df2, df1, df0])

df_final = df_final.reset_index(drop=True)

In [None]:
# Encode the target only. A frame-wide replace would also rewrite the same
# words wherever they occur in other columns (e.g. 'Standard'/'Good' in
# Credit_Mix).
credit_score_codes = {'Standard': 2, 'Poor': 1, 'Good': 0}
# .get(label, label) leaves already-encoded values alone, so re-running the
# cell is harmless; astype fails loudly if an unknown label is left over.
df_final['Credit_Score'] = (
    df_final['Credit_Score']
    .map(lambda label: credit_score_codes.get(label, label))
    .astype('int64')
)

In [None]:
import xgboost as xgb

# Split off the target; note that pop() removes it from df_final itself.
y_train = df_final.pop("Credit_Score")
x_train = df_final
categorical_columns = ["Customer_ID", "Month", "Occupation", "Credit_Mix", "Payment_of_Min_Amount"
           , "Payment_Behaviour",
]
target = ["Credit_Score",]
# XGBoost needs pandas 'category' dtype to treat these columns as categorical.
x_train = x_train.astype({col: "category" for col in categorical_columns})


# Three-class booster, wrapped in a SHAP tree explainer.
data = xgb.DMatrix(x_train, label=y_train, enable_categorical=True)
model = xgb.train(params={"objective": "multi:softmax", 'num_class': 3}, dtrain=data)
explainer = shap.TreeExplainer(model)

In [None]:
x_train.columns

In [None]:
df["Loan2"].min()

In [None]:
df["Loan2"].max()

In [None]:
y_train.unique()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_train,y_train, test_size = 0.2, random_state=44)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_test.shape

In [None]:
# NOTE(review): `pos_pred` is not assigned in any cell visible in this
# notebook - confirm where it comes from, or remove this cell.
pos_pred.shape

## Logistic:

In [None]:
classifier = LogisticRegression(random_state=0)

In [None]:
classifier.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(classifier.score(x_train, y_train))

In [None]:
y_pred = classifier.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

## KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_classifier = KNeighborsClassifier(n_neighbors=3)
knn_classifier.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(knn_classifier.score(x_train, y_train))

In [None]:
y_pred = knn_classifier.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

## GaussianNB:

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(classifier.score(x_train, y_train))

In [None]:
y_pred = classifier.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

## SVC:

In [None]:
from sklearn.svm import SVC
classifier = SVC()
classifier.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(classifier.score(x_train, y_train))

In [None]:
y_pred = classifier.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
classifier = GradientBoostingClassifier()
classifier.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(classifier.score(x_train, y_train))

In [None]:
y_pred = classifier.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

## Random Forest:

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(max_depth=15, random_state=17)
clf.fit(x_train, y_train)

In [None]:
print('the score on train dataset is') 
print(clf.score(x_train, y_train))

In [None]:
y_pred = clf.predict(x_test)

In [None]:
print("Test Accuracy : ", accuracy_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
sns.set(rc={'figure.figsize':(16,8)})
sns.heatmap(confusion_matrix(y_test, y_pred),annot=True,fmt='',cmap='YlGnBu')

In [None]:
df['Credit_Score'].value_counts()

In [None]:
df.shape

In [None]:
df_final

# Now we will import the Test dataset!

In [None]:
df_test=pd.read_csv(r"C:\Users\Karim-\Desktop\Machinefy Internship\test.csv")

In [None]:
df_test.head()

In [None]:
df_test.shape

In [None]:
def preprocessing_test(df):
    """Clean, encode and impute the raw test table; return a fully numeric frame.

    The input frame is left untouched. Steps, in order:

    * ``Type_of_Loan`` (a comma-separated list per row) is split into one
      column per list position, labelled ``0, 1, 2, ...``. Every loan type is
      replaced by an integer code ``1..n`` assigned in alphabetical order, so
      the codes are the same on every run; ``0`` marks an unused position.
      Missing values count as the type ``'Not Specified'``.
    * ``ID``, ``Name`` and ``SSN`` are dropped.
    * Numeric columns stored as text are stripped of their placeholder
      characters and converted to numbers.
    * The six categorical columns are label-encoded.
    * Remaining missing values are filled with a 5-nearest-neighbour imputer.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing the columns read below; the cleaned columns are
        expected to be text.

    Returns
    -------
    pandas.DataFrame
        New float frame with the same column labels and a fresh 0..n-1 index.
    """
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import LabelEncoder

    # ---- Type_of_Loan ---------------------------------------------------
    # "A, B, and C" -> "A, B, C"; missing -> 'Not Specified'.
    loans = (
        df["Type_of_Loan"]
        .str.replace(" and ", " ", regex=False)
        .fillna("Not Specified")
    )
    # Sorting makes the type -> code mapping deterministic (set order is not).
    loan_types = sorted(set(loans.str.split(", ").explode()))
    loan_codes = {name: code for code, name in enumerate(loan_types, start=1)}

    # One column per list position; rows with fewer loans get 0 in the rest.
    loan_slots = loans.str.split(", ", expand=True)
    loan_slots = (
        loan_slots.apply(lambda slot: slot.map(loan_codes))
        .fillna(0)
        .astype("uint8")
    )

    df = pd.concat(
        [df.drop(columns=["Type_of_Loan"]), loan_slots], axis=1, join="inner"
    )

    # ---- remaining columns ----------------------------------------------
    df = df.drop(columns=["ID", "Name", "SSN"])

    # Integer-like columns: keep digits only (this also drops a leading '-').
    df["Age"] = df["Age"].str.replace(r"\D+", "", regex=True).astype("int")
    df["Num_of_Loan"] = (
        df["Num_of_Loan"].str.replace(r"\D+", "", regex=True).astype("int")
    )
    df["Num_of_Delayed_Payment"] = (
        df["Num_of_Delayed_Payment"]
        .str.replace(r"\D+", "", regex=True)
        .astype("float")
    )

    # Income is fractional: strip the junk but keep the decimal point.
    df["Annual_Income"] = pd.to_numeric(
        df["Annual_Income"].str.replace(r"[^0-9.\-]", "", regex=True)
    )

    # '_' stands for a missing value and is replaced by -4.66; the sign is
    # removed BEFORE the unsigned cast so negatives do not wrap around.
    df["Changed_Credit_Limit"] = (
        pd.to_numeric(
            df["Changed_Credit_Limit"].str.replace("_", "-4.66", regex=False)
        )
        .abs()
        .astype("uint8")
    )

    df["Outstanding_Debt"] = pd.to_numeric(
        df["Outstanding_Debt"].str.replace("_", "", regex=False)
    )
    df["Payment_of_Min_Amount"] = df["Payment_of_Min_Amount"].str.replace(
        "NM", "No", regex=False
    )
    df["Amount_invested_monthly"] = pd.to_numeric(
        df["Amount_invested_monthly"].str.replace("_", "", regex=False)
    )
    df["Monthly_Balance"] = pd.to_numeric(
        df["Monthly_Balance"].str.replace(
            "__-333333333333333333333333333__", "333.3333", regex=False
        )
    )

    # Keep the text before the first space; missing stays missing.
    history = df["Credit_History_Age"]
    leading = history.map(lambda text: str(text).split(" ")[0])
    df["Credit_History_Age"] = pd.to_numeric(
        leading.where(history.notna())
    ).astype("float64")

    # Drop the sign before the unsigned cast (a negative would wrap to 255).
    # NOTE(review): values above 255 still wrap in uint8 - confirm the range.
    df["Num_Bank_Accounts"] = df["Num_Bank_Accounts"].abs().astype("uint8")

    # ---- categorical -> integer codes ------------------------------------
    # NOTE(review): each encoder is fitted on this frame alone, so the codes
    # only match the training data if the same categories occur in both.
    categorical_columns = [
        "Customer_ID",
        "Month",
        "Occupation",
        "Credit_Mix",
        "Payment_of_Min_Amount",
        "Payment_Behaviour",
    ]
    for column in categorical_columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # ---- fill the remaining gaps -----------------------------------------
    # A plain array is passed because scikit-learn rejects frames whose
    # column labels mix strings and integers (the loan slots are integers).
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(df.to_numpy(dtype="float64"))
    df_imputed = pd.DataFrame(imputed, columns=df.columns)
    return df_imputed

In [None]:
df_final_test=preprocessing_test(df_test)

In [None]:
df_final_test.head()

In [None]:
df_final_test=outliers_correction_and_scaling(df_final_test)

In [None]:
df_final_test.head()

In [None]:
# Pick the model inputs from the preprocessed test frame. The integer labels
# 0-3 are the first four loan-slot columns (not renamed to Loan1..Loan4 here).
# NOTE(review): knn_classifier was fitted on x_train, whose columns also include
# e.g. 'Age', 'Customer_ID', 'Month', 'Occupation' and 'Credit_Utilization_Ratio'
# in a different order - confirm that this list matches the fitted feature set.
X_test=x = df_final_test[[ 'Annual_Income',
       'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
       'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
       'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Payment_Behaviour','Credit_Mix', 'Outstanding_Debt', 'Credit_History_Age',
       'Payment_of_Min_Amount', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance',0, 1, 2, 3]]

In [None]:
y_pred=knn_classifier.predict(X_test)

In [None]:
y_pred.shape