In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

import re  # library for regex

import warnings    # Applies a warning filter to not show warning messages.
warnings.filterwarnings("ignore")  # Silence every warning for the rest of the session.
warnings.warn("this will not show")  # Sanity check: this warning is swallowed by the filter above.

plt.rcParams["figure.figsize"] = (10,6)
# Default Matplotlib figure size: 10 wide by 6 high (Matplotlib measures figsize in inches).

sns.set_style("whitegrid")
# Seaborn style with a white background and grid lines.

pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Render floats with three decimal places whenever pandas displays them.
# The lambda receives each float and returns its formatted string.

pd.set_option('display.max_rows', None)
# None removes the row limit: ALL rows of a data frame are displayed (no truncation).

pd.set_option('display.max_columns', None)
# None removes the column limit: ALL columns of a data frame are displayed (no truncation).



In [None]:
# Read the train and test CSV files from the Kaggle input directory into dataframes.
train_data= pd.read_csv("/kaggle/input/credit-score-classification/train.csv")
test_data = pd.read_csv ("/kaggle/input/credit-score-classification/test.csv")

In [None]:
print(train_data.shape)
print(test_data.shape)

# ***Dataset info***<br><br>
- ***ID:*** Unique identifier for each entry in the dataset.<br>

- ***Customer_ID:*** Identifier for each customer.<br>

- ***Month:*** Month of data collection.<br>

- ***Name:*** Name of the customer.<br>

- ***Age:*** Age of the customer.<br>

- ***SSN:*** Social Security Number of the customer.<br>

- ***Occupation:*** Occupation of the customer.<br>

- ***Annual_Income:*** Annual income of the customer.<br>

- ***Monthly_Inhand_Salary:*** Monthly salary after deductions.<br>

- ***Num_Bank_Accounts:*** Number of bank accounts the customer has.<br>

- ***Num_Credit_Card:*** Number of credit cards the customer has.<br>

- ***Interest_Rate:*** Interest rate applied on loans.<br>

- ***Num_of_Loan:*** Number of loans the customer has.<br>

- ***Type_of_Loan:*** Type of loan taken by the customer.<br>

- ***Delay_from_due_date:*** Number of days delayed from due date for payments.<br>

- ***Num_of_Delayed_Payment:*** Number of delayed payments made by the customer.<br>

- ***Changed_Credit_Limit:*** Indicates if the credit limit has been changed.<br>

- ***Num_Credit_Inquiries:*** Number of credit inquiries made by the customer.<br>

- ***Credit_Mix:*** Mix of different types of credit accounts held by the customer.<br>

- ***Outstanding_Debt:*** Amount of outstanding debt.<br>

- ***Credit_Utilization_Ratio:*** Ratio of credit used to credit available.<br>

- ***Credit_History_Age:*** Age of credit history.<br>

- ***Payment_of_Min_Amount:*** Indicates if minimum payment amount is met.<br>

- ***Total_EMI_per_month:*** Total Equated Monthly Installment (EMI) paid by the customer.<br>

- ***Amount_invested_monthly:*** Amount invested monthly by the customer.<br>

- ***Payment_Behaviour:*** Payment behavior of the customer.<br>

- ***Monthly_Balance:*** Monthly balance in the account.<br>

- ***Credit_Score:*** Target variable - credit score of the customer.<br>



### Definition of df

In [None]:
df = train_data.copy()
df.head()

In [None]:
df.shape

In [None]:
df.columns

#### First of all, it is useful to know how the relevant institutions calculate the credit score and which parameters go into that calculation.
https://www.investopedia.com/ask/answers/05/creditscorecalculation.asp<br>
A credit score is designed to measure your risk as a borrower. FICO does not reveal its proprietary credit score calculator formula, but the calculation incorporates five major components, with varying levels of importance. These categories with their relative weights are<br>
- Payment history (35%)<br>
- Amount owed (30%)<br>
- Length of credit history (15%)<br>
- New credit (10%)<br>
- Credit mix (10%)<br><br>
***All of these categories are taken into account in the calculation of your overall score, which can range from 300 to 850. I tried to associate the features in the dataset with the parameters in these 5 categories.***<br><br>
- Payment history (35%) and features in our dataset that may be related : **"Payment_of_Min_Amount","Payment_Behaviour","Num_of_Delayed_Payment",
"Delay_from_due_date","Outstanding_Debt","Amount_invested_monthly"** <br>
- Amount owed (30%) and features in our dataset that may be related:  **"Credit_Utilization_Ratio","Outstanding_Debt","Monthly_Balance","Amount_invested_monthly"**<br>
- Length of credit history (15%) and features in our dataset that may be related : ***"Credit_History_Age","Num_of_Loan"***   <br>
- New credit (10%) and features in our dataset that may be related : **"Num_Credit_Inquiries","Num_of_Loan","Payment_of_Min_Amount","Monthly_Balance","Amount_invested_monthly"**   <br>
- Credit mix (10%) and features in our dataset that may be related :**"Credit_Mix"**     <br><br>
 What Isn’t Included:<br>
 - Your credit score reflects only the information contained in your credit report. Your credit report doesn't include information like your **age**, **income**, or employment history. It also will generally not include your history with utilities such as cable and phone bills nor your rental payment history.

In [None]:
df.info()

In [None]:
# The alternative code for an overview of dataset features ;

def compr_info(df, dropna=False):
    """
    Build a per-column overview of a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    dropna : bool, default False
        Accepted for backward compatibility; the summary does not use it.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df``, with the columns
        ``Type`` (dtype), ``Nunique`` (number of distinct non-null values),
        ``#of Missing`` (number of nulls), ``MostFreqItem`` and
        ``MostFreqCount`` (most frequent non-null value and its frequency;
        NaN and 0 when the column holds no values) and ``First`` (value in
        the first row; NaN when ``df`` has no rows).
    """
    dtypes = df.dtypes.rename("Type")
    n_unique = df.nunique().rename("Nunique")
    n_missing = df.isnull().sum().rename("#of Missing")

    # Count each column once and read both the top item and its frequency from it.
    top_items, top_counts = [], []
    for column in df.columns:
        counts = df[column].value_counts()
        if counts.empty:
            # All-null (or empty) column: there is no most frequent value.
            top_items.append(np.nan)
            top_counts.append(0)
        else:
            top_items.append(counts.index[0])
            top_counts.append(counts.iloc[0])
    most_frequent = pd.DataFrame(
        {"MostFreqItem": top_items, "MostFreqCount": top_counts},
        index=df.columns,
    )

    # Select the first row by position so the label of the first index entry does not matter.
    if len(df) > 0:
        first_row = df.iloc[0].rename("First")
    else:
        first_row = pd.Series(np.nan, index=df.columns, name="First")

    return pd.concat([dtypes, n_unique, n_missing, most_frequent, first_row], axis=1)

compr_info(df)

In [None]:
df.isna().sum()

In [None]:
 df.isna().sum().sum()

#### ***Although these columns contain no NaN, some rows may hold other placeholder expressions that stand for a missing value.***

In [None]:
pip install colorama

In [None]:
# Scan the whole dataframe for placeholder values that consist only of symbols.

from colorama import init, Fore, Back, Style

# A value matches when every character is something other than a letter, digit, space, dot or comma.
pattern = re.compile(r"^[^a-zA-Z0-9 .,]+$") 

def matched_re(df, regex=pattern):
    """
    Print, column by column, how many values look like symbol-only placeholders.

    Every value is converted with ``str`` and tested with ``regex.search``.
    Columns with at least one match get a highlighted line that lists the
    distinct matched values in sorted order; all other columns get a plain
    zero-count line. A highlighted grand total is printed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    regex : compiled regular expression, optional
        Pattern to search for. Defaults to the placeholder pattern compiled
        above; the default is bound when the function is defined, so
        re-assigning ``pattern`` later does not change it.

    Returns
    -------
    None
    """
    total_matches = 0
    for column in df.columns:
        matched_values = [value for value in df[column] if regex.search(str(value))]
        column_matches = len(matched_values)
        total_matches += column_matches
        if column_matches > 0:
            # str() keeps the join safe for non-string cells; sorting makes the printed list reproducible.
            distinct_values = sorted({str(value) for value in matched_values})
            print(f"{Back.YELLOW + Fore.BLACK} Matched values in column {column}: {column_matches} times '{', '.join(distinct_values)}' {Style.RESET_ALL}")
        else:
            print(f"Total matches in column {column}: {column_matches}")
    print(f"{Back.YELLOW + Fore.BLACK}Total matches in dataframe: {total_matches} {Style.RESET_ALL}")

In [None]:
matched_re(df)

In [None]:
df.isna().sum().sum() 

In [None]:
60071+29348

#### *The total number of missing values is 60071 + 29348 = 89419, which is large enough to be a problem we have to deal with. First, I will replace the values that were not entered and appear as underscores or similar symbols with NaN to make them easier to handle.*

In [None]:
df.loc[(df["Changed_Credit_Limit"].str.contains(pattern,regex=True)==True)].head(3)

In [None]:
df.loc[(df["Changed_Credit_Limit"].str.contains(pattern,regex=True)==True),"Changed_Credit_Limit"] = np.nan

In [None]:
df.loc[(df["Changed_Credit_Limit"].str.contains(pattern,regex=True)==True),"Changed_Credit_Limit"].count()

In [None]:
df.Occupation.value_counts(dropna=False)

In [None]:
# Again for checking if the created pattern matches in only Occupation feature

for text in df.Occupation.value_counts().index:
    if re.search(pattern, text):
        print("Matched value:", text)
    else:
        print("Unmatched value:", text)

In [None]:
df['Occupation'] = df['Occupation'].replace(pattern,np.nan, regex=True)

In [None]:
df.Occupation.value_counts(dropna=False)

In [None]:
df['Credit_Mix'].value_counts(dropna=False)

In [None]:
df['Credit_Mix'] = df['Credit_Mix'].replace(pattern,np.nan, regex=True)

In [None]:
df['Credit_Mix'].value_counts(dropna=False)

In [None]:
matched_re(df) # let's double check all dataframe for empty value

In [None]:
df.isna().sum()

In [None]:
df.columns = df.columns.str.lower()
df.columns

In [None]:
df.shape

In [None]:
null_columns = df.columns[df.isnull().any()]
df[null_columns].isnull().sum().sort_values(ascending=False)

In [None]:
df["credit_mix"].value_counts(dropna=False)

In [None]:
df[df["credit_mix"].isna()][null_columns].isna().sum().sort_values(ascending=False)

#### *Based on the above output, we can say that the values in 10 columns are missing for 236 rows in the data set.*

#### ***The codes below are to investigate whether I can benefit from the information in other columns on how to fill in the missing data.***

In [None]:
indices = df[null_columns].isnull().sum().sort_values(ascending=False).index

In [None]:
# Null count per column over the whole frame, and over only the rows where credit_mix is missing.
# Both are kept as Series indexed by column name so they are aligned by column
# (sorting them independently and zipping the values would pair counts of different columns).
total_nulls = df[null_columns].isnull().sum()
nulls_when_credit_mix_missing = df.loc[df["credit_mix"].isna(), null_columns].isna().sum()

missing_data = pd.DataFrame(
    {
        "total_null_count": total_nulls,
        "based_on_credit_mix": nulls_when_credit_mix_missing,
        # Share of a column's nulls that sit in rows where credit_mix is missing as well.
        "rate": nulls_when_credit_mix_missing / total_nulls,
    }
).sort_values("total_null_count", ascending=False)

# The list variables of the original cell are kept, now in the same (aligned) row order as the table.
col_total = missing_data["total_null_count"].tolist()
based_on_cr_mix = missing_data["based_on_credit_mix"].tolist()
rate = missing_data["rate"].tolist()

missing_data

In [None]:
df["credit_mix"].isna().sum()

In [None]:
df[df["customer_id"]=="CUS_0x10fd"]

In [None]:
df["customer_id"].nunique()

#### *Yes! I can get the information I need from the "customer_id" feature. There are 12500 unique customer_ids in the 100000-row data set. Since this column contains no NaN values, each customer_id repeats across several rows, so I can fill the NaN values in a customer's rows with the values from that customer's complete rows.*

In [None]:
unique_customer_ids = df["customer_id"].unique()

In [None]:
# Fill every gap from the same customer's own rows: forward fill first, then
# backward fill for gaps at the start of a customer's block.
# One grouped pass replaces a full-frame boolean mask per customer id.
# Assumes customer_id itself has no missing values (groupby skips NaN keys).
value_columns = list(df.columns.drop("customer_id"))
forward_filled = df.groupby("customer_id")[value_columns].ffill()
df[value_columns] = forward_filled.groupby(df["customer_id"]).bfill()

In [None]:
df.isna().sum()

In [None]:
df[df["customer_id"]=="CUS_0xd40"]

In [None]:
df[df["customer_id"]=="CUS_0x93ff"]

In [None]:
df[["type_of_loan"]].sample(20)

In [None]:
# Converts the "credit history age" text into a number so the column can be used numerically.
# Example: "15 Years and 5 Months" becomes 185 (months).
def credit_history(duration):
    """
    Convert a duration such as "15 Years and 5 Months" into a total number of months.

    Parameters
    ----------
    duration : str or missing value
        Text of the form "<years> Years and <months> Months" (singular
        "Year" / "Month" is accepted too), or None / NaN.

    Returns
    -------
    int or float
        Total months as an int; NaN when ``duration`` is missing.

    Raises
    ------
    TypeError
        If ``duration`` is neither missing nor a string.
    ValueError
        If the string does not follow the expected format.
    """
    if not isinstance(duration, str):
        # Missing values stay missing instead of crashing the whole apply().
        if pd.isna(duration):
            return np.nan
        raise TypeError(f"Expected a text duration, got {type(duration).__name__}: {duration!r}")

    parsed = re.match(r"\s*(\d+)\s+Years?\s+and\s+(\d+)\s+Months?", duration)
    if parsed is None:
        raise ValueError(f"Unrecognised credit history age: {duration!r}")
    years, months = parsed.groups()
    total_months = int(years) * 12 + int(months)
    return total_months

In [None]:
df["credit_history_age"] = df["credit_history_age"].apply(credit_history) 
df["credit_history_age"].head()

In [None]:
df.sample(5)

In [None]:
df.age = df.age.apply(lambda x : x.rstrip("_").lstrip("_"))

In [None]:
df["amount_invested_monthly"]=df["amount_invested_monthly"].apply(lambda x : x.rstrip("__").lstrip("__"))

In [None]:
df["amount_invested_monthly"]=df["amount_invested_monthly"].apply(lambda x : round(float(x),3))

In [None]:
df["monthly_balance"]=df["monthly_balance"].apply(lambda x : str(x).lstrip("__").rstrip("__"))

In [None]:
df["monthly_balance"]=df["monthly_balance"].apply(lambda x : round(float(x),3))

In [None]:
df.head()

In [None]:
compr_info(df)

### Monthly_balance feature

In [None]:
df["monthly_balance"].describe()

In [None]:
df.shape

In [None]:
# Quartiles are computed from the data instead of being copied from a describe() printout,
# so the fence stays correct if the data or an earlier cleaning step changes.
q1 = df["monthly_balance"].quantile(0.25)
q3 = df["monthly_balance"].quantile(0.75)
iqr = q3 - q1 
# Lower fence: two interquartile ranges below the first quartile.
lower_limit = q1-(2 * iqr)
# No upper fence is applied; the observed maximum is kept as the top limit.
top_limit = df["monthly_balance"].max()

In [None]:
lower_limit

In [None]:
top_limit

In [None]:
 df.loc[(df["monthly_balance"] < lower_limit)][["customer_id","month","monthly_balance"]]

In [None]:
df.loc[df["customer_id"]=="CUS_0x9885"][["customer_id","month","monthly_balance"]]

In [None]:
# (customer_id, month) entries whose monthly_balance is overwritten below
# (presumably the rows returned by the `monthly_balance < lower_limit` query above; verify if the data changes).
outlier_entries = [
    ("CUS_0x9885", "February"),
    ("CUS_0x5a90", "February"),
    ("CUS_0x288d", "July"),
    ("CUS_0x85e9", "March"),
    ("CUS_0x2b77", "July"),
    ("CUS_0xc06e", "February"),
    ("CUS_0x57f3", "April"),
    ("CUS_0x41bf", "July"),
    ("CUS_0x2f7e", "August"),
]

# Replace each listed balance with the mean balance of the same customer's other months.
for outlier_customer, outlier_month in outlier_entries:
    is_customer = df["customer_id"] == outlier_customer
    is_month = df["month"] == outlier_month
    other_months_mean = df.loc[is_customer & ~is_month, "monthly_balance"].mean()
    df.loc[is_customer & is_month, "monthly_balance"] = other_months_mean

In [None]:
 df.loc[[5545,26177,29158,35570,38622,60009,75251,82918,83255]][["customer_id","month","monthly_balance"]]

In [None]:
df["monthly_balance"].isna().sum()

In [None]:
df.isna().sum()

### Age Feature

In [None]:
df.age.astype("int").describe()

In [None]:
df.age = df.age.astype("int")

In [None]:
df.age.dtype

In [None]:
len(df.loc[df["age"] > 56])

In [None]:
len(df.loc[df["age"] < 14])

In [None]:
df.loc[df["age"] < 14].age.describe()

In [None]:
df.age.describe()

In [None]:
df.loc[df["age"] == 14 ].sample(3)

In [None]:
df.loc[df["age"] > 56].age.describe()

In [None]:
df.loc[df["age"] == 4380.000]

In [None]:
df.loc[df["customer_id"] == "CUS_0x1f2e"]

In [None]:
len(df[df["age"].apply(lambda x: len(str(int(x))) > 2)])

In [None]:
df.loc[df["age"].apply(lambda x: len(str(int(x))) > 2),"age"]=np.nan

In [None]:
df.age.isna().sum()

In [None]:
df[df["customer_id"]=="CUS_0x3861"]

In [None]:
nan_customer_ids = df.loc[df['age'].isna(), 'customer_id'].unique()

In [None]:
len(nan_customer_ids)

In [None]:
# Fill each missing age with the first valid age recorded for the same customer.
# GroupBy "first" returns the first non-null value per group; a customer without any
# valid age simply stays NaN instead of raising IndexError.
first_valid_age = df.groupby("customer_id")["age"].transform("first")
df["age"] = df["age"].fillna(first_valid_age)

In [None]:
df.age.isna().sum()

In [None]:
df.age.describe()

In [None]:
age_max_idx = df.loc[df["age"]>56].index
age_max_idx

In [None]:
df.drop(age_max_idx,inplace=True)

In [None]:
df.age.describe()

### Payment Behaviour feature

In [None]:
df["payment_behaviour"].value_counts()

In [None]:
pattern = re.compile(r"\W") 

In [None]:
# Scan the whole dataframe for placeholder values that consist only of symbols.
# NOTE(review): this cell repeats the `matched_re` definition from the missing-value section
# earlier in the notebook; keep the two in sync or delete one of them.

from colorama import init, Fore, Back, Style

# `pattern` was re-assigned to r"\W" in the previous cell, so the placeholder regex is
# spelled out in the default argument instead of being read from `pattern`.
def matched_re(df, regex=re.compile(r"^[^a-zA-Z0-9 .,]+$")):
    """
    Print, column by column, how many values look like symbol-only placeholders.

    Every value is converted with ``str`` and tested with ``regex.search``.
    Columns with at least one match get a highlighted line that lists the
    distinct matched values in sorted order; all other columns get a plain
    zero-count line. A highlighted grand total is printed last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan.
    regex : compiled regular expression, optional
        Pattern to search for. Defaults to a pattern matching values made up
        only of characters other than letters, digits, spaces, dots and commas.

    Returns
    -------
    None
    """
    total_matches = 0
    for column in df.columns:
        matched_values = [value for value in df[column] if regex.search(str(value))]
        column_matches = len(matched_values)
        total_matches += column_matches
        if column_matches > 0:
            # str() keeps the join safe for non-string cells; sorting makes the printed list reproducible.
            distinct_values = sorted({str(value) for value in matched_values})
            print(f"{Back.YELLOW + Fore.BLACK} Matched values in column {column}: {column_matches} times '{', '.join(distinct_values)}' {Style.RESET_ALL}")
        else:
            print(f"Total matches in column {column}: {column_matches}")
    print(f"{Back.YELLOW + Fore.BLACK}Total matches in dataframe: {total_matches} {Style.RESET_ALL}")

In [None]:
for text in df["payment_behaviour"].value_counts().index:
    if re.search(pattern, text):
        print("Matched value:", text)
    else:
        print("Unmatched value:", text)

In [None]:
df.loc[df["payment_behaviour"]=="!@9#%8"]["customer_id"].nunique()

In [None]:
df.loc[df["payment_behaviour"]=="!@9#%8"]["customer_id"].sample(3)

In [None]:
df.loc[df["customer_id"]=="CUS_0xc9f"]

In [None]:
df.loc[df["customer_id"]=="CUS_0xc9f"]["payment_behaviour"].unique()

In [None]:
df.loc[df["customer_id"]=="CUS_0xc9f"]["payment_behaviour"].value_counts()

In [None]:
df.loc[df["customer_id"]=="CUS_0xb201"]["payment_behaviour"].value_counts()

In [None]:
undetermined_idx = df.loc[df["payment_behaviour"]=="!@9#%8"]["customer_id"].unique()

In [None]:
# Replace the "!@9#%8" placeholder with the most frequent valid payment_behaviour of the same customer.
is_placeholder = df["payment_behaviour"] == "!@9#%8"

# Most frequent non-placeholder value per customer; mode() sorts ties, so the first entry is
# taken, as before. NaN when a customer has no valid value.
customer_mode = (
    df.loc[~is_placeholder]
    .groupby("customer_id")["payment_behaviour"]
    .agg(lambda values: values.mode().iloc[0] if values.notna().any() else np.nan)
)

# Customers whose rows are all placeholders have no mode to copy: their rows are left unchanged
# instead of raising an error.
replacement = df.loc[is_placeholder, "customer_id"].map(customer_mode).dropna()
df.loc[replacement.index, "payment_behaviour"] = replacement
              
    

#### *I want to see the results of the changes I made*

In [None]:
df.loc[df["customer_id"]=="CUS_0xc9f"]["payment_behaviour"].unique()

In [None]:
df.loc[df["customer_id"]=="CUS_0xb201"]["payment_behaviour"].value_counts()

In [None]:
df.loc[df["customer_id"]=="CUS_0xc9f"]["payment_behaviour"].value_counts()

In [None]:
df.loc[df["customer_id"]=="CUS_0x4437"]["payment_behaviour"].value_counts()

In [None]:
for text in df["payment_behaviour"].value_counts().index:
    if re.search(pattern, text):
        print("Matched value:", text)
    else:
        print("Unmatched value:", text)

### Type Of Loan Feature

In [None]:
df["type_of_loan"].isna().sum()

In [None]:
df.loc[df["occupation"]=="Writer"]["type_of_loan"].nunique()

In [None]:
df.loc[df["type_of_loan"].notna()]["num_of_loan"].value_counts()

##### ***Let's see if the value in the "num of loan" column is related to the number of elements of the list in "type of loan"***

In [None]:
df[df["num_of_loan"] == "3"]["type_of_loan"].sample(5)

In [None]:
df.loc[[53636]]["type_of_loan"].tolist()

In [None]:
df.loc[[23728]]["type_of_loan"].tolist()

#### ***As seen above, the value in the "num_of_loan" feature equals the number of elements in the list in the "type_of_loan" feature.***

In [None]:
df[df["num_of_loan"] == "1150"]

In [None]:
df["num_of_loan"] = df["num_of_loan"].apply(lambda x : x.rstrip("_").lstrip("-"))

In [None]:
df["num_of_loan"] = df["num_of_loan"].astype("int")

In [None]:
df.loc[df["type_of_loan"].isna()]["num_of_loan"].value_counts()

In [None]:
df.loc[df["type_of_loan"].notna(),"type_of_loan"] = df.loc[df["type_of_loan"].notna()]["type_of_loan"].apply(lambda x : x.split(","))

### Fill each row with the number of elements in the "Type_of_Loan" list

In [None]:
# Set num_of_loan to the number of entries in the row's type_of_loan list.
# Only type_of_loan decides whether a row is updated: a missing value in an unrelated
# column no longer causes the row to be skipped.
has_loan_list = df["type_of_loan"].notna()
df.loc[has_loan_list, "num_of_loan"] = df.loc[has_loan_list, "type_of_loan"].str.len().astype(int)

In [None]:
# Assign the result back: calling replace(..., inplace=True) on a column selection acts on a
# temporary object and leaves `df` unchanged when pandas Copy-on-Write is active.
df["num_of_loan"] = df["num_of_loan"].replace(100, 0)

In [None]:
df.loc[(df["type_of_loan"].isna()) & (df["num_of_loan"]==0),"type_of_loan"] = "new_credit"

In [None]:
df.loc[df["type_of_loan"].notna()]["num_of_loan"].unique()

In [None]:
df.loc[df["type_of_loan"].isna()]["num_of_loan"].nunique()

In [None]:
df["type_of_loan"].isna().sum()

#### I tried to correct the meaningless 2-, 3- and 4-digit outlier values in the "num_of_loan" column using the "type_of_loan" column. However, I could not find a way to correct the rows where "type_of_loan" is NaN and "num_of_loan" holds such an outlier. There are 61 such rows in the 100,000-row data set, so I decided to disregard these rows and drop them.

In [None]:
nan_idx = df.loc[df["type_of_loan"].isna()].index

In [None]:
df.drop(nan_idx,inplace =True)

In [None]:
df["num_of_loan"].unique()

In [None]:
df["type_of_loan"].isna().sum()

In [None]:
# Remove leading and trailing underscores from these text columns.
# The .str accessor leaves missing values as NaN instead of raising on them.
for column in ["annual_income", "num_of_delayed_payment", "outstanding_debt"]:
    df[column] = df[column].str.strip("_")

In [None]:
df.drop(["id","customer_id","name","ssn","type_of_loan"],inplace =True,axis =1)

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
matched_re(df)

#### ***Last version of dataset***

In [None]:
# NOTE(review): `df` is replaced here by a previously exported CSV, so the cleaning applied to `df`
# in the cells above is discarded from this point on. The step that wrote this file is not part of
# the visible notebook; confirm the file matches the cleaning steps above.
df = pd.read_csv("/kaggle/input/last-version-of-dataset/dataset_final_30_04.csv") 

In [None]:
df.isna().sum()

In [None]:
df.drop(["id","customer_id","name","ssn","type_of_loan"],axis=1,inplace = True)

In [None]:
df.sample(10)

In [None]:
# Removes every underscore from every string cell of the frame (the regex "_" is not anchored).
# NOTE(review): this also deletes underscores inside otherwise valid labels, not only leading or
# trailing placeholder underscores. Confirm that this is intended.
df = df.replace("_","",regex=True)

In [None]:
df.loc[[67491]]

In [None]:
df.shape

In [None]:
df.sample(10)