In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm

In [None]:
## Modeling dataset: read the CSV from a path relative to the working directory

DATASET_PATH = 'Modeling dataset.csv'
df = pd.read_csv(DATASET_PATH)

# Preview the first rows as the cell output
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
# Descriptive statistics produced by pandas' default describe()
df.describe()

In [None]:
## Remove the 'id' column from the working frame

df = df.drop(columns=['id'])

In [None]:
## Numeric columns in dataset

import matplotlib.pyplot as plt
import seaborn as sns


# Names of the columns stored as int64 or float64
numeric_subset = df.select_dtypes(include=['int64', 'float64'])
numeric_cols = numeric_subset.columns
numeric_cols


In [None]:
# One histogram per numeric column, drawn in a single figure

numeric_view = df[numeric_cols]
numeric_view.hist(bins=15, figsize=(15, 10))
plt.suptitle('Histograms of Numerical Features')
plt.show()

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Frequency of each distinct 'loan_status' value
df['loan_status'].value_counts()

In [None]:
# Bar chart of how often each loan_status value occurs

column_name = 'loan_status'
value_counts = df[column_name].value_counts()

ax = value_counts.plot(kind='bar', figsize=(10, 6))
ax.set_title(f'Value Counts of {column_name}')
ax.set_xlabel(column_name)
ax.set_ylabel('Counts')

plt.show()


In [None]:
## Binary target: 1 when loan_status is 'Current' or 'Fully Paid', 0 for every other value

positive_statuses = ['Current', 'Fully Paid']
df['target'] = df['loan_status'].isin(positive_statuses).astype('int64')

In [None]:
# 'loan_status' has been encoded into 'target', so the raw column is removed
df = df.drop(columns=['loan_status'])

In [None]:
# Preview the frame after replacing 'loan_status' with 'target'
df.head()

In [None]:
# Parse 'earliest_cr_line' into datetimes; errors='coerce' turns unparseable values into NaT
parsed_cr_line = pd.to_datetime(df['earliest_cr_line'], errors='coerce')
df['earliest_cr_line'] = parsed_cr_line

# Confirm the resulting dtype and look at a few converted values
print(f"Data type after conversion: {df['earliest_cr_line'].dtype}")
print(df['earliest_cr_line'].head())

In [None]:
earliest_cr_line = df['earliest_cr_line']

# Range of dates present in the column
min_date, max_date = earliest_cr_line.min(), earliest_cr_line.max()
print(f"Minimum date: {min_date}")
print(f"Maximum date: {max_date}")

# Histogram of the calendar year of each date
years = earliest_cr_line.dt.year

fig, ax = plt.subplots(figsize=(8, 4))
years.hist(bins=30, edgecolor='black', ax=ax)
ax.set_title('Distribution of Earliest Credit Line Years')
ax.set_xlabel('Year')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Credit age in years: time elapsed between each earliest credit line date and "now"
# NOTE(review): the reference point is the moment this cell runs, so the feature
# (and anything fitted on it) changes from run to run.
from datetime import datetime
current_date = datetime.now()

elapsed = current_date - df['earliest_cr_line']
df['credit_age_years'] = elapsed.dt.days / 365.25

# The raw date column is replaced by the derived numeric feature
df = df.drop(columns=['earliest_cr_line'])

df.head()


In [None]:
## PURPOSE - inspect the categories before they are grouped and one-hot encoded

# Frequency of each distinct 'purpose' value
df['purpose'].value_counts()


In [None]:

column_name = 'purpose'

if df[column_name].dtype != 'object':
    # Non-object column: one bar per row, plotted against the index
    ax = df[column_name].plot(kind='bar', figsize=(10, 6))
    ax.set_title(f'Bar Graph of {column_name}')
    ax.set_xlabel('Index')
    ax.set_ylabel(column_name)
else:
    # Object column: one bar per distinct value, height = number of occurrences
    value_counts = df[column_name].value_counts()
    ax = value_counts.plot(kind='bar', figsize=(10, 6))
    ax.set_title(f'Value Counts of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Counts')

plt.show()


In [None]:
# Mapping from the detailed 'purpose' values to the broader categories in the form
purpose_mapping = {
    'debt_consolidation': 'debt',
    'credit_card': 'debt',
    'other': 'personal',
    'home_improvement': 'home loan',
    'small_business': 'personal',
    'major_purchase': 'personal',
    'car': 'personal',
    'wedding': 'personal',
    'medical': 'personal',
    'house': 'home loan',
    'moving': 'personal',
    'vacation': 'personal',
    'educational': 'education loan',
    'renewable_energy': 'home loan'
}

# Series.map leaves NaN for any 'purpose' value that is not a key of the mapping
df['broad_purpose'] = df['purpose'].map(purpose_mapping)
df.drop(columns=['purpose'], inplace=True)

# Kept as the last expression so the grouped counts are actually rendered
# (in the middle of the cell the result was computed and discarded)
df['broad_purpose'].value_counts()

In [None]:
# Preview the frame after 'purpose' was replaced by 'broad_purpose'
df.head()

In [None]:
# Performing one-hot encoding on the 'broad_purpose' column (the grouped form of 'purpose');
# each category becomes an integer 0/1 column prefixed with 'broad_purpose'

df = pd.get_dummies(df, columns=['broad_purpose'], prefix='broad_purpose', dtype=int)


df.head()


In [None]:
## DESCRIPTION - extracting some meaningful keywords ['borrow','credit','debt'] from the text and using them as new features

# Number of rows whose 'desc' text is missing
df['desc'].isnull().sum()

In [None]:
# text preprocessing for desc

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Resources needed below: the stopword corpus and the Punkt tokenizer data
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data used by word_tokenize from
# 'punkt_tab' instead of 'punkt'; fetching both covers old and new versions.
nltk.download('punkt_tab')



# Replacing NaN values in the 'desc' column with an empty string
df['desc_new'] = df['desc'].fillna('')

# English stopwords, loaded once. The list used to be rebuilt through
# stopwords.words('english') for every single token and then scanned linearly;
# a frozenset gives constant-time membership tests.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# function to preprocess text
def preprocess_text(text):
    """Normalise a free-text description into a space-separated token string.

    Steps: lowercase, replace the literal ``<br/>`` with a space, strip every
    character that is neither a word character nor whitespace, tokenize with
    ``word_tokenize`` and drop English stopwords.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` yields an empty string.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'<br/>', ' ', text)  # HTML line breaks become spaces
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in ENGLISH_STOPWORDS)

# Clean every entry of 'desc_new' (the NaN-free copy of 'desc') element by element
df['cleaned_description'] = df['desc_new'].map(preprocess_text)



In [None]:
# Show every column when a frame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

In [None]:
# Define the predefined words: each one becomes a 0/1 feature column
# computed from the cleaned description text
predefined_words = ['borrow', 'credit', 'debt']

# Presence flags for a fixed list of words
def extract_features(tokens, words):
    """Map each entry of ``words`` to 1 if ``word in tokens`` holds, else 0.

    Membership is evaluated with Python's ``in`` operator, so the meaning
    depends on the type of ``tokens``: for a string it is a substring test
    (``'borrow'`` matches ``'borrowed'``), for a list it is an element test.

    Parameters
    ----------
    tokens : str or container
        Text (or collection of tokens) to search.
    words : iterable of str
        Words to look for; they become the keys of the result.

    Returns
    -------
    dict
        ``{word: 0 or 1}`` in the order of ``words``.
    """
    return {word: int(word in tokens) for word in words}

# Per row, flag which predefined words occur in the cleaned description
df_features = df['cleaned_description'].apply(lambda x: extract_features(x, predefined_words))

# Expand the per-row dicts into one column per word. The frame reuses df's index:
# concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would misalign
# rows (and append NaN rows) whenever df's index is not the default range.
features_df = pd.DataFrame(list(df_features), index=df.index)

# Concatenate with the original dataframe
df_final = pd.concat([df, features_df], axis=1)


In [None]:
# Display the combined frame (original columns plus the keyword flags)
df_final

In [None]:
# The raw and intermediate text columns are no longer needed once the keyword flags exist
text_columns = ['desc', 'desc_new', 'cleaned_description']
df_final = df_final.drop(columns=text_columns)

df_final

In [None]:
## Adding debt_to_income_ratio as a new feature
## (computed as requested loan amount divided by annual income)

# NOTE(review): a row with annual_inc == 0 would produce inf (or NaN) here — confirm none exist.
df_final['debt_to_income_ratio'] = df_final['loan_amnt'] / df_final['annual_inc']

In [None]:
# Preview the frame with the new ratio column
df_final.head()

In [None]:
# Per-column dtype and non-null count summary of the engineered frame
df_final.info()

In [None]:
# Dimensions of the engineered frame as a (rows, columns) tuple
df_final.shape

In [None]:
target_column = 'target'
column_name = target_column

# Class counts come from df_final, the frame the model is trained on. The plot,
# the counts and the total previously mixed df and df_final.
value_counts = df_final[target_column].value_counts()

ax = value_counts.plot(kind='bar', figsize=(6, 4))
ax.set_title(f'Value Counts of {column_name}')
ax.set_xlabel(column_name)
ax.set_ylabel('Counts')

plt.show()

# Calculating the total number of rows
total_count = len(df_final)

# .get(..., 0) reports a class that is absent as 0 instead of raising KeyError
count_0 = value_counts.get(0, 0)
count_1 = value_counts.get(1, 0)

# Calculating the percentages
percentage_0 = (count_0 / total_count) * 100
percentage_1 = (count_1 / total_count) * 100

#  values and percentages
print(f"Number of 1s: {count_1} ({percentage_1:.2f}%)")
print(f"Number of 0s: {count_0} ({percentage_0:.2f}%)")



In [None]:
# Pairwise correlation matrix of df_final's columns (pandas' default method)
d1=df_final.corr()
d1

In [None]:
## Plotting the heatmap of the correlation matrix d1, with each cell annotated by its value

# A large canvas keeps the per-cell annotations legible
plt.figure(figsize=(25,25))
sns.heatmap(data=d1,yticklabels=True,cbar=True,annot=True,cmap='viridis')


In [None]:
## Libraries used

import pandas as pd
import numpy as np
import json
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

In [None]:
# Custom identity transformer for binary features

def identity_transform(x):
    """Return ``x`` unchanged.

    Passed to ``FunctionTransformer`` in the preprocessing ``ColumnTransformer``
    so the non-scaled columns go through as they are.

    NOTE(review): the fitted pipeline is later saved with joblib, so this function
    presumably has to be importable wherever that file is loaded — confirm.
    """
    return x
    
# Separate the label from the predictors
target_column = 'target'
y = df_final[target_column]
X = df_final.drop(target_column, axis=1)


In [None]:
# Preview of the frame that X and y were taken from
df_final.head()

In [None]:
# Columns that get standard-scaled; every other column of X is passed through unscaled

features_for_scaling = [
    'loan_amnt',
    'emp_length',
    'annual_inc',
    'delinq_2yrs',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'open_acc',
    'revol_bal',
    'revol_util',
    'total_acc',
    'credit_age_years',
    'debt_to_income_ratio',
]

# Remaining columns, kept in the order they appear in X
scaled_lookup = set(features_for_scaling)
binary_features = [col for col in X.columns if col not in scaled_lookup]


In [None]:
# Splitting the data into training (80%) and testing (20%) sets.
# stratify=y keeps the class proportions of the target equal in both parts, so the
# held-out metrics are not skewed by an unlucky draw of the rarer class.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# ColumnTransformer that standard-scales the listed features and forwards the rest unchanged

scaling_step = ('num', StandardScaler(), features_for_scaling)

# Explicit pass-through for the remaining columns. Original author's note: removing
# this entry should give the same result, since remainder='passthrough' forwards them too.
passthrough_step = ('bin', FunctionTransformer(identity_transform, validate=False), binary_features)

preprocessor = ColumnTransformer(
    transformers=[scaling_step, passthrough_step],
    remainder='passthrough'
)

In [None]:
# pipeline with ColumnTransformer, SMOTETomek, and LogisticRegression
#
# The preprocessor runs BEFORE the resampler. SMOTE and Tomek links both rely on
# nearest-neighbour distances; on unscaled data those distances are dominated by the
# large-magnitude columns, so the resampling step is given the scaled features.
# The imblearn Pipeline applies the sampler during fit only; prediction skips it.

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smotetomek', SMOTETomek(sampling_strategy='auto', random_state=42)),
    ('logreg', LogisticRegression(penalty='l2', solver='liblinear'))
])

# Define parameter grid for GridSearch: inverse regularisation strength of the classifier
param_grid = {
    'logreg__C': [0.01, 0.1, 1, 10, 100]
}

In [None]:
# 5-fold cross-validated grid search over param_grid, selecting by F1 score

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
grid_search.fit(X_train, y_train)


In [None]:
# Extracting the best parameters from GridSearch
# (the param_grid entry with the highest mean cross-validated F1)

print("Best parameters:", grid_search.best_params_)


In [None]:
# Evaluate the refit best pipeline on the held-out test split

y_pred = grid_search.predict(X_test)

# Metrics at the classifier's default decision rule
f1 = f1_score(y_test, y_pred, average='binary')
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

for heading, metric in (
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", report),
):
    print(heading, metric)

In [None]:
# Accessing the fitted logistic-regression step of the best pipeline found by the grid search
# (its C value is whatever grid_search.best_params_ reported, not a fixed number)

best_logreg = grid_search.best_estimator_.named_steps['logreg']

# Getting the coefficients and intercept
# coef_ and intercept_ are indexed with [0] to take the single row / value used for this binary target

coefficients = best_logreg.coef_[0]
intercept = best_logreg.intercept_[0]


In [None]:


# From here on `preprocessor` refers to the FITTED step inside the best pipeline
preprocessor = grid_search.best_estimator_.named_steps['preprocessor']

# Each fitted entry is a (name, transformer, columns) triple: entry 0 is the scaled
# group, entry 1 the pass-through group
num_entry, bin_entry = preprocessor.transformers_[0], preprocessor.transformers_[1]
scaled_feature_names = num_entry[2]  # Features that were scaled
binary_feature_names = bin_entry[2]  # Features that were not scaled

# Output columns follow the transformer order, which is also the order of the coefficients
feature_names = [*scaled_feature_names, *binary_feature_names]
coef_dict = dict(zip(feature_names, coefficients))


In [None]:
## Show the fitted intercept and per-feature coefficients

print(f"Intercept: {intercept}")
print("Coefficients:")
for feature in coef_dict:
    print(f"{feature}: {coef_dict[feature]}")

# Persist intercept and coefficients as JSON so the backend can use them
weights = dict(intercept=intercept, coefficients=coef_dict)

weights_file = 'model_weights_oversampler_new.json'
with open(weights_file, 'w') as out_handle:
    json.dump(weights, out_handle)

print(f"Weights saved to {weights_file}")


In [None]:
# Persist the whole fitted pipeline

model_file = 'credit_score_model_oversampler_new2.pkl'
joblib.dump(grid_search.best_estimator_, model_file)
print(f"Model saved to {model_file}")


# Persist the per-feature mean and scale of the fitted StandardScaler for the backend

scaler = preprocessor.named_transformers_['num']
per_feature_stats = {
    feature: {'mean': mean, 'scale': scale}
    for feature, mean, scale in zip(features_for_scaling, scaler.mean_, scaler.scale_)
}
scaler_params = {
    'name': 'StandardScaler',
    'features': per_feature_stats
}

scaler_params_file = 'scaler_params_new_2_final.json'
with open(scaler_params_file, 'w') as out_handle:
    json.dump(scaler_params, out_handle)

print(f"Scaler parameters saved to {scaler_params_file}")

# Names of the features the scaler was applied to
print("Scaled features:", features_for_scaling)

In [None]:
## Examining the features after standard scaling

# `preprocessor` is the fitted step that lives inside grid_search.best_estimator_.
# Use transform(), not fit_transform(): refitting here would overwrite the scaling
# statistics of the trained pipeline in place, so later predictions from
# grid_search would no longer match the saved model.
X_train_scaled = preprocessor.transform(X_train)
print("Samples after scaling:")
print(pd.DataFrame(X_train_scaled, columns=feature_names).head())

In [None]:
# THRESHOLD TUNING

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

# Predict probabilities with the FULL fitted pipeline. The bare logistic-regression
# step was trained on the preprocessor's output (scaled columns first, then the
# pass-through columns); handing it raw X_test would feed unscaled values in a
# different column order and produce meaningless probabilities.
best_pipeline = grid_search.best_estimator_
y_probs = best_pipeline.predict_proba(X_test)[:, 1]

# Define a range of thresholds to test
thresholds = np.linspace(0.1, 0.9, 9)

# Store performance metrics for each threshold
# NOTE(review): the threshold is selected on X_test itself, so the final report below
# is optimistic; a separate validation split would give an unbiased estimate.
results = []

for threshold in thresholds:
    # Apply the threshold to make binary predictions
    y_decision = (y_probs >= threshold).astype(int)

    # Evaluate the performance for class 1
    report = classification_report(y_test, y_decision, output_dict=True)
    class_1 = report['1']
    results.append({
        'threshold': threshold,
        'precision_1': class_1['precision'],
        'recall_1': class_1['recall'],
        'f1-score_1': class_1['f1-score'],
        'support_1': class_1['support']
    })

# Convert results to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

# Find the best threshold based on the highest F1 score for class 1
best_threshold = results_df.loc[results_df['f1-score_1'].idxmax(), 'threshold']
print(f'Best threshold: {best_threshold}')

# Reapply the best threshold to make final predictions
y_decision_best = (y_probs >= best_threshold).astype(int)

# Evaluate the model with the best threshold
print(classification_report(y_test, y_decision_best))


In [None]:
import joblib
import json
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Load the saved model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files written by this notebook or another trusted source.
model = joblib.load('credit_score_model_oversampler_new2.pkl')

# Predict probabilities on the test data using the loaded model
# Note: You need to have X_test and y_test available or reload them if needed
# Column 1 of predict_proba holds the probability of class 1
y_proba_loaded = model.predict_proba(X_test)[:, 1]

# Function to generate credit score based on prediction probability
def generate_credit_score(proba, scale=1000):
    """Convert a predicted probability into a credit score by linear scaling.

    Parameters
    ----------
    proba : float or array-like supporting ``*``
        Predicted probability (or array of probabilities).
    scale : int or float, optional
        Multiplier applied to ``proba``; this is the score that a probability of
        1.0 maps to. Defaults to 1000, the previously hard-coded value.

    Returns
    -------
    Same type as ``proba * scale``
        The scaled score(s).
    """
    return proba * scale

# Credit scores for the test set, derived from the loaded model's probabilities
credit_scores = generate_credit_score(y_proba_loaded)

# Candidate approval cut-offs: 40 evenly spaced scores from 100 to 900
credit_score_thresholds = np.linspace(100, 900, 40)

print("Metrics for different credit score thresholds:")

for threshold in credit_score_thresholds:
    # A loan counts as approved when its score reaches the cut-off
    loan_approval_predictions = credit_scores >= threshold

    # F1 for the positive class, then F1 for each class separately
    f1 = f1_score(y_test, loan_approval_predictions, average='binary')
    f1_per_class = f1_score(y_test, loan_approval_predictions, average=None)
    class_0_f1, class_1_f1 = f1_per_class[0], f1_per_class[1]

    # Confusion matrix at this cut-off (computed but not printed)
    conf_matrix = confusion_matrix(y_test, loan_approval_predictions)

    print(f"\nThreshold = {threshold}")
    print(f"F1 Score (overall) = {f1:.4f}")
    print(f"F1 Score (class 0) = {class_0_f1:.4f}")
    print(f"F1 Score (class 1) = {class_1_f1:.4f}")
