# Loan Prediction

### Developed by:

1. Tiago Pinheiro - 202205295
2. Tiago Rocha    - 202005428
3. Vasco Melo     - 202207564

In [None]:
# Read the raw credit-risk CSV into a DataFrame.
import pandas as pd

raw_csv_path = 'data/credit_risk_dataset.csv'
dataset = pd.read_csv(raw_csv_path)

# Preview the first rows (shown as the cell's output).
dataset.head()

In [None]:
# Report and remove rows that break one of two sanity rules:
#   1. person_age must not exceed 120
#   2. person_emp_length must not exceed person_age
# Boolean masks (rather than concat + drop_duplicates) are used so that
#   - two distinct records with identical values are both reported, and
#   - the rows listed as "to be removed" are exactly the rows removed.
# Comparisons against NaN evaluate to False, so rows with a missing
# person_age / person_emp_length are NOT flagged here; they stay in the
# dataset instead of being dropped silently.
age_too_high = dataset['person_age'] > 120
removed_age_entries = dataset[age_too_high]
print("Entries with person_age > 120:")
print(removed_age_entries)

# Find entries where person_emp_length > person_age
emp_longer_than_age = dataset['person_emp_length'] > dataset['person_age']
removed_emp_length_entries = dataset[emp_longer_than_age]
print("\nEntries with person_emp_length > person_age:")
print(removed_emp_length_entries)

# Combine all removed entries for reference (each row appears once, keyed by
# its position in the dataset rather than by its values)
invalid_rows = age_too_high | emp_longer_than_age
all_removed_entries = dataset[invalid_rows]
print("\nAll entries to be removed:")
print(all_removed_entries)

# Remove invalid entries from the dataset
dataset = dataset[~invalid_rows]

# Display the updated dataset
print("\nDataset after removing invalid entries:")
print(dataset.describe())

In [None]:
# Remove invalid entries from the dataset.
# A row is invalid when person_age > 120 or person_emp_length > person_age.
# The mask is built from the violations themselves (and then negated) because
# a "<=" keep-filter evaluates to False for NaN and would silently discard
# rows whose person_emp_length is merely missing.
invalid_rows = (dataset['person_age'] > 120) | (
    dataset['person_emp_length'] > dataset['person_age']
)
dataset = dataset[~invalid_rows]

# Display the updated dataset
print("\nDataset after removing invalid entries:")
display(dataset.describe())

In [None]:
# Report which columns contain missing values, and how many each one has.
print("Incomplete data (missing values) in the dataset:")

# Per-column count of null cells.
null_counts = dataset.isnull().sum()

# Keep only the columns that actually have at least one null.
missing_data = null_counts[null_counts > 0]

if missing_data.empty:
    print("No missing values found in the dataset.")
else:
    print(missing_data)

In [None]:
# Discard every row that has a missing value in any column.
dataset = dataset.dropna(axis=0, how='any')

# Print the per-column null counts as a check that none remain.
remaining_nulls = dataset.isnull().sum()
print("Dataset after removing rows with missing values:")
print(remaining_nulls)

In [None]:
# Integer code assigned to each 'person_home_ownership' category.
home_ownership_map = {
    'MORTGAGE': 0,
    'RENT': 1,
    'OWN': 2,
    'OTHER': 3
}

# Series.map replaces any value that is not a key of the dict with NaN and
# gives no warning. That would also happen if this cell were run a second
# time on the already-encoded column, so fail loudly instead.
ownership_values = dataset['person_home_ownership']
unmapped_ownership = ownership_values[
    ownership_values.notna() & ~ownership_values.isin(list(home_ownership_map))
]
if not unmapped_ownership.empty:
    raise ValueError(
        "Unmapped 'person_home_ownership' values: "
        f"{sorted(unmapped_ownership.unique(), key=str)}"
    )

# Apply the mapping to the 'person_home_ownership' column
dataset['person_home_ownership'] = ownership_values.map(home_ownership_map)

# Verify the transformation
print("Transformed 'person_home_ownership' column:")
print(dataset['person_home_ownership'].head())

In [None]:
# Integer code assigned to each 'loan_intent' category.
loan_intent_map = {
    'VENTURE': 0,
    'EDUCATION': 1,
    'DEBTCONSOLIDATION': 2,
    'HOMEIMPROVEMENT': 3,
    'MEDICAL': 4,
    'PERSONAL': 5
}

# Series.map replaces any value that is not a key of the dict with NaN and
# gives no warning. That would also happen if this cell were run a second
# time on the already-encoded column, so fail loudly instead.
intent_values = dataset['loan_intent']
unmapped_intents = intent_values[
    intent_values.notna() & ~intent_values.isin(list(loan_intent_map))
]
if not unmapped_intents.empty:
    raise ValueError(
        "Unmapped 'loan_intent' values: "
        f"{sorted(unmapped_intents.unique(), key=str)}"
    )

# Apply the mapping to the 'loan_intent' column
dataset['loan_intent'] = intent_values.map(loan_intent_map)

# Verify the transformation
print("Transformed 'loan_intent' column:")
print(dataset['loan_intent'].head())

In [None]:
# Integer code assigned to each 'loan_grade' letter (A..G -> 0..6, in
# alphabetical order).
loan_grade_map = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6
}

# Series.map replaces any value that is not a key of the dict with NaN and
# gives no warning. That would also happen if this cell were run a second
# time on the already-encoded column, so fail loudly instead.
grade_values = dataset['loan_grade']
unmapped_grades = grade_values[
    grade_values.notna() & ~grade_values.isin(list(loan_grade_map))
]
if not unmapped_grades.empty:
    raise ValueError(
        "Unmapped 'loan_grade' values: "
        f"{sorted(unmapped_grades.unique(), key=str)}"
    )

# Apply the mapping to the 'loan_grade' column
dataset['loan_grade'] = grade_values.map(loan_grade_map)

# Verify the transformation
print("Transformed 'loan_grade' column:")
print(dataset['loan_grade'].head())

In [None]:
# Binary code for the 'cb_person_default_on_file' flag: 'Y' -> 1, 'N' -> 0.
cb_person_default_map = {
    'Y': 1,
    'N': 0
}

# Series.map replaces any value that is not a key of the dict with NaN and
# gives no warning. That would also happen if this cell were run a second
# time on the already-encoded column, so fail loudly instead.
default_flags = dataset['cb_person_default_on_file']
unmapped_flags = default_flags[
    default_flags.notna() & ~default_flags.isin(list(cb_person_default_map))
]
if not unmapped_flags.empty:
    raise ValueError(
        "Unmapped 'cb_person_default_on_file' values: "
        f"{sorted(unmapped_flags.unique(), key=str)}"
    )

# Apply the mapping to the 'cb_person_default_on_file' column
dataset['cb_person_default_on_file'] = default_flags.map(cb_person_default_map)

# Verify the transformation
print("Transformed 'cb_person_default_on_file' column:")
print(dataset['cb_person_default_on_file'].head())

In [None]:
import seaborn as sb
import matplotlib.pyplot as plt

# Plot every column except one named 'id' (if such a column exists).
columns_to_plot = [name for name in dataset.columns if name != 'id']

# Pairwise plots of the selected columns, coloured by 'loan_status'.
# Rows containing missing values are dropped before plotting.
pairplot_frame = dataset[columns_to_plot].dropna()
sb.pairplot(pairplot_frame, hue='loan_status')
plt.show()

In [None]:
# Draw one violin plot per feature, grouped by 'loan_status', on a grid that
# is two plots wide.
plt.figure(figsize=(15, 15))

# Everything except the target itself is plotted on the y axis.
columns_to_plot = [name for name in dataset.columns if name != 'loan_status']

grid_width = 2
num_columns = len(columns_to_plot)
# Ceiling division: enough rows to hold every plot.
rows = (num_columns + grid_width - 1) // grid_width

for slot, column in enumerate(columns_to_plot, start=1):
    plt.subplot(rows, grid_width, slot)
    sb.violinplot(x='loan_status', y=column, data=dataset)

plt.tight_layout()
plt.show()

In [None]:
# Create a 75/25 train/test split that is stratified on 'loan_status', then
# print the share of positive (1) labels in the full, training and testing sets.
from sklearn.model_selection import train_test_split


def _percent_of_ones(frame):
    """Return the percentage of rows in *frame* whose 'loan_status' is 1."""
    shares = frame['loan_status'].value_counts(normalize=True) * 100
    return shares.loc[1]


train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=0.25,                  # hold out a quarter of the rows
    random_state=1,                  # fixed seed so the split is repeatable
    stratify=dataset['loan_status'],  # same label proportions in both parts
)

original_percentage = _percent_of_ones(dataset)
print("Original dataset distribution:")
print(f"Percentage of 1 in original dataset: {original_percentage:.2f}%")

train_percentage = _percent_of_ones(train_dataset)
print(f"Percentage of 1 in training dataset: {train_percentage:.2f}%")

test_percentage = _percent_of_ones(test_dataset)
print(f"Percentage of 1 in testing dataset: {test_percentage:.2f}%")

In [None]:
# Write both splits to CSV under data/, leaving out the DataFrame index.
for split_frame, csv_path in (
    (train_dataset, 'data/train.csv'),
    (test_dataset, 'data/test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

print("Training dataset saved as 'train.csv'.")
print("Testing dataset saved as 'test.csv'.")

In [None]:
# Load the training and testing datasets saved earlier and separate the
# features (X) from the target column (y).
import pandas as pd

target_column = 'loan_status'

train_dataset = pd.read_csv('data/train.csv')
test_dataset = pd.read_csv('data/test.csv')

X_train, y_train = train_dataset.drop(columns=[target_column]), train_dataset[target_column]
X_test, y_test = test_dataset.drop(columns=[target_column]), test_dataset[target_column]


In [None]:
# Fit a decision tree on the training split and report its test accuracy.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fixed seed so repeated runs build the same tree; fit() returns the
# estimator itself, so construction and training can be chained.
model = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# Predict labels for the held-out rows and compare them with the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the Decision Tree model: {accuracy:.2f}")

In [None]:
# Draw the fitted decision tree held in `model`.
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate this argument
    # as a list and reject a pandas Index.
    feature_names=list(X_train.columns),
    # Take the labels from the fitted model instead of assuming ['0', '1'].
    class_names=[str(label) for label in model.classes_],
    filled=True,
    rounded=True
)
# NOTE(review): the tree is fitted without a depth limit, so the full plot
# may be too dense to read - consider passing max_depth to plot_tree.
plt.title("Decision Tree Visualization")
plt.show()

In [None]:
# Run the Decision Tree model 1000 times with different splits and display a
# histogram of accuracies.
#
# Every per-run object uses a `run_` name on purpose: reusing the notebook's
# global train_dataset / X_train / y_train / X_test / y_test / model names
# here would leave them holding the split of the last iteration, and any
# later cell that reads them would then evaluate on that split.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

n_runs = 1000

# Separating features and target does not depend on the seed, so do it once.
all_features = dataset.drop(columns=['loan_status'])
all_targets = dataset['loan_status']

# Store accuracies for each run
accuracies = []

for run_seed in range(n_runs):
    # Stratified 75/25 split; the seed changes on every run.
    run_X_train, run_X_test, run_y_train, run_y_test = train_test_split(
        all_features,
        all_targets,
        test_size=0.25,
        random_state=run_seed,
        stratify=all_targets,
    )

    # Train a fresh tree on this run's training part.
    run_model = DecisionTreeClassifier(random_state=run_seed)
    run_model.fit(run_X_train, run_y_train)

    # Score it on this run's held-out part.
    run_predictions = run_model.predict(run_X_test)
    accuracies.append(accuracy_score(run_y_test, run_predictions))

# Calculate and display the average accuracy over 1000 runs
average_accuracy = np.mean(accuracies)
print(f"Average accuracy over 1000 runs: {average_accuracy:.2f}")

# Plot a histogram of the accuracies
plt.hist(accuracies, bins=20, edgecolor='black')
plt.title('Histogram of Model Accuracies')
plt.xlabel('Accuracy')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Apply the K-Nearest Neighbors (KNN) algorithm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

k = 5

# KNN classifies by distance between rows, so features measured on large
# numeric scales would dominate the result. Standardise the features first.
# The scaler is fitted on the training split only, so no statistics from the
# test rows are used.
knn_scaler = StandardScaler()
X_train_knn = knn_scaler.fit_transform(X_train)
X_test_knn = knn_scaler.transform(X_test)

# Initialize the KNN classifier
knn_model = KNeighborsClassifier(n_neighbors=k)

# Train the KNN model on the standardised training features
knn_model.fit(X_train_knn, y_train)

# Make predictions on the (identically standardised) test dataset
y_pred = knn_model.predict(X_test_knn)

# Calculate and display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the K-Nearest Neighbors model: {accuracy:.2f}")

In [None]:
# Apply the Support Vector Machine (SVM) algorithm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

# Scale the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM model with scaled features
svm_model = SVC(kernel='linear', random_state=1)
svm_model.fit(X_train_scaled, y_train)

# Make predictions on the test dataset. The model was trained on scaled
# features, so it must be given the scaled test features as well (not the
# raw X_test).
y_pred = svm_model.predict(X_test_scaled)

# Calculate and display the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Support Vector Machine model: {accuracy:.2f}")

In [None]:
# Train a neural network (scikit-learn MLPClassifier) on standardised
# features and report its accuracy on the test split.
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One hidden layer of 100 units, at most 500 training iterations, fixed seed.
nn_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=1)
nn_model.fit(X_train_scaled, y_train)

# Predict on the scaled test features and score against the true labels.
y_pred = nn_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy of the Neural Network model: {accuracy:.2f}")