In [None]:
import pandas as pd

df = pd.read_csv('robust_student_loan_dataset.csv')




In [None]:
df.columns = df.columns.str.strip()


In [None]:
df.rename(columns={
    'Loan Amount (USD)': 'Loan Amount',
    'Monthly Income (USD)': 'Monthly Income',
    'Monthly Expenses (USD)': 'Monthly Expenses',
    'Interest Rate (%)': 'Interest Rate',
    'Monthly Installment (USD)': 'Monthly Installment'
}, inplace=True)

In [None]:
print(df[['Monthly Income', 'Monthly Expenses', 'Loan Amount', 'Monthly Installment']].info())


In [None]:
df['Monthly Income'] = pd.to_numeric(df['Monthly Income'], errors='coerce')
df['Monthly Expenses'] = pd.to_numeric(df['Monthly Expenses'], errors='coerce')
df['Loan Amount'] = pd.to_numeric(df['Loan Amount'], errors='coerce')
df['Monthly Installment'] = pd.to_numeric(df['Monthly Installment'], errors='coerce')


In [None]:

df['Disposable Income'] = df['Monthly Income'] - df['Monthly Expenses']
df['Loan Tenure (Months)'] = (df['Loan Amount'] / df['Monthly Installment']).round()
df['Total Interest Paid'] = (df['Loan Tenure (Months)'] * df['Monthly Installment']) - df['Loan Amount']


In [None]:
df.head()

In [None]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_features = ['Loan Amount', 'Monthly Income', 'Monthly Expenses', 'Interest Rate', 'Disposable Income']
df[numeric_features] = scaler.fit_transform(df[numeric_features])


In [None]:
X = df.drop(['Loan Tenure (Months)'], axis=1)
y = df['Loan Tenure (Months)']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
df = df.dropna()

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # or strategy='median'
df_imputed = imputer.fit_transform(df)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with the mean
    ('model', LinearRegression())  # Linear regression model
])

pipeline.fit(X_train, y_train)  # Fit the model


In [None]:
print(df.isnull().sum())  # Check for missing values in each column


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Check for missing values in the DataFrame
missing_values = df.isnull().sum()
print(f"Missing values in each column:\n{missing_values}")

# Impute missing values if any
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Define features (X) and target (y)
X = df_imputed[['Loan Amount', 'Monthly Income', 'Monthly Expenses', 'Interest Rate', 'Monthly Installment']]
y = df_imputed['Loan Tenure (Months)']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression Model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions and evaluation
y_pred_lr = lr.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression Mean Absolute Error: {mae_lr}")

# Train Random Forest Model with GridSearchCV
rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

# Predictions and evaluation
y_pred_rf = best_rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest Mean Absolute Error: {mae_rf}")




In [None]:
# Visualization of results
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting feature importances from the best Random Forest model
feature_importances = best_rf.feature_importances_
features = X.columns

plt.figure(figsize=(8, 6))
sns.barplot(x=features, y=feature_importances)
plt.title("Feature Importances - Random Forest")
plt.xlabel("Features")
plt.ylabel("Importance")
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

correlation_matrix = df_imputed.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()


In [None]:
rf = RandomForestRegressor(random_state=42, max_depth=5, n_estimators=50)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Updated Random Forest Mean Absolute Error: {mae_rf}")


In [None]:
df_imputed['Debt_to_Income_Ratio'] = df_imputed['Monthly Expenses'] / df_imputed['Monthly Income']
df_imputed['Disposable_Income_Ratio'] = df_imputed['Disposable Income (USD)'] / df_imputed['Monthly Income']


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest RMSE: {mse_rf**0.5}")
print(f"Random Forest R² Score: {r2_rf}")


In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
print(f"XGBoost Mean Absolute Error: {mae_xgb}")


In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance

plot_importance(xgb)
plt.title('Feature Importance - XGBoost')
plt.show()


In [24]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

xgb = XGBRegressor(random_state=42)
grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_xgb = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


Best parameters: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200}


In [25]:
import joblib

joblib.dump(xgb, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [26]:
xgb_loaded = joblib.load('xgboost_model.pkl')
xgb.fit(X_train, y_train)
joblib.dump(xgb, 'xgboost_model.pkl') 
xgb.save_model('xgboost_model.json')


In [27]:
from xgboost import XGBRegressor

xgb_loaded = XGBRegressor()
xgb_loaded.load_model('xgboost_model.json')


In [28]:

custom_input = {
    'Loan Amount': 50000,
    'Monthly Income': 10,
    'Monthly Expenses': 2000,
    'Interest Rate': 5,
    'Monthly Installment': 1000
}

new_data = pd.DataFrame([custom_input])  # Ensure structure matches training
predictions = xgb_loaded.predict(new_data)
print(predictions[0])
loan_tenure_months = predictions[0]  # The predicted loan tenure in months
loan_tenure_years = loan_tenure_months / 12
print(f"Predicted Loan Tenure: {loan_tenure_years} years")




70.00005
Predicted Loan Tenure: 5.833337783813477 years


In [32]:
import numpy as np

def fluctuate_income(custom_input, fluctuation_range=(1, 10000), seed=None):
    if seed is not None:
        np.random.seed(seed)  # Set the random seed for reproducibility
    income = custom_input['Monthly Income']
    fluctuation = np.random.uniform(*fluctuation_range)
    custom_input_copy = custom_input.copy()
    custom_input_copy['Monthly Income'] = income * (1 + fluctuation)
    return custom_input_copy

# Example usage with fixed seed for reproducibility
seed_value = 10000000  # Choose any integer seed value
fluctuated_data = fluctuate_income(custom_input, fluctuation_range=(0, 4), seed=seed_value)
new_data = pd.DataFrame([fluctuated_data])
prediction = xgb_loaded.predict(new_data)
print(f"Fluctuated Income = {fluctuated_data['Monthly Income']}, Predicted Tenure = {prediction[0] / 12:.3f} years")



Fluctuated Income = 15.52430708314072, Predicted Tenure = 5.833 years
