In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import pickle

In [None]:
# Load the pipe-delimited export that the rest of the notebook works from.
df_cleaned = pd.read_csv('temp.csv', delimiter='|')

In [None]:
# Columns parsed as dates and later encoded as "days since the earliest date".
# 'office_joining_date' is deliberately absent: it is listed in
# other_columns_not_encoded below and therefore dropped.
date_columns = [
    "activation_date",
    "date_of_birth",
    "approvedon_date",
    "expected_disbursedon_date",
    "disbursedon_date",
    "expected_maturedon_date",
    "maturedon_date",
    "transaction_date",
    "submitted_on_date",
]

# Columns treated as numeric features.
# 'has_email_address' and 'status_enum' are deliberately absent: both are
# listed in other_columns_not_encoded below and therefore dropped.
# NOTE(review): this list is not referenced by any other visible cell.
numerical_columns = [
    "gender_cv_id",
    "legal_form_enum",
    "interest_period_frequency_enum",
    "interest_method_enum",
    "interest_calculated_in_period_enum",
    "term_frequency",
    "number_of_repayments",
    "transaction_type_enum",
    "principal_amount",
    "nominal_interest_rate_per_period",
    "annual_nominal_interest_rate",
    "principal_repaid_derived",
    "principal_outstanding_derived",
    "interest_charged_derived",
    "interest_repaid_derived",
    "interest_outstanding_derived",
    "total_repayment_derived",
    "total_costofloan_derived",
    "total_outstanding_derived",
    "amount",
    "principal_portion_derived",
    "outstanding_loan_balance_derived",
]

# Columns removed from the frame before any encoding or modelling.
other_columns_not_encoded = [
    "has_mobile_no",
    "validatedon_userid",
    "loan_transaction_strategy_id",
    "is_reversed",
    "submittedon_date_client",
    "submittedon_date_loan",
    "validatedon_date",
    "created_date",
    "principal_amount_proposed",
    "principal_disbursed_derived",
    "total_expected_repayment_derived",
    "total_expected_costofloan_derived",
    "manually_adjusted_or_reversed",
    "has_email_address",
    "status_enum",
    "office_joining_date",
]

In [None]:
# Build the modelling frame from the raw data WITHOUT rebinding `df_cleaned`.
# `drop` returns a new frame, so the raw frame stays intact and this cell can
# be re-run safely (rebinding `df_cleaned` made a second run fail with a
# KeyError on the already-dropped columns).
df_encoded = df_cleaned.drop(columns=other_columns_not_encoded)

# Peek at the raw (unparsed) representation of a date value.
first_activation = df_encoded['activation_date'].iloc[0]
print(type(first_activation))
print(first_activation)

# Parse every date column; values that cannot be parsed become NaT.
for col in date_columns:
    df_encoded[col] = pd.to_datetime(df_encoded[col], errors='coerce')

# The earliest date found in any date column is the reference point (day 0).
reference_date = df_encoded[date_columns].min().min()

# Encode each date as whole days elapsed since the reference date.
# Missing dates are filled with the reference date first, so they encode as 0.
for col in date_columns:
    df_encoded[col] = (df_encoded[col].fillna(reference_date) - reference_date).dt.days

In [None]:
# Show the reference date that the day-offset encoding of the date columns is measured from.
print(reference_date)

In [None]:
# Percentage of missing entries per column, restricted to columns that
# actually contain at least one missing value.
missing_values = (
    df_encoded.isna()
    .mean()
    .mul(100)
    .loc[lambda share: share > 0]
)
print(f"Number of columns with missing values: {len(missing_values)}")
# Names of the affected columns.
print(missing_values.index)
# Discard every column that has any missing value, then report the new shape.
df_encoded = df_encoded.drop(columns=missing_values.index)
print(df_encoded.shape)

In [None]:
# Column the models will predict.
target_variable = 'nominal_interest_rate_per_period'
# Pairwise correlation between all columns of the encoded frame.
correlation_matrix = df_encoded.corr()
# Correlation of every column with the target, largest value first.
target_correlation = correlation_matrix.loc[:, target_variable].sort_values(ascending=False)

In [None]:
# Rank the features by their correlation with the target.
# The target's own entry is removed by label rather than by position: another
# column tied at 1.0 could sort ahead of it, so slicing from position 1 is not
# reliable. NaN correlations (which sort_values places last) are discarded so
# they cannot fill the "bottom 10".
feature_correlation = target_correlation.drop(labels=target_variable).dropna()

# Ten largest correlations with the target.
top_10_correlated_features = feature_correlation.head(10)
print(top_10_correlated_features)
# Ten smallest (most negative) correlations with the target.
bottom_10_correlated_features = feature_correlation.tail(10)
print(bottom_10_correlated_features)


In [None]:
# Columns removed from the feature set before modelling.
# NOTE(review): judging by the name, these are features that are highly
# correlated with the target and/or only known after the loan has progressed
# -- confirm the selection criteria with the author.
highly_corr_future_related = [
    'annual_nominal_interest_rate',
    'number_of_repayments',
    'term_frequency',
    'total_costofloan_derived',
    'total_repayment_derived',
    'principal_repaid_derived',
    'principal_outstanding_derived',
    'interest_charged_derived',
    'interest_outstanding_derived',
    'total_outstanding_derived',
    'outstanding_loan_balance_derived',
    'interest_repaid_derived',
]


In [None]:
# Remove the columns listed in highly_corr_future_related from the feature frame.
# errors='ignore' skips labels that are no longer present: the missing-value
# cell drops columns based on the data, so a listed column may already be gone,
# and re-running this cell would otherwise raise KeyError.
df_encoded = df_encoded.drop(columns=highly_corr_future_related, errors='ignore')
print(df_encoded.shape)

In [None]:
# List the columns that remain in the encoded frame (features plus the target).
print(df_encoded.columns)

In [None]:
# Target vector: the per-period nominal interest rate.
y = df_encoded['nominal_interest_rate_per_period']
# Feature matrix: every remaining column except the target.
X = df_encoded.drop(columns=['nominal_interest_rate_per_period'])

In [None]:
# Print the mean of every feature column in X (one value per column).
print(X.mean())


In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# Stratifying on y treats every distinct target value as its own class, and
# scikit-learn raises ValueError when a value is too rare to be placed in both
# splits. When that happens, fall back to a plain shuffled split instead of
# aborting; when stratification succeeds the result is unchanged. Any
# unrelated ValueError is raised again by the second call.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
except ValueError as split_error:
    print(f"Stratified split failed ({split_error}); using a non-stratified split instead.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

In [None]:
# Fit an ordinary least-squares baseline on the training split
# (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Mean squared error on the held-out split.
lr_errors = lr.predict(X_test) - y_test
mse = np.mean(lr_errors ** 2)
print(f"Mean Squared Error: {mse}")

In [None]:
# Persist the fitted linear regression model to disk with pickle.
with open('linear_regression_model_new.pkl', mode='wb') as model_file:
    pickle.dump(lr, model_file)

In [None]:
# Fit a decision tree regressor on the training split (seeded for reproducibility;
# fit() returns the estimator itself).
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Mean squared error on the held-out split.
dt_errors = dt.predict(X_test) - y_test
mse = np.mean(dt_errors ** 2)
print(f"Mean Squared Error: {mse}")

In [None]:
# Persist the fitted decision tree model to disk with pickle.
with open('decision_tree_regressor_model_new.pkl', mode='wb') as model_file:
    pickle.dump(dt, model_file)

In [None]:
# Train the forest on a seeded 40% subsample of the training rows.
# NOTE(review): presumably subsampled to keep training time manageable -- confirm.
X_random = X_train.sample(frac=0.4, random_state=42)
# Pick the matching targets by index label.
y_random = y_train.loc[X_random.index]

# Random forest regressor, seeded for reproducibility.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_random, y_random)

# Mean squared error on the held-out split.
rf_errors = rf.predict(X_test) - y_test
mse = np.mean(rf_errors ** 2)
print(f"Mean Squared Error: {mse}")

In [None]:
# Persist the fitted random forest model to disk with pickle.
with open('random_forest_regressor_model_new.pkl', mode='wb') as model_file:
    pickle.dump(rf, model_file)