In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Step 1: Exploratory Data Analysis
# Read the training CSV and the test CSV into two separate DataFrames.
# NOTE(review): both paths are absolute and machine-specific.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\musta\Downloads\train_ctrUa4K.csv",
        r"C:\Users\musta\Downloads\test_lAUu6dG.csv",
    )
)

In [3]:
# Preview the first rows of each frame. Only the value of a cell's last
# expression is rendered, so the table shown for this cell is test_df.head();
# the result of train_df.head() is computed and then discarded.
train_df.head()
test_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [6]:
# Count missing values per column in each frame. Only the last expression is
# displayed, so the counts rendered for this cell belong to test_df; the
# train_df counts are computed but never shown.
train_df.isna().sum()
test_df.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [7]:
# Step 2: Pre-processing
# Handle missing values
# Numeric columns: fill gaps with that frame's own column mean.
# numeric_only=True restricts mean() to numeric columns. Without it, the string
# columns (Loan_ID, Gender, ...) trigger a warning on older pandas and a
# TypeError on newer releases.
train_df.fillna(train_df.mean(numeric_only=True), inplace=True)
test_df.fillna(test_df.mean(numeric_only=True), inplace=True)

# 'Dependents' is not numeric at this stage, so it is filled with its most
# frequent value instead. The result is assigned back to the column rather than
# calling fillna(inplace=True) on the selected column: that chained in-place
# form is deprecated and does not update the frame under pandas Copy-on-Write.
train_df['Dependents'] = train_df['Dependents'].fillna(train_df['Dependents'].mode()[0])
test_df['Dependents'] = test_df['Dependents'].fillna(test_df['Dependents'].mode()[0])


  train_df.fillna(train_df.mean(), inplace=True)
  test_df.fillna(test_df.mean(), inplace=True)


In [8]:
# Re-check missing values after the imputation cell. The rendered counts are
# for test_df only (last expression of the cell); whatever is still missing in
# train_df is computed here but not displayed.
train_df.isna().sum()
test_df.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents            0
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
dtype: int64

In [9]:
# Fill the remaining gaps in the test set's categorical columns with each
# column's most frequent value. The filled Series is assigned back to the frame:
# calling fillna(inplace=True) on a selected column is chained assignment, which
# is deprecated and leaves the frame unchanged under pandas Copy-on-Write.
for col in ['Gender', 'Dependents', 'Self_Employed']:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# Verify if there are any remaining missing values
test_df.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [10]:
# Convert categorical variables using LabelEncoder
le = LabelEncoder()
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Remove training rows that lack a category *before* encoding. If a NaN reaches
# LabelEncoder it either raises or is encoded as an extra class (depending on
# the scikit-learn version), and a dropna() issued after encoding can no longer
# detect it. Only train_df is filtered; test_df keeps every row.
train_df.dropna(subset=categorical_cols, inplace=True)

for col in categorical_cols:
    # The encoder is refit per column on the training values and the same
    # mapping is then applied to the test column. transform() raises ValueError
    # if the test column holds a label that was not seen during fit.
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [11]:
# Preprocess 'Dependents' column to handle '3+' value
# The '3+' label is replaced by the number 100 so the whole column can be cast
# to float; both frames receive the identical treatment.
for frame in (train_df, test_df):
    frame['Dependents'] = frame['Dependents'].replace('3+', 100).astype(float)

In [12]:
# Drop rows with missing values
# Removes, in place, every training row that still holds a NaN in any column at
# this point. Only train_df is filtered; test_df is left untouched.
# NOTE(review): this cell runs after the LabelEncoder cell, so confirm that
# missing categorical values are still detectable as NaN when it executes.
train_df.dropna(inplace=True)

In [13]:
# Convert target variable using LabelEncoder
# The shared encoder `le` is refit on the target here, so after this cell it
# describes the Loan_Status labels rather than any feature column.
encoded_target = le.fit_transform(train_df['Loan_Status'])
train_df['Loan_Status'] = encoded_target

In [14]:
# Split train_df into features (X) and target variable (y)
# y is the encoded Loan_Status column; X is every remaining column except the
# Loan_ID and the target itself.
y = train_df['Loan_Status']
X = train_df.drop(columns=['Loan_ID', 'Loan_Status'])

In [15]:
# Step 3: Modelling
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Baseline random forest with default hyper-parameters. fit() returns the
# estimator itself, so the trailing expression displays the fitted model.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model


RandomForestClassifier(random_state=42)

In [16]:
# Step 4: Fine-tuning
# Exhaustive search over forest size and maximum tree depth (3 x 3 candidates),
# each scored with 3-fold cross-validation on the training split.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10],
)
grid_search = GridSearchCV(model, param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Keep the estimator built from the best-scoring parameter combination.
best_model = grid_search.best_estimator_

In [17]:
# Step 5: Make predictions
# Loan_ID is removed so the test features line up with the columns used for
# training (which also excluded Loan_ID).
X_test = test_df.drop(columns='Loan_ID')
predictions = best_model.predict(X_test)

In [27]:
# Replace 'Loan_Status' column in the sample submission file
# The sample file supplies the row layout; its Loan_Status column is set to the
# model's predictions.
sample_submission_path = r"C:\Users\musta\Downloads\sample_submission_49d68Cx.csv"
submission_df = pd.read_csv(sample_submission_path).assign(Loan_Status=predictions)

In [28]:
# Save the updated submission file as CSV
# The relative filename is resolved against the kernel's current working
# directory; index=False keeps the DataFrame index out of the file.
submission_df.to_csv("path_to_updated_submission_file.csv", index=False)

In [29]:
# Read the saved submission back from disk for a visual check.
# NOTE(review): absolute, machine-specific path — presumably the kernel's
# working directory where the previous cell wrote the file; confirm.
saved_submission_path = r"C:\Users\musta\path_to_updated_submission_file.csv"
update_df = pd.read_csv(saved_submission_path)

In [30]:
# Show the first rows of the submission that was read back from disk.
update_df.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1


In [32]:
import pandas as pd

# Specify the path to the original CSV file
# NOTE(review): this absolute path differs from the location the submission
# was written to earlier in the notebook — confirm the file was placed here.
original_file_path = r"C:\Users\musta\Downloads\datasets\path_to_updated_submission_file.csv"

# Specify the path to the updated CSV file
updated_file_path = "path_to_updated_submission_file.csv"


# Read the original CSV file
df = pd.read_csv(original_file_path)

# Update the Loan_Status values
# NOTE(review): confirm 'Yes'/'No' is the label spelling the consumer of this
# file expects.
status_labels = {1: 'Yes', 0: 'No'}

# Series.map() turns every value missing from the dict into NaN without any
# warning (e.g. if the file was already relabelled or contains blanks), so the
# codes are validated first instead of silently writing a damaged column.
unmapped = df.loc[~df['Loan_Status'].isin(list(status_labels)), 'Loan_Status']
if not unmapped.empty:
    raise ValueError(
        "Unexpected Loan_Status values (expected 0 or 1): {}".format(
            unmapped.unique().tolist()
        )
    )
df['Loan_Status'] = df['Loan_Status'].map(status_labels)

# Save the updated data to the new CSV file
df.to_csv(updated_file_path, index=False)

print("CSV file updated successfully!")

CSV file updated successfully!
