In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the train / test CSVs and the feature dictionary workbook from the Colab workspace
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Assignment_Train.csv', '/content/Assignment_Test.csv')
)
feature_dict = pd.read_excel('/content/Assignment_FeatureDictionary.xlsx')


In [4]:
# Keep the first five rows of each frame for a quick visual check
train_data_preview, test_data_preview, feature_dict_preview = (
    frame.head() for frame in (train_data, test_data, feature_dict)
)

In [5]:
# Display the three previews together (rendered as a single tuple output)
train_data_preview, test_data_preview, feature_dict_preview

(   DEALER ID APPLICATION LOGIN DATE HDB BRANCH NAME HDB BRANCH STATE  \
 0     106989             07/20/2022        DELHI-SF            DELHI   
 1     108975             07/28/2022        PATNA-SF            BIHAR   
 2     111004             07/15/2022   DARJEELING-SF      WEST BENGAL   
 3     192020               07/04/22   SAHARANPUR-SF    UTTAR PRADESH   
 4      55095             07/15/2022       MODASA-SF          GUJARAT   
 
   FIRST NAME MIDDLE NAME  LAST NAME      mobile AADHAR VERIFIED Cibil Score  \
 0      SUNIL         NaN    CHANDER  9210574080              NO         726   
 1      AMRIT         NaN      KUMAR  8877987018              NO         NaN   
 2    ANIMESH         NaN      THAPA  8910862135              NO         737   
 3     ADITYA         NaN      SINGH  9758428017              NO         713   
 4     PARMAR  HARESHBHAI  AMRUTBHAI  9687028486              NO         669   
 
    ...  Phone Social Premium.shaadi Phone Social Premium.skype  \
 0  ...    

The datasets consist of the following:

Training Data: Contains 55 columns, including features such as dealer information, customer details (e.g., mobile, AADHAR verification, credit scores), and several social and digital footprint features. The target variable is "Application Status" which indicates whether the loan application was accepted or rejected.

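Test Data: Contains the same feature columns as the training data, without the "Application Status" target column. It also carries a "UID" identifier column, which is used to key the submission file.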

Feature Dictionary: Provides descriptions of the various columns in the datasets, such as "DEALER ID" and "APPLICATION LOGIN DATE."


In [6]:
# Per-column count of missing entries in the training data
missing_values = train_data.isna().sum()
# Summary statistics over every column, numeric and non-numeric alike
train_data_stats = train_data.describe(include='all')

In [8]:
# Display the missing-value counts and the summary statistics as one tuple output
missing_values, train_data_stats

(DEALER ID                                 0
 APPLICATION LOGIN DATE                    0
 HDB BRANCH NAME                           1
 HDB BRANCH STATE                        854
 FIRST NAME                                0
 MIDDLE NAME                            7145
 LAST NAME                               681
 mobile                                    0
 AADHAR VERIFIED                           0
 Cibil Score                            4297
 MOBILE VERIFICATION                       0
 DEALER NAME                               4
 TOTAL ASSET COST                       5108
 ASSET CTG                              5108
 ASSET MODEL NO                            0
 APPLIED AMOUNT                            0
 PRIMARY ASSET MAKE                        0
 Primary Asset Model No                    0
 Personal Email Address                    0
 MARITAL STATUS                         4894
 GENDER                                    0
 DOB                                       0
 AGE      

The training data contains a significant number of missing values in several columns, particularly:

HDB BRANCH STATE (854 missing values)
MIDDLE NAME (7,145 missing values)
Cibil Score (4,297 missing values)
TOTAL ASSET COST and ASSET CTG (both have 5,108 missing values)
Several "Phone Social Premium" columns also have a large number of missing values.


In [10]:
# Handle missing values, step 1: group the columns by dtype.
# Numeric columns are later imputed with the mean, object (string) columns with
# the most frequent value.
# NOTE: no column is excluded here, so an object-typed target column
# ('Application Status') ends up in `cat_cols` and is imputed/encoded with the features.
numeric_dtypes = ['float64', 'int64']
num_cols = train_data.select_dtypes(include=numeric_dtypes).columns
cat_cols = train_data.select_dtypes(include='object').columns

In [11]:
# Fill the gaps in place; both imputers stay fitted on the training data.

# Numeric columns -> column mean
num_imputer = SimpleImputer(strategy='mean')
train_data[num_cols] = num_imputer.fit_transform(train_data[num_cols])

# Categorical columns -> most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
train_data[cat_cols] = cat_imputer.fit_transform(train_data[cat_cols])

In [12]:
# Replace every string column with integer codes, keeping one fitted encoder per column
label_encoders = {}
for col in cat_cols:
    # Skip anything that is no longer an object column (e.g. when the cell is re-run)
    if train_data[col].dtype != 'object':
        continue
    le = LabelEncoder().fit(train_data[col])
    train_data[col] = le.transform(train_data[col])
    label_encoders[col] = le

In [13]:
# Standardise the numeric columns (zero mean, unit variance); the fitted scaler is kept
scaler = StandardScaler().fit(train_data[num_cols])
train_data[num_cols] = scaler.transform(train_data[num_cols])


In [14]:
# List every column of the (already imputed, encoded and scaled) training frame
# in order to locate the identifier column
column_names = train_data.columns
column_names


Index(['DEALER ID', 'APPLICATION LOGIN DATE', 'HDB BRANCH NAME',
       'HDB BRANCH STATE', 'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile',
       'AADHAR VERIFIED', 'Cibil Score', 'MOBILE VERIFICATION', 'DEALER NAME',
       'TOTAL ASSET COST', 'ASSET CTG', 'ASSET MODEL NO', 'APPLIED AMOUNT',
       'PRIMARY ASSET MAKE', 'Primary Asset Model No',
       'Personal Email Address', 'MARITAL STATUS', 'GENDER', 'DOB', 'AGE',
       'ADDRESS TYPE', 'EMPLOY CONSTITUTION', 'EMPLOYER NAME', 'EMPLOYER TYPE',
       'Pan Name', 'name', 'vpa', 'upi_name', 'Phone Social Premium.a23games',
       'Phone Social Premium.amazon', 'Phone Social Premium.byjus',
       'Phone Social Premium.flipkart', 'Phone Social Premium.housing',
       'Phone Social Premium.indiamart', 'Phone Social Premium.instagram',
       'Phone Social Premium.isWABusiness',
       'Phone Social Premium.jeevansaathi', 'Phone Social Premium.jiomart',
       'Phone Social Premium.microsoft', 'Phone Social Premium.my11',
       

In [15]:
# Separate the label from the predictors
y = train_data['Application Status']
X = train_data.drop('Application Status', axis=1)

In [16]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Baseline model: a Random Forest with default hyper-parameters and a fixed seed
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [18]:
# Score the baseline on the held-out rows: overall accuracy plus the per-class report
y_pred = model.predict(X_val)
accuracy, classification_rep = (
    metric(y_val, y_pred) for metric in (accuracy_score, classification_report)
)

In [19]:
# Display the validation accuracy and the classification report as one tuple output
accuracy, classification_rep

(0.886,
 '              precision    recall  f1-score   support\n\n           0       0.90      0.93      0.92      1327\n           1       0.85      0.80      0.82       673\n\n    accuracy                           0.89      2000\n   macro avg       0.88      0.86      0.87      2000\nweighted avg       0.88      0.89      0.89      2000\n')

The Random Forest model achieved an accuracy of 88.6% on the validation set, with the following performance metrics:

Precision, recall, and F1-score are relatively high for both classes. The label encoder assigns codes alphabetically, so class 0 is APPROVED and class 1 is DECLINED:

Precision for class 0: 0.90

Recall for class 0: 0.93

Precision for class 1: 0.85

Recall for class 1: 0.80

# Comparing results of multiple models

In this analysis, we aim to determine the best model for predicting whether a two-wheeler loan application will be accepted or rejected. We experiment with multiple machine learning models: Random Forest, Support Vector Machine (SVM), Decision Tree, and Gradient Boosting Machine (GBM, implemented here with XGBoost's `XGBClassifier`). The objective is to compare the performance of these models and select the one that offers the highest accuracy and balanced performance across metrics such as precision, recall, and F1-score.


In [20]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Initialize the models.
# `use_label_encoder` is deliberately not passed to XGBClassifier: the installed
# XGBoost ignores it and only prints
# 'Parameters: { "use_label_encoder" } are not used.'
svm_model = SVC(random_state=42)
decision_tree_model = DecisionTreeClassifier(random_state=42)
gbm_model = XGBClassifier(random_state=42, eval_metric='logloss')

candidate_models = {
    'SVM': svm_model,
    'Decision Tree': decision_tree_model,
    'GBM': gbm_model,
}

# Dictionary to store the results of each model
results = {}
# Validation predictions per model, kept for later inspection
val_predictions = {}

# Fit every candidate on the training split and score it on the validation split.
# The loop variable is named `candidate` so the Random Forest stored in `model`
# by the earlier cell is not overwritten.
for model_name, candidate in candidate_models.items():
    candidate.fit(X_train, y_train)
    val_predictions[model_name] = candidate.predict(X_val)
    results[model_name] = {
        'accuracy': accuracy_score(y_val, val_predictions[model_name]),
        'classification_report': classification_report(y_val, val_predictions[model_name])
    }

# Keep the individual prediction arrays available under their original names
y_pred_svm = val_predictions['SVM']
y_pred_tree = val_predictions['Decision Tree']
y_pred_gbm = val_predictions['GBM']

results

Parameters: { "use_label_encoder" } are not used.



{'SVM': {'accuracy': 0.6885,
  'classification_report': '              precision    recall  f1-score   support\n\n           0       0.69      0.95      0.80      1327\n           1       0.64      0.17      0.27       673\n\n    accuracy                           0.69      2000\n   macro avg       0.67      0.56      0.54      2000\nweighted avg       0.67      0.69      0.62      2000\n'},
 'Decision Tree': {'accuracy': 0.856,
  'classification_report': '              precision    recall  f1-score   support\n\n           0       0.89      0.89      0.89      1327\n           1       0.79      0.78      0.79       673\n\n    accuracy                           0.86      2000\n   macro avg       0.84      0.84      0.84      2000\nweighted avg       0.86      0.86      0.86      2000\n'},
 'GBM': {'accuracy': 0.892,
  'classification_report': '              precision    recall  f1-score   support\n\n           0       0.92      0.92      0.92      1327\n           1       0.84      0.83

### Model Comparison

After training and evaluating each model, here are the results:

#### SVM:
- **Accuracy**: 68.85%
- **Precision**:
  - Class 0: 0.69
  - Class 1: 0.64
- **Recall**:
  - Class 0: 0.95
  - Class 1: 0.17
- **F1-Score**:
  - Class 0: 0.80
  - Class 1: 0.27

#### Decision Tree:
- **Accuracy**: 85.6%
- **Precision**:
  - Class 0: 0.89
  - Class 1: 0.79
- **Recall**:
  - Class 0: 0.89
  - Class 1: 0.78
- **F1-Score**:
  - Class 0: 0.89
  - Class 1: 0.79

#### Random Forest (RFC):
- **Accuracy**: 88.6%
- **Precision**:
  - Class 0: 0.90
  - Class 1: 0.85
- **Recall**:
  - Class 0: 0.93
  - Class 1: 0.80
- **F1-Score**:
  - Class 0: 0.92
  - Class 1: 0.82

#### GBM:
- **Accuracy**: 89.2%
- **Precision**:
  - Class 0: 0.92
  - Class 1: 0.84
- **Recall**:
  - Class 0: 0.92
  - Class 1: 0.83
- **F1-Score**:
  - Class 0: 0.92
  - Class 1: 0.84

### Conclusion

Based on the comparison, the **GBM** model exhibited the best performance, achieving the highest accuracy (89.2%) along with strong precision, recall, and F1-scores for both classes. The **Random Forest Classifier (RFC)** also performed well but slightly underperformed compared to GBM. Therefore, we proceed with the GBM model for generating predictions on the test data.


## Generate predictions

In [30]:
# Score the raw test set with the model selected in the comparison (GBM).
# `test_data` is still exactly as loaded from disk, so it has to go through the
# same imputation, encoding and scaling that were fitted on the training data
# before it can be passed to the model.

# Keep the submission identifier aside first: it is not a feature, so the column
# alignment below drops it. Falls back to 'DEALER ID' only if there is no 'UID'.
id_column = 'UID' if 'UID' in test_data.columns else 'DEALER ID'
test_ids = test_data[id_column].to_numpy()

missing_cols_in_test = set(X.columns) - set(test_data.columns)
extra_cols_in_test = set(test_data.columns) - set(X.columns)

# Same columns in the same order as the training features: extra columns are
# dropped, and columns absent from the test file appear as NaN and are imputed below.
# Working on a new frame leaves `test_data` untouched, so the cell can be re-run.
test_features = test_data.reindex(columns=X.columns)

# Numeric features: fill gaps with the training means, then apply the training scaling.
test_features[num_cols] = num_imputer.transform(test_features[num_cols])
test_features[num_cols] = scaler.transform(test_features[num_cols])

# Categorical features: fill gaps with the most frequent training value, then map
# each value to the integer code learned by that column's LabelEncoder.
# Values never seen during training get the code -1 instead of raising an error.
cat_fill_values = dict(zip(cat_cols, cat_imputer.statistics_))
for col, encoder in label_encoders.items():
    if col not in test_features.columns:
        continue  # e.g. the target column, which was encoded but is not a feature
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    filled = test_features[col].astype(object).fillna(cat_fill_values[col])
    test_features[col] = filled.map(codes).fillna(-1).astype(int)

test_predictions = gbm_model.predict(test_features)

# The target was label-encoded for training; translate the predicted codes back
# to the original status labels so the submission is human-readable.
target_encoder = label_encoders.get('Application Status')
if target_encoder is not None:
    test_predictions = target_encoder.inverse_transform(test_predictions)

submission = pd.DataFrame({
    'UID': test_ids,
    'Prediction': test_predictions
})

submission_file_path = '/content/predictions.csv'
submission.to_csv(submission_file_path, index=False)

submission_file_path


'/content/predictions.csv'

# Fixing issues with submission file

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OrdinalEncoder


# Reload the raw files so this cell does not depend on frames modified by earlier cells
train_data = pd.read_csv('/content/Assignment_Train.csv')
test_data = pd.read_csv('/content/Assignment_Test.csv')

num_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
cat_cols = train_data.select_dtypes(include=['object']).columns

# The target stays out of the feature preprocessing and keeps its original labels
cat_cols = [col for col in cat_cols if col != 'Application Status']

# NOTE: the imputers, encoder and scaler are fitted on all training rows before the
# train/validation split below, so the validation metrics are mildly optimistic.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

train_data[num_cols] = num_imputer.fit_transform(train_data[num_cols])
train_data[cat_cols] = cat_imputer.fit_transform(train_data[cat_cols])

# Categories that only occur in the test data are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train_data[cat_cols] = ordinal_encoder.fit_transform(train_data[cat_cols])

scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])

X = train_data.drop(columns=['Application Status'])
y = train_data['Application Status']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

y_pred_val = rfc.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred_val))
print("Classification Report:\n", classification_report(y_val, y_pred_val))

# The fitted transformers need every training feature. Fail early with a readable
# message rather than silently transforming a subset of the columns.
missing_in_test = [col for col in X.columns if col not in test_data.columns]
if missing_in_test:
    raise ValueError(f"Test data is missing feature columns: {missing_in_test}")

# Apply the training-time preprocessing to the test data (transform only, no refit)
test_data[num_cols] = num_imputer.transform(test_data[num_cols])
test_data[cat_cols] = cat_imputer.transform(test_data[cat_cols])

test_data[cat_cols] = ordinal_encoder.transform(test_data[cat_cols])

test_data[num_cols] = scaler.transform(test_data[num_cols])

# Select the features by name, in training order. This leaves out 'UID' and any
# other non-feature column, and does not depend on the test file's column order.
test_data_features = test_data[X.columns]

test_predictions = rfc.predict(test_data_features)

submission = pd.DataFrame({
    'UID': test_data['UID'],
    'Prediction': test_predictions
})

submission_file_path = '/content/predictions2.csv'
submission.to_csv(submission_file_path, index=False)

print("Predictions saved to:", submission_file_path)


Validation Accuracy: 0.886
Classification Report:
               precision    recall  f1-score   support

    APPROVED       0.90      0.93      0.92      1327
    DECLINED       0.85      0.80      0.82       673

    accuracy                           0.89      2000
   macro avg       0.88      0.86      0.87      2000
weighted avg       0.88      0.89      0.89      2000

Predictions saved to: /content/predictions2.csv
