In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [26]:
# Load the datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# the resolved paths are the same as the previously hard-coded ones.
DATA_DIR = 'C:/Users/meet9/MMA865/Week 1'
train_df = pd.read_csv(f'{DATA_DIR}/kiva_train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/kiva_test.csv')

# Display basic information about the data.
# DataFrame.info() writes its summary to stdout on its own and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("Train Data Info:")
train_df.info()
print("Test Data Info:")
test_df.info()

Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5454 entries, 0 to 5453
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5454 non-null   int64 
 1   country      5454 non-null   object
 2   en           5454 non-null   object
 3   gender       5454 non-null   object
 4   loan_amount  5454 non-null   int64 
 5   nonpayment   5454 non-null   object
 6   sector       5454 non-null   object
 7   status       5454 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 341.0+ KB
None
Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364 entries, 0 to 1363
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           1364 non-null   int64 
 1   country      1364 non-null   object
 2   en           1364 non-null   object
 3   gender       1364 non-null   object
 4   loan_amount  1364 non-null   int64 
 

In [27]:
# Columns dropped from train_df when the feature frame is built:
# 'status' is read separately as the target, and 'id' is removed along with it.
columns_to_exclude = ['id', 'status']

# Custom transformer to select a specific column
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that indexes its input with a fixed key.

    Parameters
    ----------
    column_name : label or list of labels
        Key handed unchanged to ``X[...]`` in :meth:`transform`
        (a single column name or a list of column names).
    """

    def __init__(self, column_name):
        # Kept exactly as given: no validation and no copy.
        self.column_name = column_name

    def fit(self, X, y=None):
        """Learn nothing from ``X`` / ``y`` and return this instance."""
        return self

    def transform(self, X):
        """Return ``X[self.column_name]``."""
        selected = X[self.column_name]
        return selected

# Text branch: select the 'en' column, then vectorise it with TF-IDF.
text_pipeline = Pipeline(steps=[
    ('selector', ColumnSelector(column_name='en')),
    ('tfidf', TfidfVectorizer()),
])

# Categorical branch: select the four categorical columns, then one-hot encode
# them (dense output; categories unseen during fit are ignored at transform time).
categorical_pipeline = Pipeline(steps=[
    (
        'selector',
        ColumnSelector(column_name=['country', 'gender', 'nonpayment', 'sector']),
    ),
    (
        'onehot',
        OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
    ),
])

# Both branches receive the same input; FeatureUnion combines their outputs.
preprocessor = FeatureUnion([
    ('text', text_pipeline),
    ('cat', categorical_pipeline),
])

In [31]:
# Bare expression: display the assembled FeatureUnion as this cell's output.
preprocessor

In [28]:
# Prepare the features and target variable
X = train_df.drop(columns=columns_to_exclude)
y = train_df['status']

# Fit and transform the preprocessor on the training data
X_preprocessed = preprocessor.fit_transform(X)

# Check the dimensions after preprocessing
print(f'Preprocessed data shape: {X_preprocessed.shape}')

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

Preprocessed data shape: (5454, 17942)


In [29]:
# Train the model
# Fit a random forest with a fixed seed on the training split
# (fit() returns the estimator, so construction and fitting are chained).
classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the validation split: column 1 of predict_proba is used as the ranking
# score for ROC AUC.
y_val_pred = classifier.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f'Validation ROC AUC: {roc_auc:.4f}')

Validation ROC AUC: 0.9748


In [30]:
# Prepare the test features (excluding 'id')
# Test features: every column of test_df except 'id'.
X_test_input = test_df.drop(columns='id')

# Reuse the already-fitted preprocessor (transform only, no refit).
X_test_preprocessed = preprocessor.transform(X_test_input)

# Column 1 of predict_proba, presumably the probability of status = 1
# (this holds when the fitted classes are [0, 1]).
test_predictions = classifier.predict_proba(X_test_preprocessed)[:, 1]

# Submission frame: 'id' taken from the test data, 'status' set to the
# predicted probabilities.
submission_output = test_df[['id']].assign(status=test_predictions)

submission_output.head()

Unnamed: 0,id,status
0,5455,0.55
1,5456,0.72
2,5457,0.42
3,5458,0.77
4,5459,0.29


In [24]:
# Write the submission frame to disk as CSV, leaving the index out.
submission_output.to_csv(
    path_or_buf='C:/Users/meet9/Downloads/submission.csv',
    index=False,
)

print("Submission file created: 'submission.csv'")

Submission file created: 'submission.csv'
