**Function to read a training or test dataset and return a pandas data frame**

**Input:** fileid for the training data set or test dataset (csv file) that you downloaded to your computer

**Output:** a pandas dataframe

Note that the two datasets provided each consist of string fields.
- All numeric fields are converted to floats 
(even integer fields since we can't have missing values in a pandas int series)
- All string fields remain as objects.
- Dates are converted to datetime.



In [178]:
import pandas as pd
import numpy as np


def read_data(fileid): 
    stringvariables=["LOAN_ID","CHANNEL", "FIRST_TIME_HOME_BUYER_IND",
                  "LOAN_PURPOSE", "PROPERTY_TYPE", "OCCUPANCY_STATUS",
                  "PROPERTY_STATE", "ZIP_CODE_SHORT"]
    floatvariables=["NUMBER_OF_UNITS","FORECLOSURE","LTV",
                   "NUMBER_OF_BORROWERS","DTI","LOAN_AGE",
                   "REM_MONTHS_LEGAL_MATURITY", "REM_MONTHS_MATURITY",
                   "ORIGINAL_LOAN_TERM", "CLTV",
                   "B_CREDIT_SCORE_O","CB_CREDIT_SCORE_O", "NMONTHS",
                   "MSA","MORTGAGE_INSURANCE_PERCENTAGE",
                   "ORIGINAL_INTEREST_RATE", "CURRENT_INTEREST_RATE",
                   "ORIGINAL_UPB"]
    datevariables=["MONTHLY_REPORTING_PERIOD",
                   "ORIGINATION_DATE",
                   "FIRST_PAYMENT_DATE",
                   "MATURITY_DATE"]
    objectvariables=stringvariables + datevariables
    dtypes={}
    for s in objectvariables:
        dtypes[s]="object"
    for s in floatvariables:
        dtypes[s]="float"
    na_strings=["".join([" " for j in range(m)]) for m in range(10)]
    df=pd.read_csv(fileid,dtype=dtypes,na_values=na_strings)
    for datevar in datevariables:
        df[datevar]=pd.to_datetime(df[datevar],format="%m%Y")
    return(df)


In [183]:
# Import necessary modules
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder

def feature_encode(df, features):
    le = LabelEncoder()
    for feature in features:
        df[feature] = le.fit_transform(df[feature])
    return df

def fill_by_mean(df, features):
    for feature in features:
        df[feature].fillna(value=df[feature].mean(), inplace=True)
    return df

# Read the data
train_data = 'trainTXMM.csv'
test_data = 'test_predsTXMM.csv'

train_df = read_data(train_data)
test_df = read_data(test_data)

# Encode the categorical fields as numbers
category_cols = ['FIRST_TIME_HOME_BUYER_IND', 'LOAN_PURPOSE', 'PROPERTY_TYPE', 'CHANNEL', 'PROPERTY_STATE', 'OCCUPANCY_STATUS']
train_df = feature_encode(train_df, category_cols)
test_df = feature_encode(test_df, category_cols)

# remove records in train dataset contain NA value
train_df = train_df.dropna()

# fill the NA value for test dataset
na_features = ['CURRENT_INTEREST_RATE', 'LOAN_AGE', 'REM_MONTHS_LEGAL_MATURITY', 'REM_MONTHS_MATURITY', 'LTV', 'CLTV', 'NUMBER_OF_BORROWERS', 'DTI', 'B_CREDIT_SCORE_O']
test_df = fill_by_mean(test_df, na_features)
test_df.fillna(method='ffill', inplace=True)

# extract new feature from data
train_df['ORIGINATION_YEAR'] = pd.DatetimeIndex(train_df['ORIGINATION_DATE']).year
train_df['DURATION'] = pd.DatetimeIndex(train_df['MATURITY_DATE']).year - pd.DatetimeIndex(train_df['ORIGINATION_DATE']).year
test_df['ORIGINATION_YEAR'] = pd.DatetimeIndex(test_df['ORIGINATION_DATE']).year
test_df['DURATION'] = pd.DatetimeIndex(test_df['MATURITY_DATE']).year - pd.DatetimeIndex(test_df['ORIGINATION_DATE']).year

# Select the response variable and the input features
y_train = train_df['NMONTHS']
X_train = train_df.drop(['NMONTHS', 'LOAN_ID', 'MONTHLY_REPORTING_PERIOD', 'ORIGINATION_DATE', 'FIRST_PAYMENT_DATE', 'MATURITY_DATE', 'FORECLOSURE', 'CB_CREDIT_SCORE_O', 'MORTGAGE_INSURANCE_PERCENTAGE'], axis=1)
X_test = test_df.drop(['LOAN_ID', 'MONTHLY_REPORTING_PERIOD', 'ORIGINATION_DATE', 'FIRST_PAYMENT_DATE', 'MATURITY_DATE',  'CB_CREDIT_SCORE_O', 'MORTGAGE_INSURANCE_PERCENTAGE'], axis=1)

# Evaluate the importances of the input features
model = RandomForestRegressor()
model.fit(X_train, y_train)
importances = model.feature_importances_

# Select the features having significant impact on the response variable
significant_features = []
for i, importance in enumerate(importances):
    if importance >= 0.01:
        significant_features.append(X_train.columns[i])

# Build the regression model using the significant features
X_selected = X_train[significant_features]
X_test = X_test[significant_features]

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X_selected, y_train, test_size=0.2, random_state=42)

# Normalize fetures
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train)
X_test = preprocessing.StandardScaler().fit(X_test).transform(X_test)
X_val = preprocessing.StandardScaler().fit(X_val).transform(X_val)

# Train the model
NMONTHS_model = RandomForestRegressor()
NMONTHS_model.fit(X_train, y_train)

# Evaluate the model performance on the testing set
y_pred = NMONTHS_model.predict(X_val)
mse = mean_absolute_error(y_val, y_pred)
print('Mean absolute error:', mse)

Mean absolute error: 34.03310512747064


In [184]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample

y_train = train_df['FORECLOSURE']

# Oversample the data to balance the labels
X_oversampled, y_oversampled = resample(X_selected[y_train==1], y_train[y_train==1], n_samples=len(X_selected[y_train==0]), random_state=42)
X_resampled = pd.concat([X_selected[y_train==0], X_oversampled])
y_resampled = pd.concat([y_train[y_train==0], y_oversampled])

# Split the data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2)

# Normalize fetures
X_train = preprocessing.StandardScaler().fit(X_train).transform(X_train)
X_val = preprocessing.StandardScaler().fit(X_val).transform(X_val)

# Build the logistic regression model
FORECLOSURE_model = LogisticRegression(solver='lbfgs', max_iter=500)
FORECLOSURE_model.fit(X_train, y_train)

# Evaluate the model on the test data
y_pred = FORECLOSURE_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.70      0.71      3358
         1.0       0.71      0.72      0.71      3318

    accuracy                           0.71      6676
   macro avg       0.71      0.71      0.71      6676
weighted avg       0.71      0.71      0.71      6676



In [189]:
NMONTHS_result = NMONTHS_model.predict(X_test)
FORECLOSURE_result = FORECLOSURE_model.predict_proba(X_test)
BOUNDRY_VALUE = np.sort(FORECLOSURE_result[:,0], axis=None)[1000]
print(BOUNDRY_VALUE)
FORECLOSURE_result[:,0] = FORECLOSURE_result[:,0] < BOUNDRY_VALUE
FORECLOSURE_result[:,1] = FORECLOSURE_result[:,1] > 1
LOAN_ID = test_df['LOAN_ID']
result_tuples = list(zip(LOAN_ID, NMONTHS_result, FORECLOSURE_result[:,0]))
df = pd.DataFrame(result_tuples,
                  columns=['LOAN_ID', 'NMONTHS', 'FORECLOSURE'])
df.to_csv('preds.csv', index=False) 

0.04254352865513211
