# **Random Forest**

Log-loss Score obtained: **0.68696**

## **Setup**

In [1]:
!git clone https://github.com/yeray142/AIML_project

Cloning into 'AIML_project'...
remote: Enumerating objects: 53, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 53 (delta 17), reused 16 (delta 1), pack-reused 0[K
Unpacking objects: 100% (53/53), done.


In [2]:
cd AIML_project

/content/AIML_project


In [175]:
import pandas as pd
import numpy as np

# Read the train and test splits the same way; the first CSV column is used as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../data/rf_train.csv", "../data/rf_test.csv")
)

In [176]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

## **Data preprocessing**

In [177]:
train.head()

Unnamed: 0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month,credit
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,0,0,0,No job,2.0,6,1.0
1,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,0,1,0,Sales staff,2.0,37,0.0
2,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,15037,2105,0,0,0,Managers,2.0,26,2.0
3,F,N,Y,2,270000.0,Working,Secondary / secondary special,Married,House / apartment,13413,4996,0,0,1,High skill tech staff,4.0,18,1.0
4,F,N,N,0,315000.0,Working,Secondary / secondary special,Separated,House / apartment,17570,1978,0,0,1,Core staff,1.0,41,2.0


In [178]:
test.head()

Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,days_birth,days_employed,work_phone,home_phone,email,occup_type,family_size,begin_month
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
26457,M,Y,N,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,21990,0,0,1,0,No job,2.0,60
26458,F,N,Y,0,135000.0,State servant,Higher education,Married,House / apartment,18964,8671,0,1,0,Core staff,2.0,36
26459,F,N,Y,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,15887,217,1,1,0,Laborers,2.0,40
26460,M,Y,N,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,19270,2531,1,0,0,Drivers,2.0,41
26461,F,Y,Y,0,225000.0,State servant,Higher education,Married,House / apartment,17822,9385,1,0,0,Managers,2.0,8


### Feature Engineering

In [179]:
def _remainder(values, unit, period):
    """Count whole `unit`-sized chunks in `values`, then return that count's remainder w.r.t. `period`.

    The remainder is taken by subtracting the truncated quotient times `period`.
    """
    chunks = np.floor(values / unit)
    return chunks - (chunks / period).astype(int) * period


for frame in (train, test):
    birth, employed = frame['days_birth'], frame['days_employed']

    # Age in whole 365-day years, plus the position of days_birth within
    # 12 thirty-day "months" and within 4 seven-day "weeks".
    frame['Age'] = birth // 365
    frame['birth_month'] = _remainder(birth, 30, 12)
    frame['birth_week'] = _remainder(birth, 7, 4)

    # The same three features, derived from days_employed.
    frame['ages_employed'] = employed // 365
    frame['employ_month'] = _remainder(employed, 30, 12)
    frame['employ_week'] = _remainder(employed, 7, 4)

    # Income per family member.
    frame["income_family"] = frame["income_total"] / frame["family_size"]

    # Drop the raw columns to deal with multicollinearity.
    # NOTE: this mutates the frame in place, so the cell cannot be run twice
    # on the same frames (the dropped columns would be missing).
    frame.drop(columns=['child_num', 'days_birth', 'days_employed'], inplace=True)

In [150]:
train.columns

Index(['gender', 'car', 'reality', 'income_total', 'income_type', 'edu_type',
       'family_type', 'house_type', 'work_phone', 'home_phone', 'email',
       'occup_type', 'family_size', 'begin_month', 'credit', 'Age',
       'birth_month', 'birth_week', 'ages_employed', 'employ_month',
       'employ_week', 'income_family'],
      dtype='object')

In [151]:
test.columns

Index(['gender', 'car', 'reality', 'income_total', 'income_type', 'edu_type',
       'family_type', 'house_type', 'work_phone', 'home_phone', 'email',
       'occup_type', 'family_size', 'begin_month', 'Age', 'birth_month',
       'birth_week', 'ages_employed', 'employ_month', 'employ_week',
       'income_family'],
      dtype='object')

### Encoding

In [180]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
is_categorical = (train.dtypes == "object").to_numpy()

cat = train.columns[is_categorical].tolist()

num = train.columns[~is_categorical].tolist()
num.remove('credit')  # 'credit' is the target, not a feature

In [153]:
num

['income_total',
 'work_phone',
 'home_phone',
 'email',
 'family_size',
 'begin_month',
 'Age',
 'birth_month',
 'birth_week',
 'ages_employed',
 'employ_month',
 'employ_week',
 'income_family']

In [154]:
cat

['gender',
 'car',
 'reality',
 'income_type',
 'edu_type',
 'family_type',
 'house_type',
 'occup_type']

In [155]:
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,home_phone,...,family_size,begin_month,credit,Age,birth_month,birth_week,ages_employed,employ_month,employ_week,income_family
0,F,N,N,202500.0,Commercial associate,Higher education,Married,Municipal apartment,0,0,...,2.0,6,1.0,38,7.0,1.0,12,0.0,0.0,101250.0
1,F,N,Y,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,0,1,...,2.0,37,0.0,41,10.0,3.0,5,9.0,2.0,101250.0
2,F,Y,Y,157500.0,State servant,Higher education,Married,House / apartment,0,0,...,2.0,26,2.0,41,9.0,0.0,5,10.0,0.0,78750.0
3,F,N,Y,270000.0,Working,Secondary / secondary special,Married,House / apartment,0,0,...,4.0,18,1.0,36,3.0,0.0,13,10.0,1.0,67500.0
4,F,N,N,315000.0,Working,Secondary / secondary special,Separated,House / apartment,0,0,...,1.0,41,2.0,48,9.0,2.0,5,5.0,2.0,315000.0


Encoding with `OrdinalEncoder` or with `LabelEncoder` gives exactly the same result; there is no difference between the two:

In [181]:
# OrdinalEncoding
# The encoder is fitted on the train categorical columns only and then applied
# to test, so both frames share one category -> code mapping.
# The target is deliberately not passed: OrdinalEncoder ignores `y`, so passing
# train['credit'] only suggested a target-aware encoding that never took place.
# NOTE(review): with the encoder's default settings, a category that appears
# only in test makes `transform` raise -- confirm the test set has none.
oenc = OrdinalEncoder()
train[cat] = oenc.fit_transform(train[cat])
test[cat] = oenc.transform(test[cat])

In [139]:
# THIS CODE IS DEPRECATED AND SHOULD NOT BE USED ANYMORE

# LabelEncoding: one encoder per categorical column, fitted on the train
# column and reused to transform the matching test column.
for col in cat:
    lenc = LabelEncoder().fit(train[col])
    train[col] = lenc.transform(train[col])
    test[col] = lenc.transform(test[col])

In [182]:
# StandardScaler
# Fit on the train numerical columns only, then apply the same fitted scaler
# to both train and test.
scaler = StandardScaler().fit(train[num])
train[num] = scaler.transform(train[num])
test[num] = scaler.transform(test[num])

## **Model and train**

In [183]:
def stratified_kfold_rf(p, n_fold, X, y, X_test):
    """Fit one random forest per stratified fold and collect class probabilities.

    Parameters
    ----------
    p : dict
        Keyword arguments forwarded to ``RandomForestClassifier(**p)``.
    n_fold : int
        Number of stratified folds (splits are shuffled with ``random_state=42``).
    X : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Training labels, row-aligned with ``X``.
    X_test : pandas.DataFrame
        Rows to predict with every fold's model.

    Returns
    -------
    rf_oof : numpy.ndarray of shape (len(X), n_classes)
        Out-of-fold probabilities: each training row is predicted by the model
        of the fold in which that row was held out for validation.
    rf_preds : numpy.ndarray of shape (len(X_test), n_classes)
        Test probabilities averaged over the ``n_fold`` models.

    In both arrays the columns follow the sorted unique labels of ``y``.
    Per-fold and overall log-loss scores are printed.
    """
    # The sorted unique labels fix the number and order of probability columns,
    # so the function works for any number of classes.
    classes = np.unique(y)
    n_classes = len(classes)

    # Declaring Stratified K-Fold:
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

    # Initializing arrays:
    rf_oof = np.zeros((X.shape[0], n_classes))
    rf_preds = np.zeros((X_test.shape[0], n_classes))

    # Main loop:
    for fold, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
        print(f"============ Fold {fold} ============")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # Model fit:
        model = RandomForestClassifier(**p)
        model.fit(X_train, y_train)

        # predict_proba returns one column per entry of model.classes_. Map
        # those onto the global class order so a fold that happens to miss a
        # class still fills the right columns (the missing class stays at 0).
        cols = np.searchsorted(classes, model.classes_)

        # Class probabilities for the held-out rows of this fold.
        rf_oof[np.ix_(valid_idx, cols)] = model.predict_proba(X_valid)
        # Each fold contributes 1/n_fold of the final test probabilities.
        rf_preds[:, cols] += model.predict_proba(X_test) / n_fold

        # labels= keeps the score defined even if y_valid lacks a class.
        fold_score = log_loss(y_valid, rf_oof[valid_idx], labels=classes)
        print(f"Log Loss Score: {fold_score:.5f}")

    log_score = log_loss(y, rf_oof, labels=classes)
    print(f"\nLog Loss Score: {log_score:.5f}")

    return rf_oof, rf_preds

In [184]:
# Keyword arguments handed to RandomForestClassifier(**p) for every fold.
p = dict(
    criterion="entropy",
    n_estimators=300,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features="sqrt",
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [185]:
# Target column, the remaining feature columns, and an independent copy of test.
y = train["credit"]
X = train.drop(columns="credit")
X_test = test.copy()

In [186]:
rf_oof, rf_preds = stratified_kfold_rf(p, 10, X, y, X_test)

Log Loss Score: 0.69228
Log Loss Score: 0.69004
Log Loss Score: 0.67080
Log Loss Score: 0.67265
Log Loss Score: 0.69034
Log Loss Score: 0.67926
Log Loss Score: 0.70519
Log Loss Score: 0.68649
Log Loss Score: 0.68658
Log Loss Score: 0.69597

Log Loss Score: 0.68696


In [187]:
rf_preds.shape

(10000, 3)

In [188]:
rf_oof.shape

(23392, 3)

## **Save results**

In [189]:
# Start from the sample submission, indexed by its "index" column, and
# overwrite every cell with the fold-averaged test probabilities.
# NOTE(review): no call that writes `submission` to disk is visible in this
# section -- confirm it is saved elsewhere.
submission = pd.read_csv("../data/sample_submission.csv", index_col="index")
submission[:] = rf_preds
submission.head()

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0.095201,0.193054,0.711744
26458,0.291735,0.244377,0.463888
26459,0.048889,0.093058,0.858053
26460,0.087341,0.088834,0.823825
26461,0.113591,0.170563,0.715846
