In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

Loading the samples

In [2]:
samples=[]
for i in range(0,6):
    filename=open('SampleNo.'+str(i),'rb')
    samples.append(pickle.load(filename)) 
    filename.close()

In [3]:
samples[0].head()

Unnamed: 0,0,Class,1,2,3,4,5,6,7,8,...,66635,66636,66637,66638,66639,66640,66641,66642,66643,66644
4807,0.219468,2,0.0,0.0,0.0,1.8e-05,0.0,0.0,0.0,0.0,...,0.00777,0.005629,0.010226,0.010028,0.003931,0.003401,0.0,0.03442,0.000489,0.024073
909,0.094349,5,0.000497,0.000524,9.7e-05,0.000162,0.002755,0.065574,0.022222,0.001355,...,0.004538,0.005114,0.002429,0.003818,0.002409,0.00051,0.0,0.016304,0.0,0.010176
7649,0.007298,1,0.000497,0.000524,0.000193,3.6e-05,0.002755,0.016393,0.005556,0.0,...,0.005783,0.002042,0.007231,0.003416,0.002476,0.006943,0.0,0.041667,0.004618,0.01521
3903,0.053086,9,0.00149,0.001748,0.00087,0.000126,0.022039,0.131148,0.072222,0.01355,...,0.004058,0.00103,0.006124,0.000824,0.00839,0.003427,0.0,0.012681,0.0,0.021337
5385,0.150254,3,0.002732,0.002273,0.001353,0.000198,0.022039,0.180328,0.072222,0.017615,...,0.000908,0.001679,0.001254,0.004803,0.001255,0.003452,0.0,0.021739,0.0,0.004267


Initializing a dictionary where each key is feature number and value is 
- the number of times it's feature importance is greater than one for all models
- the sum of feature importances across all models

In [4]:
feature_count={}
featureimportances_sum={}

for i in range(0,66645):
    feature_count[str(i)]=0
    featureimportances_sum[str(i)]=0

In [5]:
for x,dataset in enumerate(samples):
    #train test splitting
    print("----------------Sample No.",x+1,"----------------")
    result_y=dataset['Class']
    dataset=dataset.drop(['Class'],axis=1)
    X_train, X_test, y_train, y_test = train_test_split(dataset, result_y,stratify=result_y,test_size=0.20)
    X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train,stratify=y_train,test_size=0.20)
    
    clf=RandomForestClassifier()
    clf.fit(X_train,y_train)
    
    usefulfeatures=[]
    for i,j in enumerate(clf.feature_importances_):
        if j>0:
            usefulfeatures.append(str(i))
    X_train=X_train[usefulfeatures]
    X_cv=X_cv[usefulfeatures]
    X_test=X_test[usefulfeatures]
    
    #using Randomized Search to find best parameters
    clf=RandomForestClassifier()
    prams={
     'n_estimators':[100,200,500,1000,2000],
     'max_depth':[3,5,10],
    'min_samples_split':[2,3,4]
    }
    random_cfl=RandomizedSearchCV(clf,param_distributions=prams,n_jobs=-1,)
    random_cfl.fit(X_train, y_train)
    
    print(random_cfl.best_params_)
    
    #using best parameters and fitting a model on it
    clf=RandomForestClassifier(n_estimators=random_cfl.best_params_['n_estimators'],\
                               max_depth=random_cfl.best_params_['max_depth'],\
                               min_samples_split=random_cfl.best_params_['min_samples_split'])
    clf.fit(X_train,y_train)
    
    print("Train Logloss for model on sample no.",x+1,"is",log_loss(y_train,clf.predict_proba(X_train)))
    print("CV Logloss for model on sample no.",x+1,"is",log_loss(y_cv,clf.predict_proba(X_cv)))
    print("Test Logloss for model on sample no.",x+1,"is",log_loss(y_test,clf.predict_proba(X_test)),"\n")
    
    #Logloss on treating other samples as test data
    loglossonsamples=0
    for i in range(0,6):
        if x==i:
            continue
        loglossonsamples+=log_loss(samples[i]['Class'],clf.predict_proba(samples[i][usefulfeatures]))
    loglossonsamples/=5
    print("Logloss on treating other samples as test data is: ",loglossonsamples,"\n")
    
    for i,j in enumerate(clf.feature_importances_):
        feature_count[usefulfeatures[i]]+=j
        featureimportances_sum[usefulfeatures[i]]+=1

----------------Sample No. 1 ----------------
{'n_estimators': 500, 'min_samples_split': 4, 'max_depth': 10}
Train Logloss for model on sample no. 1 is 0.08693188083867316
CV Logloss for model on sample no. 1 is 0.22614208635596778
Test Logloss for model on sample no. 1 is 0.3037386116046462 

Logloss on treating other samples as test data is:  0.3376385517697295 

----------------Sample No. 2 ----------------
{'n_estimators': 2000, 'min_samples_split': 4, 'max_depth': 3}
Train Logloss for model on sample no. 2 is 0.7891394455788866
CV Logloss for model on sample no. 2 is 0.881235732531081
Test Logloss for model on sample no. 2 is 0.915929471312762 

Logloss on treating other samples as test data is:  0.9180803807344485 

----------------Sample No. 3 ----------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_depth': 5}
Train Logloss for model on sample no. 3 is 0.34033763119488675
CV Logloss for model on sample no. 3 is 0.527963441295471
Test Logloss for model on sample no. 3

In [6]:
featureimportances_sum_array=np.zeros(66645)

for i in range(0,66645):
    featureimportances_sum_array[i]=featureimportances_sum[str(i)]
featureimportances_sum_array.sort()

In [7]:
feature_count_array=np.zeros(66645)

for i in range(0,66645):
    feature_count_array[i]=feature_count[str(i)]
feature_count_array.sort()

No. of features with Non-Zero feature importance:

In [8]:
len(np.nonzero(featureimportances_sum_array)[0])

6195

Traditional way of Random Forest Feature Selection
- Fit a model onto a sample of data
- Select features on basis of feature importances

In [9]:
#As we already have 6 samples, let's fit model on one of the sample
dataset=samples[3]

result_y=dataset['Class']
dataset=dataset.drop(['Class'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(dataset, result_y,stratify=result_y,test_size=0.20)
X_train, X_cv, y_train, y_cv = train_test_split(X_train, y_train,stratify=y_train,test_size=0.20)
    
clf=RandomForestClassifier()
clf.fit(X_train,y_train)

RandomForestClassifier()

In [10]:
traditional_way_importance={}
for i,j in enumerate(clf.feature_importances_):
    traditional_way_importance[i]=j

In [11]:
traditional_way_importance_array=np.zeros(66645)

for i in range(0,66645):
    traditional_way_importance_array[i]=traditional_way_importance[i]
traditional_way_importance_array.sort()

In [12]:
len(np.nonzero(traditional_way_importance_array)[0])

1366

# Real Feature Importances
- Obtained by fitting model onto 60% of whole dataset

In [13]:
#loading the real feature importances
realfeatureimportances=pickle.load(open("featureimportances",'rb'))

In [14]:
realfeatureimportances_array=np.zeros(66645)
for i in range(0,66645):
    realfeatureimportances_array[i]=realfeatureimportances[i]
realfeatureimportances_array.sort()

In [15]:
len(np.nonzero(realfeatureimportances_array)[0])

19882

## How many of the features with importance more than zero are important when we consider the whole dataset?

In [16]:
correct=0
for i in realfeatureimportances:
    if featureimportances_sum[str(i)]>0:
        if realfeatureimportances[i]>0:
            correct+=1
print(correct,"features from our samples are present in the real model")

2854 features from our samples are present in the real model


In [17]:
correct=0
for i in realfeatureimportances:
    if traditional_way_importance[i]>0:
        if realfeatureimportances[i]>0:
            correct+=1
print(correct,"features from samples in traditional way are present in the real model")

751 features from samples in traditional way are present in the real model


# Comparison of top 500 features
- How many of the top 500 features are present in top 500 features of real model?

### Summation of feature importances

In [18]:
correct=0
for i in realfeatureimportances:
    if realfeatureimportances[i]>=realfeatureimportances_array[-500]:
        if featureimportances_sum[str(i)]>featureimportances_sum_array[-500]:
            correct+=1
print("No. of our selected features in top 500 features of real model: ",correct)

No. of our selected features in top 500 features of real model:  123


### Count based

In [19]:
correct=0
for i in realfeatureimportances:
    if realfeatureimportances[i]>=realfeatureimportances_array[-500]:
        if feature_count[str(i)]>feature_count_array[-500]:
            correct+=1
print("No. of our selected features in top 500 features of real model: ",correct)

No. of our selected features in top 500 features of real model:  220


### Traditional way

In [20]:
correct=0
for i in realfeatureimportances:
    if realfeatureimportances[i]>=realfeatureimportances_array[-500]:
        if traditional_way_importance[i]>traditional_way_importance_array[-500]:
            correct+=1
print("No. of our selected features in top 500 features of real model: ",correct)

No. of our selected features in top 500 features of real model:  116
