In [41]:
#Loading Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from costcla.models import CostSensitiveRandomForestClassifier
from costcla.models import CostSensitiveRandomPatchesClassifier
from costcla.metrics import savings_score
from sys import stdout
import time


In [2]:
# Read the raw transactions table from the CSV next to the notebook.
dfs = pd.read_csv('creditcard.csv')

In [3]:
# Target column, feature columns and row positions.
# NOTE(review): assumes the first 30 columns are the features and that the
# last of them is the per-row amount that fpfncost() reads — confirm against
# the CSV header.
y_all = dfs["Class"]
X_all = dfs.iloc[:, :30]
indices = np.arange(len(dfs))

In [4]:
# Hold out a stratified 20% of the rows for the final performance test.
# The row positions are split alongside the data so that test rows can be
# traced back to the original frame.
(X_train, X_test,
 Y_train, Y_test,
 indices_train, indices_test) = train_test_split(
    X_all, y_all, indices,
    test_size=0.20,
    stratify=y_all,
    random_state=814,
)

In [51]:
def runForests(seed):
    """Compare a plain random forest with a cost-sensitive random forest.

    Both models are fitted on the notebook-level ``X_train``/``Y_train`` split
    and scored on ``X_test``/``Y_test`` with ``savings_score``, using the cost
    matrices produced by ``cond``.

    Parameters
    ----------
    seed : int
        Used as ``random_state`` for ``RandomForestClassifier`` and passed to
        ``np.random.seed`` right before the cost-sensitive model is created.

    Returns
    -------
    tuple
        ``(seed, rf, csdt, rf_savings, csdt_savings)``: the seed, the fitted
        plain forest, the fitted cost-sensitive forest, and the savings score
        of each on the test split.
    """
    # Convert the frames once; the cost-sensitive estimator and savings_score
    # are fed plain arrays.
    X_train_arr, Y_train_arr = np.array(X_train), np.array(Y_train)
    X_test_arr, Y_test_arr = np.array(X_test), np.array(Y_test)

    # cond() walks the rows in a Python loop, so build each cost matrix a
    # single time instead of once per call that needs it.
    cost_train = np.array(cond(X_train))
    cost_test = np.array(cond(X_test))

    # Baseline: cost-insensitive random forest.
    rf = RandomForestClassifier(random_state=seed).fit(X_train, Y_train)
    y_pred_test_rf = rf.predict(X_test)

    # Cost-sensitive random forest; the global NumPy RNG is seeded first,
    # exactly as before.
    np.random.seed(seed)
    f = CostSensitiveRandomForestClassifier()
    csdt = f.fit(X_train_arr, Y_train_arr, cost_train)
    y_pred_test_csdt = csdt.predict(X_test_arr)

    # Savings using the plain RandomForestClassifier
    rf_savings = savings_score(Y_test_arr, np.array(y_pred_test_rf), cost_test)
    # Savings using the CostSensitiveRandomForestClassifier
    csdt_savings = savings_score(Y_test_arr, np.array(y_pred_test_csdt), cost_test)
    return (seed, rf, csdt, rf_savings, csdt_savings)

def runPForests(seed):
    """Compare a plain random forest with a cost-sensitive random-patches model.

    Both models are fitted on the notebook-level ``X_train``/``Y_train`` split
    and scored on ``X_test``/``Y_test`` with ``savings_score``, using the cost
    matrices produced by ``cond``.

    Parameters
    ----------
    seed : int
        Used as ``random_state`` for ``RandomForestClassifier`` and passed to
        ``np.random.seed`` right before the cost-sensitive model is created.

    Returns
    -------
    tuple
        ``(seed, rf, csdt, rf_savings, csdt_savings)``: the seed, the fitted
        plain forest, the fitted ``CostSensitiveRandomPatchesClassifier``, and
        the savings score of each on the test split.
    """
    # Convert the frames once; the cost-sensitive estimator and savings_score
    # are fed plain arrays.
    X_train_arr, Y_train_arr = np.array(X_train), np.array(Y_train)
    X_test_arr, Y_test_arr = np.array(X_test), np.array(Y_test)

    # cond() walks the rows in a Python loop, so build each cost matrix a
    # single time instead of once per call that needs it.
    cost_train = np.array(cond(X_train))
    cost_test = np.array(cond(X_test))

    # Baseline: cost-insensitive random forest.
    rf = RandomForestClassifier(random_state=seed).fit(X_train, Y_train)
    y_pred_test_rf = rf.predict(X_test)

    # Cost-sensitive random patches; the global NumPy RNG is seeded first,
    # exactly as before.
    np.random.seed(seed)
    f = CostSensitiveRandomPatchesClassifier()
    csdt = f.fit(X_train_arr, Y_train_arr, cost_train)
    y_pred_test_csdt = csdt.predict(X_test_arr)

    # Savings using the plain RandomForestClassifier
    rf_savings = savings_score(Y_test_arr, np.array(y_pred_test_rf), cost_test)
    # Savings using the CostSensitiveRandomPatchesClassifier
    csdt_savings = savings_score(Y_test_arr, np.array(y_pred_test_csdt), cost_test)
    return (seed, rf, csdt, rf_savings, csdt_savings)

def runPrunedForests(seed):
    """Compare the two cost-sensitive ensembles with ``pruned=True``.

    Unlike ``runForests``/``runPForests`` there is no plain scikit-learn
    baseline here: both models are cost-sensitive. They are fitted on the
    notebook-level ``X_train``/``Y_train`` split and scored on
    ``X_test``/``Y_test`` with ``savings_score``, using the cost matrices
    produced by ``cond``.

    Parameters
    ----------
    seed : int
        Passed to ``np.random.seed`` before each of the two models is created.

    Returns
    -------
    tuple
        ``(seed, rf, csdt, rf_savings, csdt_savings)`` where ``rf`` is the
        fitted ``CostSensitiveRandomForestClassifier(pruned=True)`` and
        ``csdt`` is the fitted ``CostSensitiveRandomPatchesClassifier(pruned=True)``;
        the last two items are their savings scores on the test split.
    """
    # Convert the frames once; both estimators and savings_score are fed
    # plain arrays.
    X_train_arr, Y_train_arr = np.array(X_train), np.array(Y_train)
    X_test_arr, Y_test_arr = np.array(X_test), np.array(Y_test)

    # cond() walks the rows in a Python loop, so build each cost matrix a
    # single time instead of once per fit / savings_score call.
    cost_train = np.array(cond(X_train))
    cost_test = np.array(cond(X_test))

    # Pruned cost-sensitive random forest.
    np.random.seed(seed)
    rfm = CostSensitiveRandomForestClassifier(pruned=True)
    rf = rfm.fit(X_train_arr, Y_train_arr, cost_train)
    y_pred_test_rf = rf.predict(X_test_arr)

    # Pruned cost-sensitive random patches, re-seeded with the same value.
    np.random.seed(seed)
    f = CostSensitiveRandomPatchesClassifier(pruned=True)
    csdt = f.fit(X_train_arr, Y_train_arr, cost_train)
    y_pred_test_csdt = csdt.predict(X_test_arr)

    # Savings using the pruned CostSensitiveRandomForestClassifier
    rf_savings = savings_score(Y_test_arr, np.array(y_pred_test_rf), cost_test)
    # Savings using the pruned CostSensitiveRandomPatchesClassifier
    csdt_savings = savings_score(Y_test_arr, np.array(y_pred_test_csdt), cost_test)
    return (seed, rf, csdt, rf_savings, csdt_savings)

class Timer:
    """Context manager that measures how long its ``with`` body takes.

    After the block exits, ``start`` and ``end`` hold the two clock readings
    and ``interval`` holds ``end - start`` in seconds. Exceptions raised in
    the body are not suppressed.
    """

    # time.clock() was deprecated in Python 3.3 and removed in 3.8, so calling
    # it raises AttributeError on current interpreters. Prefer perf_counter()
    # and only fall back to clock() where perf_counter() does not exist.
    # Note: perf_counter() measures elapsed (wall-clock) time.
    _now = staticmethod(getattr(time, "perf_counter", None) or time.clock)

    def __enter__(self):
        self.start = self._now()
        return self

    def __exit__(self, *args):
        self.end = self._now()
        self.interval = self.end - self.start

def fpfncost(X,i):
    """Build the cost row for sample ``i`` of the 2-D array ``X``.

    Returns the 4-element list ``[1, X[i, last_column], 0, 0]``.
    NOTE(review): presumably ordered as [false-positive, false-negative,
    true-positive, true-negative] cost, as costcla expects — confirm.
    """
    last_column = np.shape(X)[1] - 1
    row_value = X[i, last_column]
    return [1, row_value, 0, 0]

def cond(X):
    """Stack the per-row cost lists from ``fpfncost`` into a single array.

    ``X`` is converted to a NumPy array first, so a DataFrame or a nested
    list is accepted; the result has one row per row of ``X``.
    """
    data = np.array(X)
    cost_rows = []
    for row_idx in range(data.shape[0]):
        cost_rows.append(fpfncost(data, row_idx))
    return np.array(cost_rows)


In [26]:
# Plain random forest vs. cost-sensitive random forest, seeds 1..10.
# Each entry of `runs` is the tuple returned by runForests().
runs = []
for seed in range(1, 11):
    print(seed)
    # Time the fit + scoring of both models for this seed.
    with Timer() as t:
        runs.append(runForests(seed))
    print("\t \t took " + str(int(t.interval)) + " seconds")
    stdout.flush()

1
	 	 took 258 seconds
2
	 	 took 295 seconds
3
	 	 took 294 seconds
4
	 	 took 254 seconds
5
	 	 took 279 seconds
6
	 	 took 270 seconds
7
	 	 took 282 seconds
8
	 	 took 292 seconds
9
	 	 took 254 seconds
10
	 	 took 305 seconds


In [30]:
import pandas as pd

# Keep only the seed and the two savings scores of every run; the fitted
# models in positions 1 and 2 of each tuple are left out of the table.
results = pd.DataFrame(
    [(seed, rfs, csdts) for (seed, rf, csdt, rfs, csdts) in runs],
    columns=['seeds', 'rf', 'csdt'],
)

In [31]:
# Display the per-seed savings table currently bound to `results`.
results

Unnamed: 0,seeds,rf,csdt
0,1,0.688503,0.673393
1,2,0.789606,0.693814
2,3,0.617953,0.687842
3,4,0.756307,0.775901
4,5,0.707602,0.776067
5,6,0.768084,0.776232
6,7,0.694145,0.708677
7,8,0.688503,0.774255
8,9,0.754487,0.775984
9,10,0.688338,0.755728


In [38]:
# Mean and standard deviation of each savings column across the seeds.
rfm, rfstd = results['rf'].mean(), results['rf'].std()
csdtm, csdtstd = results['csdt'].mean(), results['csdt'].std()

# Row 0 summarises the 'rf' column, row 1 the 'csdt' column; the two
# unnamed columns are (mean, std).
pd.DataFrame(data=[(rfm, rfstd), (csdtm, csdtstd)])

Unnamed: 0,0,1
0,0.715353,0.051284
1,0.739789,0.043319


In [44]:
# Plain random forest vs. cost-sensitive random patches, seeds 1..10.
# Each entry of `runsP` is the tuple returned by runPForests().
runsP = []
for seed in range(1, 11):
    print(seed)
    # Time the fit + scoring of both models for this seed.
    with Timer() as t:
        runsP.append(runPForests(seed))
    print("\t \t took " + str(int(t.interval)) + " seconds")
    stdout.flush()

1
	 	 took 370 seconds
2
	 	 took 417 seconds
3
	 	 took 366 seconds
4
	 	 took 398 seconds
5
	 	 took 407 seconds
6
	 	 took 387 seconds
7
	 	 took 420 seconds
8
	 	 took 348 seconds
9
	 	 took 422 seconds
10
	 	 took 391 seconds


In [45]:
# Per-seed savings for the random-patches comparison. This rebinds
# `results`, replacing whatever table an earlier cell stored there.
results = pd.DataFrame(
    [(seed, rfs, csdts) for (seed, rf, csdt, rfs, csdts) in runsP],
    columns=['seeds', 'rf', 'csdt'],
)
results

Unnamed: 0,seeds,rf,csdt
0,1,0.688503,0.790764
1,2,0.789606,0.671104
2,3,0.617953,0.776149
3,4,0.756307,0.671187
4,5,0.707602,0.670638
5,6,0.768084,0.775984
6,7,0.694145,0.671104
7,8,0.688503,0.775736
8,9,0.754487,0.790764
9,10,0.688338,0.775984


In [46]:
# Mean and standard deviation of each savings column across the seeds,
# computed from whichever table `results` currently refers to.
rfm, rfstd = results['rf'].mean(), results['rf'].std()
csdtm, csdtstd = results['csdt'].mean(), results['csdt'].std()

# Row 0 summarises the 'rf' column, row 1 the 'csdt' column; the two
# unnamed columns are (mean, std).
pd.DataFrame(data=[(rfm, rfstd), (csdtm, csdtstd)])

Unnamed: 0,0,1
0,0.715353,0.051284
1,0.736941,0.057032


In [52]:
# Pruned cost-sensitive random forest vs. pruned random patches, seeds 1..10.
# Each entry of `runsPruned` is the tuple returned by runPrunedForests().
runsPruned = []
for seed in range(1, 11):
    print(seed)
    # Only the model fitting and scoring is timed; the append happens after.
    with Timer() as t:
        rp = runPrunedForests(seed)
    runsPruned.append(rp)
    print("\t \t took " + str(int(t.interval)) + " seconds")
    stdout.flush()

1
	 	 took 597 seconds
2
	 	 took 683 seconds
3
	 	 took 627 seconds
4
	 	 took 625 seconds
5
	 	 took 666 seconds
6
	 	 took 637 seconds
7
	 	 took 685 seconds
8
	 	 took 622 seconds
9
	 	 took 656 seconds
10
	 	 took 655 seconds


In [53]:
# Per-seed savings for the pruned comparison. Here the 'rf' column holds the
# pruned cost-sensitive random forest and 'csdt' the pruned random patches
# (see the return value of runPrunedForests), not a plain random forest.
# NOTE(review): the recorded output below matches the cost-sensitive columns
# of the two unpruned experiments number for number — confirm that
# pruned=True actually changes the fitted models.
results = pd.DataFrame(
    [(seed, rfs, csdts) for (seed, rf, csdt, rfs, csdts) in runsPruned],
    columns=['seeds', 'rf', 'csdt'],
)
results

Unnamed: 0,seeds,rf,csdt
0,1,0.673393,0.790764
1,2,0.693814,0.671104
2,3,0.687842,0.776149
3,4,0.775901,0.671187
4,5,0.776067,0.670638
5,6,0.776232,0.775984
6,7,0.708677,0.671104
7,8,0.774255,0.775736
8,9,0.775984,0.790764
9,10,0.755728,0.775984


In [54]:
# Mean and standard deviation of each savings column across the seeds,
# computed from whichever table `results` currently refers to.
rfm, rfstd = results['rf'].mean(), results['rf'].std()
csdtm, csdtstd = results['csdt'].mean(), results['csdt'].std()

# Row 0 summarises the 'rf' column, row 1 the 'csdt' column; the two
# unnamed columns are (mean, std).
pd.DataFrame(data=[(rfm, rfstd), (csdtm, csdtstd)])

Unnamed: 0,0,1
0,0.739789,0.043319
1,0.736941,0.057032
