# Load Train Set

features (3681) | labels (1)

In [1]:
# import pandas as pd
import cudf
import cupy
import pandas as pd
import numpy as np

# df = pd.read_feather("dts/np_dataset_train.ftr")
df = pd.read_csv("dts/np_dataset_train.csv")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Columns: 3684 entries, Unnamed: 0 to Label
dtypes: bool(1), float64(3682), int64(1)
memory usage: 1.1 GB


## train/test Split

In [2]:
from sklearn.model_selection import train_test_split

rs = 37

train, test = train_test_split(df, test_size=0.2, random_state=rs)
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32000 entries, 15893 to 34703
Columns: 3684 entries, Unnamed: 0 to Label
dtypes: bool(1), float64(3682), int64(1)
memory usage: 899.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 273 to 564
Columns: 3684 entries, Unnamed: 0 to Label
dtypes: bool(1), float64(3682), int64(1)
memory usage: 224.9 MB


## Classifier definition

In [3]:
from random import seed
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as tree_classifier
from scipy import stats
import cuml

# from cuml import tree_classifier


class random_forest():
    def __init__(self, number_of_trees=1, number_of_iteractions=1, max_depth=1, random_state=1):
        self.iteractions = number_of_iteractions
        self.number_of_trees = number_of_trees
        self.max_depth = max_depth
        self.forest = []
        self.random_state = random_state

    def bootstrap(self, x_samples):
        """ Create subsets of the input, and returns the indexes 
            from the original sample."""
        index = []

        sample_length = int(np.floor(len(x_samples)/self.number_of_trees))
        np.random.RandomState(self.random_state)

        index = np.random.randint(
            0, sample_length, (self.number_of_trees, sample_length))

        return index

    def predict(self, X):
        y = []
        for tree in self.forest:
            y.append(tree.predict(X))
        mode_ = stats.mode(y, axis=0)
        mode_ = np.transpose(mode_[0])
        return mode_

    def fit(self, x_samples, y_samples):
        idx = self.bootstrap(x_samples)

        for t in range(self.number_of_trees):
            tree = tree_classifier(
                criterion="gini", max_depth=self.max_depth, random_state=self.random_state)
            tree.fit(x_samples.iloc[idx[t]], y_samples[idx[t]])
            self.forest.append(tree)


In [4]:
# from sklearn import svm
# from cuml.svm import SVC
# import random_forest as rf


clf = cuml.ensemble.RandomForestClassifier(
    n_estimators=30,
    random_state=rs
)

# clf = random_forest(number_of_trees=10, number_of_iteractions=1, max_depth=1)


  return func(**kwargs)


## Sending Data to GPU

In [5]:
x = cudf.DataFrame(train.iloc[:, :-1])
x = x.astype(cupy.float32)
y = cupy.array(train.iloc[:, -1])
y = y.astype(cupy.float32)

# x = pd.DataFrame(train.iloc[:, :-1])
# y = np.array(train.iloc[:, -1])

print(x.shape)
print(y.shape)


(32000, 3683)
(32000,)


## Model Fitting

In [6]:
clf.fit(x, y)

[W] [14:48:11.195258] Using experimental backend for growing trees



RandomForestClassifier()

## Testing

In [7]:
# clf.predict_model = 'CPU'
tmp = test.iloc[:, :-1].to_numpy()
tmp = np.float32(tmp)
# x_test = cupy.DataFrame(tmp)
# print(type(tmp))
# tmp
# f32 = tmp.astype(cupy.float32)
# f32
y_pred = clf.predict(tmp)


## Evaluating

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score



accuracy = accuracy_score(y_true=test.iloc[:, -1], y_pred=y_pred)
roc = roc_auc_score(test.iloc[:, -1], y_pred)
pre = precision_score(y_true=test.iloc[:, -1], y_pred=y_pred)
rec = recall_score(y_true=test.iloc[:, -1], y_pred=y_pred)


print("Metrics: \n Acc: {acc:.2f}, ROC: {roc:.2f}\n PRE: {pre:.2f}, REC: {rec:.2f}".format(acc=accuracy,roc=roc,pre=pre,rec=rec))


Metrics: 
 Acc: 1.00, ROC: 1.00
 PRE: 1.00, REC: 1.00


## SC graph

In [9]:
accs = []
rocs = []
pres = []
recs = []


for rs in range(100):
    train, test = train_test_split(df, test_size=0.2, random_state=rs)

    clf = cuml.ensemble.RandomForestClassifier(
        n_estimators=30,
        random_state=rs
    )

    x = cudf.DataFrame(train.iloc[:, :-1])
    x = x.astype(cupy.float32)
    y = cupy.array(train.iloc[:, -1])
    y = y.astype(cupy.float32)

    clf.fit(x, y)
    x_test = test.iloc[:, :-1].to_numpy()
    x_test = np.float32(x_test)
    y_pred = clf.predict(x_test)

    accuracy = accuracy_score(y_true=test.iloc[:, -1], y_pred=y_pred)
    accs.append(accuracy)
    roc = roc_auc_score(test.iloc[:, -1], y_pred)
    rocs.append(roc)
    pre = precision_score(y_true=test.iloc[:, -1], y_pred=y_pred)
    pres.append(pre)
    rec = recall_score(y_true=test.iloc[:, -1], y_pred=y_pred)
    recs.append(rec)

    print("{i}/100".format(i=rs+1))


  return func(**kwargs)


[W] [14:48:19.055493] Using experimental backend for growing trees

1/100


  return func(**kwargs)


[W] [14:48:26.588504] Using experimental backend for growing trees

2/100


  return func(**kwargs)


[W] [14:48:34.139396] Using experimental backend for growing trees

3/100


  return func(**kwargs)


[W] [14:48:41.683730] Using experimental backend for growing trees

4/100


  return func(**kwargs)


[W] [14:48:49.476011] Using experimental backend for growing trees

5/100


  return func(**kwargs)


[W] [14:48:56.916571] Using experimental backend for growing trees

6/100


  return func(**kwargs)


In [None]:


# import matplotlib.pyplot as plt


# plt.stairs(accs)

# npAccs = np.array(accs)
# npRocs = np.array(rocs)
# npPres = np.array(pres)
# npRecs = np.array(recs)

def min_max(name,array):
    a = np.array(array)
    print("{name}: ({min:.6f},{max:.1f}|{mean:.6f})".format(
        name=name, min=np.min(a), max=np.max(a), mean=np.mean(a)))

min_max("Acc",accs)
min_max("Roc", rocs)
min_max("Pre", pres)
min_max("Rec", recs)
