come up with a wrapper to count the run time

---

In [18]:
import time, os
import pandas as pd
from pys3d import PYS3D

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

def get_linearsvc(random_state=100, **kwargs):
    return LinearSVC(random_state=random_state)

def get_lasso(max_iter=1000, random_state=100, **kwargs):
    return SGDClassifier(loss='log', penalty='l1',
                         max_iter=max_iter, class_weight='balanced',
                         random_state=random_state, **kwargs)

def get_elasticnet(max_iter=1000, random_state=100, **kwargs):
    return SGDClassifier(loss='log', penalty='elasticnet',
                         max_iter=max_iter, class_weight='balanced',
                         random_state=random_state, **kwargs)

def get_randomforest(random_state=100, **kwargs):
    return RandomForestClassifier(random_state=random_state,
                                  class_weight='balanced',
                                  n_estimators=50, **kwargs)

---

#### bm

In [3]:
data_name = 'spambase'
clf_name = 'linearsvc'
type_ = 'classification'

In [41]:
performance_df = pd.read_csv('{}/bm-performance/{}-{}.csv'.format(type_, data_name, clf_name),
                             index_col=0)
performance_df.head(2)

Unnamed: 0,accuracy,auc_macro,auc_micro,best_params,f1_binary,f1_macro,f1_micro,fold_index,r2,threshold
0,0.872964,0.966903,0.966903,{'C': 0.5},0.856793,0.871323,0.872964,0,0.468009,-0.274318
1,0.875136,0.963847,0.963847,{'C': 0.5},0.859241,0.873523,0.875136,1,0.477102,-0.262789


In [42]:
runtime_df = list()
for i, series in performance_df.iterrows():
    fold_index, best_params_str = series[['fold_index', 'best_params']]
    best_params = eval(best_params_str)
    ## obtain data
    df = pd.read_csv('../splitted_data/%s/%s/train.csv'%(data_name, fold_index))
    X = df[df.columns[df.columns!='target']].values
    y = df['target'].values.astype(int)

    df_test = pd.read_csv('../splitted_data/%s/%s/test.csv'%(data_name, fold_index))
    X_test = df_test[df_test.columns[df_test.columns!='target']].values
    y_test = df_test['target'].values.astype(int)

    if 'forest' not in clf_name:
        # if not random forest, standardize data
        #print('standardize data for', clf_name)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        X_test = scaler.transform(X_test)

    for _ in range(10):
        clf = eval("get_{}(**best_params)".format(clf_name))
        start_time = time.time()
        clf.fit(X_train, y_train)
        train_end_time = time.time()
        clf.predict(X_test)
        end_time = time.time()
        runtime_df.append({'data_name': data_name, 'clf_name': clf_name, 
                           'fold_index': fold_index,
                           'train_time': train_end_time-start_time,
                           'total_time': end_time-start_time})
runtime_df = pd.DataFrame(runtime_df)

##### wrapper

In [26]:
def get_runtime(data_name, clf_name, type_):
    performance_df = pd.read_csv('{}/bm-performance/{}-{}.csv'.format(type_, data_name, clf_name),
                                 index_col=0)
    runtime_df = list()
    for i, series in performance_df.iterrows():
        fold_index, best_params_str = series[['fold_index', 'best_params']]
        best_params = eval(best_params_str)
        ## obtain data
        df = pd.read_csv('../splitted_data/%s/%s/train.csv'%(data_name, fold_index))
        X = df[df.columns[df.columns!='target']].values
        y = df['target'].values.astype(int)

        df_test = pd.read_csv('../splitted_data/%s/%s/test.csv'%(data_name, fold_index))
        X_test = df_test[df_test.columns[df_test.columns!='target']].values
        y_test = df_test['target'].values.astype(int)

        if 'forest' not in clf_name:
            # if not random forest, standardize data
            #print('standardize data for', clf_name)
            scaler = StandardScaler()
            X = scaler.fit_transform(X)
            X_test = scaler.transform(X_test)

        for _ in range(10):
            clf = eval("get_{}(**best_params)".format(clf_name))
            start_time = time.time()
            clf.fit(X, y)
            #train_end_time = time.time()
            #clf.predict(X_test)
            end_time = time.time()
            runtime_df.append({'data_name': data_name, 'clf_name': clf_name, 
                               'fold_index': fold_index,
                               #'train_time': train_end_time-start_time,
                               'train_time': end_time-start_time,
                              })
                               #'total_time': end_time-start_time})
    return pd.DataFrame(runtime_df)

In [49]:
data_name = 'breastcancer'
clf_name = 'randomforest'
type_ = 'classification'

In [50]:
runtime_df = get_runtime(data_name, clf_name, type_)
runtime_df.head(2)

Unnamed: 0,clf_name,data_name,fold_index,train_time
0,randomforest,breastcancer,0,0.159778
1,randomforest,breastcancer,0,0.084936


---

#### s3d

create temporary folders for this task

In [29]:
data_name = 'spambase'
type_ = 'classification'

In [30]:
model_base_path = 'tmp-models/'
pred_base_path = 'tmp-predictions/'

In [31]:
performance_df = pd.read_csv('{}/performance/{}-test-performance.csv'.format(type_, data_name),
                             index_col=0)
performance_df.head(2)

Unnamed: 0,num_features,accuracy,auc_macro,auc_micro,f1_binary,f1_macro,f1_micro,r2,threshold,lambda_,split_version
0,3,0.844734,0.946641,0.946641,0.821026,0.841961,0.844734,0.349788,0.495327,0.003,0.0
1,3,0.85342,0.946387,0.946387,0.832298,0.851057,0.85342,0.386164,0.495327,0.003,1.0


In [32]:
runtime_df = list()
for i, series in performance_df.iterrows():
    fold_index, n_f, lambda_ = series[['split_version', 'num_features', 'lambda_']]
    fold_index = int(fold_index)
    ## obtain data
    train_data = '../splitted_data/{}/{}/train.csv'.format(data_name, fold_index)
    test_data = '../splitted_data/{}/{}/test.csv'.format(data_name, fold_index)
    ## model and prediction paths
    train_model = model_base_path+data_name+'/'
    pred_path = pred_base_path+data_name+'/'
    #if not os.path.exists(train_model):
    #    os.mkdir(train_model)
    #if not os.path.exists(pred_path):
    #    os.mkdir(pred_path)

    for _ in range(10):
        s3d = PYS3D(data_name, model_path=model_base_path,
                    prediction_path=pred_base_path)
        start_time = time.time()
        s3d.fit(train_data, train_model, lambda_=lambda_, max_features=100)
        #train_end_time = time.time()
        #s3d.predict(test_data, train_model, pred_path)
        end_time = time.time()
        runtime_df.append({'data_name': data_name, 'clf_name': clf_name, 
                           'fold_index': fold_index,
                               #'train_time': train_end_time-start_time,
                               'train_time': end_time-start_time,
                              })
        break
    break
runtime_df = pd.DataFrame(runtime_df)

...s3d initializing...
s3d with spambase data, splitted into 5 folds
data will be loaded from ../splitted_data/spambase/
built models will be saved to tmp-models/spambase/
predictions will be saved to tmp-predictions/spambase/
temporary subfolders in  ./tmp/spambase
...done initializing...





In [33]:
runtime_df

Unnamed: 0,clf_name,data_name,fold_index,train_time
0,linearsvc,spambase,0,77.036372


##### wrapper

In [43]:
def get_runtime_s3d(data_name, type_):
    model_base_path = 'tmp-models/'
    pred_base_path = 'tmp-predictions/'
    performance_df = pd.read_csv('{}/performance/{}-test-performance.csv'.format(type_, data_name),
                                 index_col=0)
    runtime_df = list()
    for i, series in performance_df.iterrows():
        fold_index, n_f, lambda_ = series[['split_version', 'num_features', 'lambda_']]
        fold_index = int(fold_index)
        ## obtain data
        train_data = '../splitted_data/{}/{}/train.csv'.format(data_name, fold_index)
        test_data = '../splitted_data/{}/{}/test.csv'.format(data_name, fold_index)
        ## model and prediction paths
        train_model = model_base_path+data_name+'/'
        pred_path = pred_base_path+data_name+'/'
        for _ in range(10):
            s3d = PYS3D(data_name, model_path=model_base_path,
                        prediction_path=pred_base_path)
            start_time = time.time()
            s3d.fit(train_data, train_model, lambda_=lambda_, max_features=100)
            #train_end_time = time.time()
            #s3d.predict(test_data, train_model, pred_path)
            end_time = time.time()
            runtime_df.append({'data_name': data_name, 'clf_name': clf_name, 
                               'fold_index': fold_index,
                               #'train_time': train_end_time-start_time,
                               'train_time': end_time-start_time,
                              })
    return pd.DataFrame(runtime_df)

In [44]:
data_name = 'breastcancer'
type_ = 'classification'

In [48]:
%%capture
runtime_df = get_runtime_s3d(data_name, type_)

In [51]:
runtime_df

Unnamed: 0,clf_name,data_name,fold_index,train_time
0,randomforest,breastcancer,0,0.159778
1,randomforest,breastcancer,0,0.084936
2,randomforest,breastcancer,0,0.079276
3,randomforest,breastcancer,0,0.078965
4,randomforest,breastcancer,0,0.079193
5,randomforest,breastcancer,0,0.078996
6,randomforest,breastcancer,0,0.063142
7,randomforest,breastcancer,0,0.06093
8,randomforest,breastcancer,0,0.05833
9,randomforest,breastcancer,0,0.05789
