In [1]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

# Set the parameters by cross-validation for RF
tuned_parameters = {'n_estimators': [200, 300], 
                    'min_samples_split': [0.01, 0.05, 0.1],
                    'min_samples_leaf': [0.01, 0.005, 0.1]}

run_time = 10

# Iris Dataset

In [13]:
# Load dataset
from sklearn import datasets
iris = datasets.load_iris()
X = iris['data']
Y = iris['target']

# Define ratio between testing and training data
rtt = 0.1 # test-to-train ratio
test_size = rtt * 1 / (1+rtt) 

acc_test_list = np.zeros(run_time)
acc_train_list = np.zeros(run_time)

for idx_run in tqdm(range(run_time)):
    # Split data
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
    train_idx, test_idx = next(sss.split(X, Y))
    X_train = X[train_idx]
    Y_train = Y[train_idx]
    X_test = X[test_idx]
    Y_test = Y[test_idx]

    # Cross validate to find best hyper parameters
    clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                       tuned_parameters, cv=5, verbose=1)
    clf.fit(X_train, Y_train)

    # Train again with best hyperparameter
    clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                 min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                 n_estimators = clf.best_params_['n_estimators'],
                                 min_samples_split = clf.best_params_['min_samples_split'])
    clf.fit(X_train, Y_train)

    # Prediction
    Y_pred_test = clf.predict(X_test)
    Y_pred_train = clf.predict(X_train)
    acc_test = accuracy_score(Y_test, Y_pred_test)
    acc_train = accuracy_score(Y_train, Y_pred_train)
    acc_test_list[idx_run] = acc_test
    acc_train_list[idx_run] = acc_train

  0%|          | 0/10 [00:00<?, ?it/s]

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   39.1s finished
 10%|█         | 1/10 [00:40<06:01, 40.17s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   37.9s finished
 20%|██        | 2/10 [01:18<05:17, 39.74s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.1s finished
 30%|███       | 3/10 [01:57<04:36, 39.48s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.3s finished
 40%|████      | 4/10 [02:36<03:56, 39.36s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.1s finished
 50%|█████     | 5/10 [03:15<03:16, 39.24s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.1s finished
 60%|██████    | 6/10 [03:55<02:36, 39.24s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.3s finished
 70%|███████   | 7/10 [04:34<01:57, 39.21s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.3s finished
 80%|████████  | 8/10 [05:13<01:18, 39.17s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.2s finished
 90%|█████████ | 9/10 [05:52<00:39, 39.13s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   38.1s finished
100%|██████████| 10/10 [06:31<00:00, 39.12s/it]


In [17]:
np.mean(acc_test_list), np.var(acc_train_list)

(0.9571428571428573, 6.541955017301036e-05)

# Planning Relax

In [19]:
data = pd.read_csv('Dataset/planning_dataset.txt', sep="\t", header=None)
data.columns = [f"f{x}" for x in range(14)]
data = data.drop(columns=['f13']) # last column is redundant
data.f12 = data.f12.astype('int64')

In [20]:
X = data.iloc[:,:-1].to_numpy()
Y = data.iloc[:,-1].to_numpy()

In [21]:
X.shape, Y.shape

((182, 12), (182,))

In [22]:
np.sum(Y-1)

52

In [7]:
rtt = 0.1
test_size = rtt * 1 / (1+rtt) 

# Split data
sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509)
train_idx, test_idx = next(sss.split(X, Y))
X_train = X[train_idx]
Y_train = Y[train_idx]
X_test = X[test_idx]
Y_test = Y[test_idx]

In [8]:
X_train.shape, X_test.shape

((165, 12), (17, 12))

In [10]:
Y_test

array([1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1])

# Seeds

In [25]:
data = pd.read_csv('Dataset/seeds_dataset.txt', sep="\t", header=None)
data.columns = [f"f{x}" for x in range(8)]
data.f7 = data.f7.astype('int64')

In [27]:
X = data.iloc[:,:-1].to_numpy()
Y = data.iloc[:,-1].to_numpy()

In [28]:
X.shape, Y.shape

((210, 7), (210,))

# Sonar

In [32]:
data = pd.read_csv('Dataset/sonar_all.txt', sep=",", header=None)
data.columns = [f"f{x}" for x in range(61)]
data.f60 = data.f60.astype('category')
data.f60 = data.f60.cat.codes

In [34]:
X = data.iloc[:,:-1].to_numpy()
Y = data.iloc[:,-1].to_numpy()
X.shape, Y.shape

((208, 60), (208,))

In [36]:
Y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int8)

# Wine

In [38]:
data = pd.read_csv('Dataset/wine_data.txt', sep=",", header=None)
data.columns = [f"f{x}" for x in range(14)]
data.f0 = data.f0.astype('int64')

In [40]:
X = data.iloc[:,1:].to_numpy()
Y = data.iloc[:,0].to_numpy()
X.shape, Y.shape

((178, 13), (178,))

# Run Main

In [17]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from sklearn import datasets
import pandas as pd

# Set the parameters by cross-validation for RF
tuned_parameters = {'n_estimators': [200, 300], 
                    'min_samples_split': [0.01, 0.05, 0.1],
                    'min_samples_leaf': [0.01, 0.005, 0.1]}

run_time = 100
    
def run_iris(run_time=100):
    print('----- IRIS DATASET -----')
    # Load dataset
    iris = datasets.load_iris()
    X = iris['data']
    Y = iris['target']
    
    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                               tuned_parameters, cv=5, verbose=0)
            clf.fit(X_train, Y_train)

            # Train again with best hyperparameter
            clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                         min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                         n_estimators = clf.best_params_['n_estimators'],
                                         min_samples_split = clf.best_params_['min_samples_split'])
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)

def run_planning(run_time=100):
    # Load dataset
    print('----- PLANNING DATASET -----')
    data = pd.read_csv('Dataset/planning_dataset.txt', sep="\t", header=None)
    data.columns = [f"f{x}" for x in range(14)]
    data = data.drop(columns=['f13']) # last column is redundant
    data.f12 = data.f12.astype('int64')
    X = data.iloc[:,:-1].to_numpy()
    Y = data.iloc[:,-1].to_numpy()

    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                               tuned_parameters, cv=5, verbose=0)
            clf.fit(X_train, Y_train)

            # Train again with best hyperparameter
            clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                         min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                         n_estimators = clf.best_params_['n_estimators'],
                                         min_samples_split = clf.best_params_['min_samples_split'])
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)

def run_seeds(run_time=100):
    # Load dataset
    print('----- SEEDS DATASET -----')
    data = pd.read_csv('Dataset/seeds_dataset.txt', sep="\t", header=None)
    data.columns = [f"f{x}" for x in range(8)]
    data.f7 = data.f7.astype('int64')
    X = data.iloc[:,:-1].to_numpy()
    Y = data.iloc[:,-1].to_numpy()

    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                               tuned_parameters, cv=5, verbose=0)
            clf.fit(X_train, Y_train)

            # Train again with best hyperparameter
            clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                         min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                         n_estimators = clf.best_params_['n_estimators'],
                                         min_samples_split = clf.best_params_['min_samples_split'])
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)

def run_sonar(run_time=100):
    # Load dataset
    print('----- SONAR DATASET -----')
    data = pd.read_csv('Dataset/sonar_all.txt', sep=",", header=None)
    data.columns = [f"f{x}" for x in range(61)]
    data.f60 = data.f60.astype('category')
    data.f60 = data.f60.cat.codes
    X = data.iloc[:,:-1].to_numpy()
    Y = data.iloc[:,-1].to_numpy()

    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                               tuned_parameters, cv=5, verbose=0)
            clf.fit(X_train, Y_train)

            # Train again with best hyperparameter
            clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                         min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                         n_estimators = clf.best_params_['n_estimators'],
                                         min_samples_split = clf.best_params_['min_samples_split'])
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)
    
def run_wine(run_time=100):
    # Load dataset
    print('----- WINE DATASET -----')
    data = pd.read_csv('Dataset/wine_data.txt', sep=",", header=None)
    data.columns = [f"f{x}" for x in range(14)]
    data.f0 = data.f0.astype('int64')
    X = data.iloc[:,1:].to_numpy()
    Y = data.iloc[:,0].to_numpy()

    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = GridSearchCV(RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509), 
                               tuned_parameters, cv=5, verbose=0)
            clf.fit(X_train, Y_train)

            # Train again with best hyperparameter
            clf = RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=1509, 
                                         min_samples_leaf = clf.best_params_['min_samples_leaf'],
                                         n_estimators = clf.best_params_['n_estimators'],
                                         min_samples_split = clf.best_params_['min_samples_split'])
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)


In [34]:
def run_planning(run_time=100):
    # Load dataset
    print('----- PLANNING DATASET -----')
    data = pd.read_csv('Dataset/planning_dataset.txt', sep="\t", header=None)
    data.columns = [f"f{x}" for x in range(14)]
    data = data.drop(columns=['f13']) # last column is redundant
    data.f12 = data.f12.astype('int64')
    X = data.iloc[:,:-1].to_numpy()
    Y = data.iloc[:,-1].to_numpy()

    rtt_list = [0.1, 0.2, 0.3, 0.4, 0.5]
    mean_acc_test = np.zeros(len(rtt_list))
    mean_var_test = np.zeros(len(rtt_list))
    mean_acc_train = np.zeros(len(rtt_list))
    mean_var_train = np.zeros(len(rtt_list))
    
    for idx_rtt, rtt in enumerate(rtt_list):
        # Define ratio between testing and training data
        # rtt = 0.1 # test-to-train ratio
        print(f"Processing rtt={rtt} ...")
        test_size = rtt * 1 / (1+rtt) 

        acc_test_list = np.zeros(run_time)
        acc_train_list = np.zeros(run_time)

        for idx_run in tqdm(range(run_time)):
            # Split data
            sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1509+idx_run)
            train_idx, test_idx = next(sss.split(X, Y))
            X_train = X[train_idx]
            Y_train = Y[train_idx]
            X_test = X[test_idx]
            Y_test = Y[test_idx]

            # Cross validate to find best hyper parameters
            clf = RandomForestClassifier(n_estimators=300, n_jobs=-1,random_state=1509, 
                                         min_samples_split=0.1, min_samples_leaf=0.05)
            clf.fit(X_train, Y_train)

            # Prediction
            Y_pred_test = clf.predict(X_test)
            Y_pred_train = clf.predict(X_train)
            acc_test = accuracy_score(Y_test, Y_pred_test)
            acc_train = accuracy_score(Y_train, Y_pred_train)
            acc_test_list[idx_run] = acc_test
            acc_train_list[idx_run] = acc_train
        m_a_test = np.mean(acc_test_list)
        m_a_train = np.mean(acc_train_list)
        m_v_test = np.var(acc_test_list)
        m_v_train = np.var(acc_train_list)
        mean_acc_test[idx_rtt] = round(m_a_test,4)
        mean_acc_train[idx_rtt] = round(m_a_train,4)
        mean_var_test[idx_rtt] = round(m_v_test,4)
        mean_var_test[idx_rtt] = round(m_v_train,4)
    print("Mean Accuracy Test")
    print(mean_acc_test)
    print("Mean Accuracy Train")
    print(mean_acc_train)
    print("Var Accuracy Test")
    print(mean_var_test)
    print("Var Accuracy Train")
    print(mean_var_train)

In [36]:
run_planning(100)

  0%|          | 0/100 [00:00<?, ?it/s]

----- PLANNING DATASET -----
Processing rtt=0.1 ...


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Processing rtt=0.2 ...


100%|██████████| 100/100 [01:00<00:00,  1.64it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Processing rtt=0.3 ...


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Processing rtt=0.4 ...


100%|██████████| 100/100 [01:00<00:00,  1.66it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

Processing rtt=0.5 ...


100%|██████████| 100/100 [01:00<00:00,  1.65it/s]

Mean Accuracy Test
[0.7059 0.7084 0.7124 0.7138 0.7175]
Mean Accuracy Train
[0.7293 0.744  0.7642 0.7637 0.7649]
Var Accuracy Test
[0.0001 0.0003 0.0003 0.0005 0.0005]
Var Accuracy Train
[0. 0. 0. 0. 0.]



