In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import math

In [2]:
data_dir = './data/train/'
output_dir = './data/fixed_train1/'

In [3]:
def load_strip_pkl(path: str)-> pd.DataFrame: 
    return pd.read_pickle(path)

def save_strip_pkl(data: pd.DataFrame, outdir: str, outname: str):
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    data.to_pickle(fullname)

def load_strip_csv(path: str)-> pd.DataFrame: 
    return pd.read_csv(path, sep=',')

def save_strip_csv(data: pd.DataFrame, outdir: str, outname: str):
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    data.to_csv(fullname, sep=',')

In [4]:
# Read tranformed data
df_main = load_strip_pkl(output_dir+'strip_5_train.pkl')
for i in range(1, 23):
    df_main.append(load_strip_pkl(output_dir + 'strip_%i_train.pkl' % i))

In [5]:
# format table to use it for classifier
def split_data(df: pd.DataFrame) -> pd.DataFrame:
    columns = ["ax", "ay", "az", "gx", "gy", "gz", "mx", "my", "mz", "r"]
    newColumns = range(1,16)
    df2 = pd.DataFrame()
    for c in columns:
        cols = list(map(lambda x: c+str(x), newColumns))
        new_df = pd.DataFrame(df[c].to_list(), columns=cols)
        df2 = pd.concat([df2, new_df], axis=1)
    # add near column 
    df2['near'] = pd.DataFrame(df_main['near'].values.tolist()).agg('max', axis=1)
    return df2


In [6]:
df_main_splittet = split_data(df_main)
df_main_splittet.fillna(df_main_splittet.max(), inplace=True)

In [7]:
 df_main_splittet['near'].max()

1.0

In [8]:
def get_train_test_data(strip: pd.DataFrame):
    X = strip.drop('near',axis = 1)
    Y = strip['near']
    X.fillna(X.mean())
    Y.fillna(Y.mean())
    return train_test_split(X, Y, test_size=0.30, random_state=42)

def scale_data(data: list):
    sc = StandardScaler()
    sc.fit(data[0])
    sc.fit(data[1])
    data[0] = sc.transform(data[0])
    data[1] = sc.transform(data[1])

def apply_forest(forest: RandomForestClassifier, train_data: list):
    forest.fit(train_data[0], train_data[2])

In [9]:
train_data_main = get_train_test_data(df_main_splittet) #X_train, X_test, Y_train, Y_test
scale_data(train_data_main)

In [10]:
#Random Forest
forest = RandomForestClassifier(n_estimators=200, random_state = 0)
apply_forest(forest, train_data_main)

In [11]:
forest_prediction = forest.predict(train_data_main[1])

In [12]:
#accuracy_score(train_data_main[3], forest_prediction)
roc_auc_score(train_data_main[3], forest_prediction)

0.9873201310728024