In [18]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

In [19]:
outname = 'trans1.csv'
outdir = './data/transformed/'

In [20]:
def save_strip(data: pd.DataFrame, outdir: str, file: str):
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    data.to_csv(fullname)

def load_strip(path: str)-> pd.DataFrame: 
    return pd.read_csv(path, sep=',')

def init_strip(strip: str)->pd.DataFrame:
    #if not os.path.exists(outdir+outname):
    print("Recreate transformed strip")
    strip = pd.read_csv(strip, sep=',')

    """print(strip.head(n=1))

    print(strip['frame_number'].max())
    print(strip['run_number'].max())"""
    strip2 = strip.drop('strip_id',axis = 1)\
                    .drop('timestamp',axis = 1)\
                    .drop('vicon_x',axis = 1)\
                    .drop('vicon_y',axis = 1)
    strip = strip2.groupby(['run_number','frame_number']).agg(pd.Series.tolist)
    #save_strip(strip, outdir, outname)
    strip.reset_index(inplace=True)
    return strip
    """else:
        print("Read transformed strip")
        strip = load_strip(outdir+outname)
        #print(strip.head(n=100))
        return strip"""

In [21]:
# Read data and goup it
strip_3 = init_strip('data/train/strip_3_train.csv')

Recreate transformed strip


In [22]:
def add_missing_data(df: pd.DataFrame):
    tmp = df[['run_number', 'frame_number', 'node_id']]
    for index, row in tmp.iterrows():
        nodes = row['node_id']
        length = 0
        if isinstance(nodes, str):
            length = len(nodes.split(',')) # this is a pandas csv array!! (String)
        elif isinstance(nodes, list): 
            length = len(nodes)
        if length < 15:
            run = row['run_number']
            frame = row['frame_number']
            print('hier %i %i' %(run,frame))
            # TODO: add missing rows to df


In [23]:
# add missing rows to ds
add_missing_data(strip_3)

In [24]:
# format table to use it for classifier
def split_data(df: pd.DataFrame) -> pd.DataFrame:
    columns = ["ax", "ay", "az", "gx", "gy", "gz", "mx", "my", "mz", "r"]
    newColumns = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    df2 = pd.DataFrame()
    for c in columns:
        cols = list(map(lambda x: c+str(x), newColumns))
        new_df = pd.DataFrame(df[c].to_list(), columns=cols)
        df2 = pd.concat([df2, new_df], axis=1)
    # add near column 
    df2['near'] = pd.DataFrame(df['near'].values.tolist()).mean(1)
    return df2


In [25]:
strip_3_splittet = split_data(strip_3)

In [28]:
def get_train_test_data(strip: pd.DataFrame):
    X = strip2.drop('near',axis = 1)
    Y = strip2['near']
    X = X.fillna(X.mean())
    X.head()
    return train_test_split(X, Y, test_size = 0.15, random_state = 25)
def scale_data(data: list):
    sc = StandardScaler()
    sc.fit(data[0])
    sc.fit(data[1])
    data[0] = sc.transform(data[0])
    data[1] = sc.transform(data[1])
def apply_forest(forest: RandomForestClassifier, train_data: list):
    forest.fit(train_data[0], train_data[2])

In [36]:
train_data_3 = get_train_test_data(strip_3) #X_train, X_test, Y_train, Y_test
scale_data(train_data_3)

[array([[-0.18235098, -0.84369128, -1.38117179, ..., -1.08162677,
         -1.0922403 , -0.52445254],
        [ 0.60881327, -0.61224488,  0.49986923, ..., -1.08162677,
         -1.0922403 ,  0.2671038 ],
        [ 0.83486022, -0.61224488, -1.49873685, ..., -0.01195141,
         -0.00787293,  0.66288197],
        ...,
        [ 0.72183672, -0.14935213, -0.79334649, ...,  0.86324598,
          0.62588063,  1.45443831],
        [ 0.72183672,  0.1978174 , -0.79334649, ..., -0.01195141,
         -0.66271007, -0.92023071],
        [-0.40839788,  1.3550493 , -0.79334649, ..., -0.01195141,
         -0.00787293, -0.01214142]]),
 array([[ 0.83486022, -1.42230721,  1.55795479, ..., -0.01195141,
         -0.00787293, -0.12867437],
        [-0.97351518, -0.03362896,  1.79308494, ..., -0.01195141,
         -0.00787293, -0.92023071],
        [-0.29537443,  1.1236029 , -0.32308623, ..., -0.01195141,
         -0.00787293, -0.01214142],
        ...,
        [ 0.38276637, -0.84369128, -0.79334649, ...,  

In [33]:
#Random Forest
forest = RandomForestClassifier(n_estimators=200, random_state = 0)
apply_forest(forest, scaled_data_3)

In [34]:
forest_prediction = forest.predict(scaled_data_3[1])

In [35]:
#accuracy_score(Y_test, forest_prediction)
roc_auc_score(Y_test, forest_prediction)

0.9405461859817421