In [114]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

In [2]:
outname = 'trans1.csv'
outdir = './data/transformed/'

In [3]:
def save_strip(data: pd.DataFrame, outdir: str, file: str):
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    data.to_csv(fullname)

def load_strip(path: str)-> pd.DataFrame: 
    return pd.read_csv(path)

def init_strip():
    #if not os.path.exists(outdir+outname):
    print("Recreate transformed strip")
    strip = pd.read_csv('data/train/strip_3_train.csv', sep=',')

    """print(strip.head(n=1))

    print(strip['frame_number'].max())
    print(strip['run_number'].max())"""
    strip2 = strip.drop('strip_id',axis = 1)\
                    .drop('timestamp',axis = 1)\
                    .drop('vicon_x',axis = 1)\
                    .drop('vicon_y',axis = 1)
    strip = strip2.groupby(['run_number','frame_number']).agg(pd.Series.tolist)
    #save_strip(strip, outdir, outname)
    return strip
    """else:
        print("Read transformed strip")
        strip = load_strip(outdir+outname)
        #print(strip.head(n=100))
        return strip"""

In [16]:
# Read data and goup it
strip = init_strip()
strip.reset_index(inplace=True)

Recreate transformed strip


In [17]:
def add_missing_data(df: pd.DataFrame):
    tmp = df[['run_number', 'frame_number', 'node_id']]
    for index, row in tmp.iterrows():
        nodes = row['node_id']
        length = 0
        if isinstance(nodes, str):
            length = len(nodes.split(',')) # this is a pandas csv array!! (String)
        elif isinstance(nodes, list): 
            length = len(nodes)
        if length < 15:
            run = row['run_number']
            frame = row['frame_number']
            print('hier %i %i' %(run,frame))
            # TODO: add missing rows to df


In [59]:
# add missing rows to ds
add_missing_data(strip)

In [88]:
# format table to use it for classifier
def split_data(df: pd.DataFrame):
    columns = ["ax", "ay", "az", "gx", "gy", "gz", "mx", "my", "mz", "r"]
    newColumns = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
    df2 = pd.DataFrame()
    for c in columns:
        cols = list(map(lambda x: c+str(x), newColumns))
        new_df = pd.DataFrame(df[c].to_list(), columns=cols)
        df2 = pd.concat([df2, new_df], axis=1)
    # add near column 
    df2['near'] = pd.DataFrame(df['near'].values.tolist()).mean(1)
    return df2


In [99]:
strip2 = split_data(strip)
strip2.head()

Unnamed: 0,ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8,ax9,ax10,...,r7,r8,r9,r10,r11,r12,r13,r14,r15,near
0,-0.002945,0.003222,0.001292,0.002646,9.8e-05,-0.000827,-0.003101,-0.000773,0.000917,-0.000914,...,-86.0,-93.0,-89.0,-88.0,-91.0,,-90.0,-85.0,-85.0,0.0
1,0.000717,0.001269,0.001292,-0.00297,9.8e-05,-0.003756,-0.002612,0.00118,0.001894,0.000795,...,-89.0,-94.0,-88.0,,,,,-87.0,-85.0,0.0
2,-0.003189,-0.000684,-0.003835,-0.001016,9.8e-05,0.000394,-0.001148,-0.001018,0.002626,-0.002623,...,-87.0,-91.0,-83.0,-92.0,-84.0,-87.0,,-90.0,-85.0,0.0
3,-0.000747,-0.001905,0.004466,-0.001261,-0.001367,0.001371,0.002026,0.000936,0.002138,-0.002623,...,-84.0,-90.0,-82.0,-92.0,-84.0,-85.0,,,-86.0,0.0
4,-0.002212,0.000781,-0.002126,-0.000284,-0.001367,0.001371,-0.001392,-0.004191,-0.000792,0.002504,...,-84.0,-86.0,-79.0,-94.0,-88.0,-85.0,-91.0,,-87.0,0.0


In [103]:
X = strip2.drop('near',axis = 1)
Y = strip2['near']
X = X.fillna(X.mean())
X.head()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 25)

In [106]:
sc = StandardScaler()
sc.fit(X_train)
sc.fit(X_test)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [119]:
#Random Forest
forest = RandomForestClassifier(n_estimators=200, random_state = 0)
forest.fit(X_train, Y_train)
forest_prediction = forest.predict(X_test)

In [118]:
#accuracy_score(Y_test, forest_prediction)
roc_auc_score(Y_test, forest_prediction)

0.9405461859817421