In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

In [2]:
outname = 'trans1.csv'
outdir = './data/transformed/'

In [3]:
def save_strip(data: pd.DataFrame, outdir: str, outname: str):
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    fullname = os.path.join(outdir, outname)    

    data.to_csv(fullname)

def load_strip(path: str)-> pd.DataFrame: 
    return pd.read_csv(path, sep=',')

def init_strip(strip: str)->pd.DataFrame:
    #if not os.path.exists(outdir+outname):
    print("Recreate transformed strip")
    strip = pd.read_csv(strip, sep=',')

    """print(strip.head(n=1))

    print(strip['frame_number'].max())
    print(strip['run_number'].max())"""
    strip2 = strip.drop('strip_id',axis = 1)\
                    .drop('timestamp',axis = 1)\
                    .drop('vicon_x',axis = 1)\
                    .drop('vicon_y',axis = 1)
    strip = strip2.groupby(['run_number','frame_number']).agg(pd.Series.tolist)
    #save_strip(strip, outdir, outname)
    strip.reset_index(inplace=True)
    return strip
    """else:
        print("Read transformed strip")
        strip = load_strip(outdir+outname)
        #print(strip.head(n=100))
        return strip"""

In [4]:
# Read data and goup it
strips = [2,3,4,5]
df_main = init_strip('data/train/strip_6_train.csv')
for s in strips:
    df_main.append(init_strip('data/train/strip_%i_train.csv' % s))

Recreate transformed strip
Recreate transformed strip
Recreate transformed strip
Recreate transformed strip
Recreate transformed strip


In [5]:
df_main.head(n=0)

Unnamed: 0,run_number,frame_number,node_id,ax,ay,az,gx,gy,gz,mx,my,mz,r,near


In [6]:
def add_missing_data(df: pd.DataFrame):
    tmp = df[['run_number', 'frame_number', 'node_id']]
    near = pd.DataFrame(df['near'].values.tolist()).mean(1)
    for index, row in tmp.iterrows():
        nodes = row['node_id']
        length = 0
        if isinstance(nodes, list): 
            length = len(nodes)
        if length < 15:
            run = row['run_number']
            frame = row['frame_number']
            print("Missing val in %i %i" % (run, frame))
            for i in range(1,16):
                if i not in nodes:
                    df.loc[-1] = [run, frame, i, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, near]


In [7]:
# add missing rows to ds
add_missing_data(df_main)

In [8]:
#save_strip(strip_3_4,'./', 'test_7.csv')

In [9]:
 #df_main = df_main.fillna(df_main.mean())

In [10]:
# format table to use it for classifier
def split_data(df: pd.DataFrame) -> pd.DataFrame:
    columns = ["ax", "ay", "az", "gx", "gy", "gz", "mx", "my", "mz", "r"]
    newColumns = range(1,16)
    df2 = pd.DataFrame()
    for c in columns:
        cols = list(map(lambda x: c+str(x), newColumns))
        new_df = pd.DataFrame(df[c].to_list(), columns=cols)
        df2 = pd.concat([df2, new_df], axis=1)
    # add near column 
    df2['near'] = pd.DataFrame(df['near'].values.tolist()).mean(1)
    return df2


In [31]:
df_main_splittet = split_data(df_main)

In [32]:
def get_train_test_data(strip: pd.DataFrame):
    X = strip.drop('near',axis = 1)
    Y = strip['near']
    X = X.fillna(X.mean())
    return train_test_split(X, Y, test_size = 0.25, random_state = 42)
def scale_data(data: list):
    sc = StandardScaler()
    sc.fit(data[0])
    sc.fit(data[1])
    data[0] = sc.transform(data[0])
    data[1] = sc.transform(data[1])
def apply_forest(forest: RandomForestClassifier, train_data: list):
    forest.fit(train_data[0], train_data[2])

In [33]:
train_data_main = get_train_test_data(df_main_splittet) #X_train, X_test, Y_train, Y_test
scale_data(train_data_main)

In [34]:
#Random Forest
forest = RandomForestClassifier(n_estimators=1000, random_state = 100)
apply_forest(forest, train_data_main)

In [35]:
forest_prediction = forest.predict(train_data_main[1])

In [36]:
#accuracy_score(Y_test, forest_prediction)
roc_auc_score(train_data_main[3], forest_prediction)

0.9445713045498453

In [None]:
save_strip(forest_prediction, './', 'predictions.csv')