In [1]:
#Abhängigkeiten
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

## Datensatz einlesen und verarbeiten

In [2]:
# Load the training and test tables for strip 1.
# Open question from the author: does strip_1 contain no 'near' entries equal to 1.0?
strip_offset = 0  # first Id written to the prediction file
strip_train = pd.read_csv('data/train/strip_1_train.csv')
strip_test = pd.read_csv('data/test/strip_1_test_no_labels.csv')
# Preview the first 32 rows of the training table.
strip_train.head(32)

Unnamed: 0,frame_number,strip_id,node_id,timestamp,run_number,ax,ay,az,gx,gy,gz,mx,my,mz,r,near,vicon_x,vicon_y
0,0,1,1,2020-08-05 14:01:18.978,0,0.001908,0.003762,0.001725,-0.09547,0.20146,0.157034,0.973723,-0.222346,0.60043,-90.0,0.0,-9.986354,4.972623
1,0,1,2,2020-08-05 14:01:18.928,0,-0.002198,-0.001227,-0.001228,0.144478,-0.111786,0.14408,0.534286,-0.046543,-0.485943,-82.0,0.0,-9.986354,4.972623
2,0,1,3,2020-08-05 14:01:19.039,0,-0.002835,0.000301,0.001429,-0.125374,-0.02573,-0.060134,-0.316322,-0.835743,1.082731,-80.0,0.0,-9.986354,4.972623
3,0,1,4,2020-08-05 14:01:19.083,0,0.004276,0.001683,-0.003767,0.011404,-0.073312,-0.008761,1.389954,0.655827,0.702555,-83.0,0.0,-9.986354,4.972623
4,0,1,5,2020-08-05 14:01:19.036,0,-0.002239,7.8e-05,0.003248,-0.239782,-0.001009,-0.190354,-1.76762,1.023356,0.872065,-93.0,0.0,-9.986354,4.972623
5,0,1,6,2020-08-05 14:01:19.025,0,-0.00473,0.004168,-0.001643,-0.030919,-0.2058,0.080565,-0.655654,-0.663519,0.699971,-86.153846,0.0,-9.986354,4.972623
6,0,1,7,2020-08-05 14:01:18.916,0,0.003577,0.001087,0.001017,0.00434,0.108771,0.097796,-0.322847,0.380109,-0.916906,-86.153846,0.0,-9.986354,4.972623
7,0,1,8,2020-08-05 14:01:18.932,0,-0.000373,0.000294,-0.002959,0.214277,-0.17846,0.10909,1.134845,1.595494,-0.115184,-82.0,0.0,-9.986354,4.972623
8,0,1,9,2020-08-05 14:01:18.991,0,0.000106,0.000854,-0.000679,0.004342,-0.155267,-0.24163,0.003815,0.666138,-1.036205,-88.0,0.0,-9.986354,4.972623
9,0,1,10,2020-08-05 14:01:18.974,0,0.001783,0.002098,0.003226,0.121864,-0.012891,-0.05959,-0.025617,-0.303184,0.098795,-89.0,0.0,-9.986354,4.972623


### Trainingsdaten

In [3]:
# Build the training set: split the raw table into features and the 'near'
# label, then regroup the per-node rows into one feature vector per frame.
NODES_PER_FRAME = 15
# Order of the sensor values inside each node's slice of a frame vector.
SENSOR_COLUMNS = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'mx', 'my', 'mz', 'r']

# Keep the label separately from the features.
Y_train = strip_train['near']

# Remove the label and every non-sensor column in a single call.
X_train = strip_train.drop(
    columns=['near', 'frame_number', 'strip_id', 'node_id', 'timestamp',
             'run_number', 'vicon_x', 'vicon_y'])

# Report, per sensor column, whether it contains NaN values
# (same order and text format as the original diagnostics).
for column in ['r'] + SENSOR_COLUMNS[:-1]:
    print(column + " " + str(X_train[column].isnull().values.any()))

# Replace NaN values by the column mean. The diagnostics above report NaN in
# 'r', so the data is not fully cleaned by the preprocessing, and the test
# cell imputes the same way.
X_train = X_train.fillna(X_train.mean())

# Every frame consists of NODES_PER_FRAME consecutive rows; anything else
# means the rows cannot be regrouped into complete frames.
row_count = len(X_train.index)
if row_count % NODES_PER_FRAME != 0:
    raise ValueError(
        "Number of training rows (%d) is not a multiple of %d nodes per frame"
        % (row_count, NODES_PER_FRAME))
frames = row_count // NODES_PER_FRAME

# One row per frame with 15 * 10 = 150 values: the values of node k occupy
# columns k*10 .. k*10+9 in SENSOR_COLUMNS order. A row-major reshape of the
# (rows, 10) sensor matrix produces exactly this layout.
trainingData = (
    X_train[SENSOR_COLUMNS]
    .to_numpy(dtype=np.float64)
    .reshape(frames, NODES_PER_FRAME * len(SENSOR_COLUMNS))
)

# One label per frame: take the label of the first row of every frame.
frame_labels = Y_train.iloc[::NODES_PER_FRAME]
if frame_labels.isnull().any():
    raise ValueError("Training labels contain NaN values")
trainingLabels = frame_labels.to_numpy(dtype=np.int64)
    

r True
ax False
ay False
az False
gx False
gy False
gz False
mx False
my False
mz False


In [None]:
# Standardize every feature column (StandardScaler removes the mean and
# scales to unit variance). The fitted scaler is kept in `sc` for later use.
sc = StandardScaler()
trainingData = sc.fit_transform(trainingData)

### Testdaten

In [None]:
# Build the test set: regroup the per-node rows of the unlabeled test table
# into one feature vector per frame.
NODES_PER_FRAME = 15
# Order of the sensor values inside each node's slice of a frame vector.
SENSOR_COLUMNS = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'mx', 'my', 'mz', 'r']

# Remove every non-sensor column in a single call.
X_test = strip_test.drop(
    columns=['frame_number', 'strip_id', 'node_id', 'timestamp'])

# Replace NaN values by the column mean.
# NOTE(review): this uses test-set means; imputing with the training means
# may be preferable - confirm before changing.
X_test = X_test.fillna(X_test.mean())

# Every frame consists of NODES_PER_FRAME consecutive rows; anything else
# means the rows cannot be regrouped into complete frames.
test_row_count = len(X_test.index)
if test_row_count % NODES_PER_FRAME != 0:
    raise ValueError(
        "Number of test rows (%d) is not a multiple of %d nodes per frame"
        % (test_row_count, NODES_PER_FRAME))
testFrames = test_row_count // NODES_PER_FRAME

# One row per frame with 15 * 10 = 150 values: the values of node k occupy
# columns k*10 .. k*10+9 in SENSOR_COLUMNS order. A row-major reshape of the
# (rows, 10) sensor matrix produces exactly this layout.
testData = (
    X_test[SENSOR_COLUMNS]
    .to_numpy(dtype=np.float64)
    .reshape(testFrames, NODES_PER_FRAME * len(SENSOR_COLUMNS))
)

In [None]:
# Standardize the test features with the scaler that was fitted on the
# training data. The scaler must NOT be re-fitted here: fitting on the test
# set would standardize it with different means/variances than the data the
# model is trained on.
testData = sc.transform(testData)

## Model trainieren

In [None]:
# Random forest with 200 trees; the fixed random_state makes runs repeatable.
forest = RandomForestClassifier(
    random_state=0,
    n_estimators=200,
)
forest.fit(X=trainingData, y=trainingLabels)

## Prediction

In [None]:
# Predict one label per test frame and write them as "Id,Predicted" rows.
forest_prediction = forest.predict(testData)

# The context manager closes the file even if a write fails.
with open("data.csv", "w") as f:
    f.write("Id,Predicted\n")
    # Ids are consecutive and start at strip_offset.
    for count, a in enumerate(forest_prediction, start=strip_offset):
        f.write(str(count) + "," + str(int(a)) + "\n")