# Start: Analysiere Daten

In [1]:
# import initial_data.csv
import pandas as pd
import numpy as np

from feasibility import is_feasible

# import data from csv file
def import_data():
    data = pd.read_csv('initial_data.csv')
    return data

data = import_data()

In [2]:
# import queried data
import csv
import re

def read_csv_file(file_path):
    # Initialisiere eine leere Liste, um die bereinigten Daten zu speichern
    cleaned_data = []

    # Lese die CSV-Datei
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')  # Annahme: Semikolon als Trennzeichen
        for row in reader:
            # Verbinde die Zeilenelemente mit einem Komma, um das Trennzeichen zu vereinheitlichen
            unified_row = ','.join(row)
            # Ersetze mehrere aufeinander folgende Kommas durch ein einzelnes Komma
            unified_row = re.sub(r',+', ',', unified_row)
            # Teile die vereinheitlichte Zeile nach dem Komma auf
            split_row = unified_row.split(',')
            # Entferne die ersten zwei Parameter
            cleaned_row = split_row[1:]
            # Füge die bereinigte Zeile der Liste hinzu
            cleaned_data.append(cleaned_row)

    return cleaned_data

# Beispiel: Daten aus "input.csv" einlesen
input_file_path = "querys_ForcePush.csv"
cleaned_data = read_csv_file(input_file_path)

# Bereinigte Daten als pandas DataFrame speichern mit erster Zeile als Spaltennamen
df_queried = pd.DataFrame(cleaned_data[1:], columns=cleaned_data[0])


# Bereinigte Daten ausgeben
# print(df_queried)
# df_queried = pd.DataFrame(cleaned_data)

In [3]:
def add_data(data, queried_data):
    # add queried data (without cost) to initial data
    data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
    return data

data = add_data(data, df_queried)
# print(data)

data = data.astype(float)
print(data.columns)
# drop PM2 column
data = data.drop(columns=['PM 2'])
print(data.columns)
data = data.reset_index(drop=True)

Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure', 'Intake temperature',
       'NOx', 'PM 1', 'CO2', 'PM 2', 'Pressure cylinder'],
      dtype='object')
Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure', 'Intake temperature',
       'NOx', 'PM 1', 'CO2', 'Pressure cylinder'],
      dtype='object')


### Functions to handle data

In [4]:
def get_input_data(data, row):
    x1, x2, x3, x4, x5, x6, x7, x8 = data.iloc[row, 0], data.iloc[row, 1], data.iloc[row, 2], data.iloc[row, 3], data.iloc[row, 4], data.iloc[row, 5], data.iloc[row, 6], data.iloc[row, 7]
    return x1, x2, x3, x4, x5, x6, x7, x8

def get_critical_output_data(data, i):
    x1, x2 = data.iloc[i, 9], data.iloc[i, 11]
    return x1, x2

# print data column 9 and 11
# print(data.iloc[:, 9])
# print(data.iloc[:, 11])

In [5]:
# put column names into a list
column_names = data.columns.values
#print(column_names)
inputs = column_names[0:8]
print("Inputs: ", inputs)
outputs = column_names[8:12]
print("Outputs: ", outputs)

Inputs:  ['Engine speed' 'Engine load' 'Railpressure' 'Air supply' 'Crank angle'
 'Intake pressure' 'Back pressure' 'Intake temperature']
Outputs:  ['NOx' 'PM 1' 'CO2' 'Pressure cylinder']


### Classify safe data

In [6]:
# classify data safety
# Check if outputs are in safe range
# PM 1 < 6, (PM 2 < 16), Pressure cylinder < 160

def label_safe(data):
    safe = []
    for i in range(len(data)):
        x9, x10  = get_critical_output_data(data, i)
        if x9 < 6:
            if x10 < 160:
                safe.append(0)
            else:
                safe.append(2)
        else:
            if x10 < 160:
                safe.append(1)
            else:
                safe.append(2)
    data['safe'] = safe
    return data

data = label_safe(data)
print(data['safe'])

0      0
1      0
2      0
3      0
4      0
      ..
309    1
310    0
311    0
312    0
313    0
Name: safe, Length: 314, dtype: int64


In [7]:
# train a model to predict unsafe output
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

# split data into training and testing data

# Schritt 1: Feature-Importance bestimmen
X = data[inputs]
y = data['safe']
#print(y.head())
# model = RandomForestRegressor(n_estimators=100)
# model.fit(X, y)
# feature_importances = model.feature_importances_
# print(feature_importances)

# # Schritt 2: Datenpunkte basierend auf Feature-Wichtigkeiten anpassen
# adjustedData = X * feature_importances

#X_train, X_test, y_train, y_test = train_test_split(adjustedData, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train = X_train * feature_importances
# count number of False values in y_train and y_test
print("Number of unsafe in training: ",(len(y_train[y_train == 1])+len(y_train[y_train == 2])))
print("Number of unsafe in testing: ",(len(y_test[y_test == 1])+len(y_test[y_test == 2])))

# train a random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# predict on test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Number of unsafe in training:  29
Number of unsafe in testing:  6
Accuracy: 0.9365079365079365


## Import Submission-Data

In [8]:
# import submission data
submission_data = pd.read_csv('submission.csv')
# print(submission_data)
# rename columns
submission_data.columns = column_names[:8]
print(submission_data.columns)
print(submission_data.head())

Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure',
       'Intake temperature'],
      dtype='object')
   Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
0   2079.242896    15.249091   1059.615682  330.175905    -6.221489   
1    820.449848   120.353061   1271.158901  487.645261     3.472795   
2   1639.495961    83.300535   1601.817725  634.526366    -0.793814   
3   1260.782708   101.754809    828.257891  392.354655     0.122483   
4    903.744741    30.628885   1650.039653  602.760187     6.470709   

   Intake pressure  Back pressure  Intake temperature  
0      1126.065139    3432.348069           73.435318  
1      2084.175034    1985.181081           42.863344  
2      1268.308573    3234.188679           50.848907  
3      2492.140669    1466.726258           72.014909  
4      2631.683432    3797.403335           56.749198  


In [9]:
new_data = pd.DataFrame(submission_data, columns=inputs)

# check feasibleData safety via classifier
X = new_data
print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        #append the corrsponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        
print(len(y_pred[y_pred == 0]))
print(len(y_pred))
# print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

   Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
0   2079.242896    15.249091   1059.615682  330.175905    -6.221489   
1    820.449848   120.353061   1271.158901  487.645261     3.472795   
2   1639.495961    83.300535   1601.817725  634.526366    -0.793814   
3   1260.782708   101.754809    828.257891  392.354655     0.122483   
4    903.744741    30.628885   1650.039653  602.760187     6.470709   

   Intake pressure  Back pressure  Intake temperature  
0      1126.065139    3432.348069           73.435318  
1      2084.175034    1985.181081           42.863344  
2      1268.308573    3234.188679           50.848907  
3      2492.140669    1466.726258           72.014909  
4      2631.683432    3797.403335           56.749198  
[0 2 0 ... 0 0 0]
55402
57892


### Find significant data based on complete real data

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Schritt 1: Feature-Importance bestimmen
complData = data
X = complData[inputs]
y = complData[outputs]
#print(y.head())
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)
feature_importances = model.feature_importances_
print(feature_importances)

# Schritt 2: Datenpunkte basierend auf Feature-Wichtigkeiten anpassen
adjustedData = X * feature_importances

# Schritt 3: Clusteranalyse durchführen mit angepassten Daten
scaler = StandardScaler()
scaledAdjustedData = scaler.fit_transform(adjustedData)


# elbow method to determine number of clusters
# from matplotlib import pyplot as plt
# inertia = []
# for i in range(1, 10):
#     kmeans = KMeans(n_clusters=i)
#     kmeans.fit(scaledAdjustedData)
#     print(i, kmeans.inertia_)
#     inertia.append(kmeans.inertia_)
# plt.plot(range(1, 10), inertia)
# plt.show()


kmeans = KMeans(n_clusters=6)
clusterLabels = kmeans.fit_predict(scaledAdjustedData)

# Schritt 4: Repräsentative Datenpunkte auswählen
# (Beispiel zeigt, wie man die Clusterlabels zu den ursprünglichen Daten hinzufügt und repräsentative Punkte auswählt)
# freeData = submission_data.droprows(), wenn die Datenpunkte in data enthalten sind

# Angenommen, `inputs` ist eine Liste der Spaltennamen, die für den Vergleich verwendet werden sollen
# Erstelle eine temporäre Kopie von `submission_data`, um die Originaldaten nicht zu verändern
submission_data_filtered = submission_data.copy()

# Schritt 1: Finde Duplikate basierend auf den `inputs` Spalten
# Dieser Schritt erzeugt eine Maske (einen Boolean-Array), der für jede Zeile in `submission_data` True ist, wenn diese Zeile in `data` vorhanden ist
is_duplicate = submission_data_filtered[inputs].apply(tuple, 1).isin(data[inputs].apply(tuple, 1))

# Schritt 2: Lösche die Reihen aus `submission_data_filtered`, die in `data` vorhanden sind
submission_data_filtered = submission_data_filtered[~is_duplicate]

#print len of is duplicates True
#print(len(is_duplicate[is_duplicate == True]))

#print(len(submission_data_filtered), len(submission_data), len(data),len(is_duplicate[is_duplicate == True]), len(submission_data)- len(submission_data_filtered))

# `submission_data_filtered` enthält jetzt nur die Reihen, die nicht in `data[inputs]` vorhanden sind

submission_data_filtered['cluster'] = kmeans.predict(scaler.transform(submission_data_filtered[inputs] * feature_importances))
new_points = []
for cluster in range(6):
    cluster_data = submission_data_filtered[submission_data_filtered['cluster'] == cluster]
    cluster_center = kmeans.cluster_centers_[cluster]
    # Berechne die Distanz unter Berücksichtigung der Feature-Wichtigkeiten
    closest_points, _ = pairwise_distances_argmin_min(cluster_data[inputs] * feature_importances, [cluster_center])
    print(len(closest_points))
    # get the first 3 closest points
    if len(closest_points) > 0:
        new_points.append(cluster_data.iloc[closest_points[0]])
new_points = pd.DataFrame(new_points)
print(new_points)

new_data = pd.DataFrame(new_points, columns=inputs)

# check feasibleData safety via classifier
X = new_data
print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        #append the corrsponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        

print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

# print as formatted string seperated by semi-colon
def print_data(data):
    for i in range(len(data)):
        print(';'.join(map(str, data[i])))
    return

print_data(safeDataPoints)

[0.03232683 0.49272736 0.05005081 0.20438271 0.04780809 0.07291051
 0.03306973 0.06672397]
4372
11256
11132
10282
12787
7887
     Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
248   1063.015398    45.507615    977.743699  223.193991     3.768932   
110   1838.974452    14.896432   2241.793902  664.970662     9.731257   
132   1396.674354    96.773751    971.350605  859.339462     1.801839   
106   1285.700800   115.919188   1457.694878  712.366826     6.503250   
113   1104.901108    13.610494   1256.491470  209.324517     1.656040   
134   1860.923319   130.349427   1080.990382  717.406271    -6.926247   

     Intake pressure  Back pressure  Intake temperature  cluster  
248      1632.981784    2085.641046           53.123996      0.0  
110      2388.305023    3199.235177           67.836881      1.0  
132      1573.182903    1343.227240           64.933878      2.0  
106      1739.550669    1848.219995           71.009443      3.0  
113      2671.024747    2158

In [11]:
# SVR from Philip
from SVRFromPhilip import getSVRPrediction

np.set_printoptions(suppress=True)

# # print(getSVRPrediction(safeDataPoints))
# getSVRPrediction(safeDataPoints)
print("Accuracy: ", getSVRPrediction(safeDataPoints)[0])
print("Predictions: ", getSVRPrediction(safeDataPoints)[1])

Accuracy:  [84.65535037403363, 38.65243912136152, 89.44249663317495, 96.96979359417749]
Predictions:  [[164.25530148   4.30352583  37.66464912  79.77385922]
 [171.69965463  -0.43626514  57.64801622 100.28527866]
 [215.27242893   4.1174666   70.00915413  91.6628365 ]
 [197.87283519   3.75154567  79.84271879 104.22751137]
 [157.35150858   2.27868379  50.72783183  84.31500849]
 [242.26152855  -3.39933792  81.29451839 100.90897623]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


In [12]:
# Model from Simon
from PLSFromSimon import getPLSPrediction

# print(getPLSPrediction(safeDataPoints))
print("Accuracy: ", getPLSPrediction(safeDataPoints)[0])
print("Predictions: ", getPLSPrediction(safeDataPoints)[1])


Accuracy:  [31.324966431159595, 34.21039679493539, 82.4827693671588, 97.39519012349669]
Predictions:  [[187.92081182  -3.95471531  -6.86524858 114.35675682]
 [-14.32138663  -3.89146544  22.72589868 151.18418503]
 [329.96771303  13.3009797  181.47835835  60.94915202]
 [284.5565258   10.14868216 168.41994409 101.09882464]
 [  6.92113191 -10.27704859 -73.75461524  92.93791799]
 [583.83073334   3.22992814  99.04594256  85.2540638 ]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


In [13]:
from tryRidge import getRidgePrediction

print("Accuracy: ", getRidgePrediction(safeDataPoints)[0])
print("Predictions: ", getRidgePrediction(safeDataPoints)[1])

Accuracy:  [32.738754823131416, 29.932528567692202, 81.78687330166002, 97.20252272153886]
Predictions:  [[187.92244534  -2.43527839   8.35080864 108.48430842]
 [ -2.63030607  -1.33087279  49.56332586 146.44101477]
 [326.66970076   9.90830136 147.26157442  69.23185339]
 [281.11371177   7.44637927 139.81054137 102.59001373]
 [ 25.74001491  -5.64366778 -22.77014475 100.31990805]
 [572.7531577    2.93175242  96.929274    89.1290207 ]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
