# Start: Analysiere Daten

In [26]:
# import initial_data.csv
import pandas as pd
import numpy as np

from feasibility import is_feasible

# import data from csv file
def import_data():
    data = pd.read_csv('initial_data.csv')
    return data

data = import_data()

In [27]:
# import queried data
import csv
import re

def read_csv_file(file_path):
    # Initialisiere eine leere Liste, um die bereinigten Daten zu speichern
    cleaned_data = []

    # Lese die CSV-Datei
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')  # Annahme: Semikolon als Trennzeichen
        for row in reader:
            # Verbinde die Zeilenelemente mit einem Komma, um das Trennzeichen zu vereinheitlichen
            unified_row = ','.join(row)
            # Ersetze mehrere aufeinander folgende Kommas durch ein einzelnes Komma
            unified_row = re.sub(r',+', ',', unified_row)
            # Teile die vereinheitlichte Zeile nach dem Komma auf
            split_row = unified_row.split(',')
            # Entferne die ersten zwei Parameter
            cleaned_row = split_row[1:]
            # Füge die bereinigte Zeile der Liste hinzu
            cleaned_data.append(cleaned_row)

    return cleaned_data

# Beispiel: Daten aus "input.csv" einlesen
input_file_path = "querys_ForcePush.csv"
cleaned_data = read_csv_file(input_file_path)

# Bereinigte Daten als pandas DataFrame speichern mit erster Zeile als Spaltennamen
df_queried = pd.DataFrame(cleaned_data[1:], columns=cleaned_data[0])


# Bereinigte Daten ausgeben
# print(df_queried)
# df_queried = pd.DataFrame(cleaned_data)

In [28]:
def add_data(data, queried_data):
    # add queried data (without cost) to initial data
    data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
    return data

data = add_data(data, df_queried)
# print(data)

data = data.astype(float)
print(data.columns)
# drop PM2 column
data = data.drop(columns=['PM 2'])
print(data.columns)
data = data.reset_index(drop=True)

Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure', 'Intake temperature',
       'NOx', 'PM 1', 'CO2', 'PM 2', 'Pressure cylinder'],
      dtype='object')
Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure', 'Intake temperature',
       'NOx', 'PM 1', 'CO2', 'Pressure cylinder'],
      dtype='object')


### Functions to handle data

In [29]:
def get_input_data(data, row):
    x1, x2, x3, x4, x5, x6, x7, x8 = data.iloc[row, 0], data.iloc[row, 1], data.iloc[row, 2], data.iloc[row, 3], data.iloc[row, 4], data.iloc[row, 5], data.iloc[row, 6], data.iloc[row, 7]
    return x1, x2, x3, x4, x5, x6, x7, x8

def get_critical_output_data(data, i):
    x1, x2 = data.iloc[i, 9], data.iloc[i, 11]
    return x1, x2

# print data column 9 and 11
# print(data.iloc[:, 9])
# print(data.iloc[:, 11])

In [30]:
# put column names into a list
column_names = data.columns.values
#print(column_names)
inputs = column_names[0:8]
print("Inputs: ", inputs)
outputs = column_names[8:12]
print("Outputs: ", outputs)

Inputs:  ['Engine speed' 'Engine load' 'Railpressure' 'Air supply' 'Crank angle'
 'Intake pressure' 'Back pressure' 'Intake temperature']
Outputs:  ['NOx' 'PM 1' 'CO2' 'Pressure cylinder']


### Classify safe data

In [31]:
# classify data safety
# Check if outputs are in safe range
# PM 1 < 6, (PM 2 < 16), Pressure cylinder < 160

def label_safe(data):
    safe = []
    for i in range(len(data)):
        x9, x10  = get_critical_output_data(data, i)
        if x9 < 6:
            if x10 < 160:
                safe.append(0)
            else:
                safe.append(2)
        else:
            if x10 < 160:
                safe.append(1)
            else:
                safe.append(2)
    data['safe'] = safe
    return data

data = label_safe(data)
print(data['safe'])

0      0
1      0
2      0
3      0
4      0
      ..
158    0
159    2
160    0
161    0
162    0
Name: safe, Length: 163, dtype: int64


In [32]:
# train a model to predict unsafe output
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

# split data into training and testing data

# Schritt 1: Feature-Importance bestimmen
X = data[inputs]
y = data['safe']
#print(y.head())
# model = RandomForestRegressor(n_estimators=100)
# model.fit(X, y)
# feature_importances = model.feature_importances_
# print(feature_importances)

# # Schritt 2: Datenpunkte basierend auf Feature-Wichtigkeiten anpassen
# adjustedData = X * feature_importances

#X_train, X_test, y_train, y_test = train_test_split(adjustedData, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train = X_train * feature_importances
# count number of False values in y_train and y_test
print("Number of unsafe in training: ",(len(y_train[y_train == 1])+len(y_train[y_train == 2])))
print("Number of unsafe in testing: ",(len(y_test[y_test == 1])+len(y_test[y_test == 2])))

# train a random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# predict on test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Number of unsafe in training:  11
Number of unsafe in testing:  3
Accuracy: 0.9090909090909091


## Import Submission-Data

In [33]:
# import submission data
submission_data = pd.read_csv('submission.csv')
# print(submission_data)
# rename columns
submission_data.columns = column_names[:8]
print(submission_data.columns)
print(submission_data.head())

Index(['Engine speed', 'Engine load', 'Railpressure', 'Air supply',
       'Crank angle', 'Intake pressure', 'Back pressure',
       'Intake temperature'],
      dtype='object')
   Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
0   2079.242896    15.249091   1059.615682  330.175905    -6.221489   
1    820.449848   120.353061   1271.158901  487.645261     3.472795   
2   1639.495961    83.300535   1601.817725  634.526366    -0.793814   
3   1260.782708   101.754809    828.257891  392.354655     0.122483   
4    903.744741    30.628885   1650.039653  602.760187     6.470709   

   Intake pressure  Back pressure  Intake temperature  
0      1126.065139    3432.348069           73.435318  
1      2084.175034    1985.181081           42.863344  
2      1268.308573    3234.188679           50.848907  
3      2492.140669    1466.726258           72.014909  
4      2631.683432    3797.403335           56.749198  


In [34]:
new_data = pd.DataFrame(submission_data, columns=inputs)

# check feasibleData safety via classifier
X = new_data
print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        #append the corrsponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        
print(len(y_pred[y_pred == 0]))
print(len(y_pred))
# print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

   Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
0   2079.242896    15.249091   1059.615682  330.175905    -6.221489   
1    820.449848   120.353061   1271.158901  487.645261     3.472795   
2   1639.495961    83.300535   1601.817725  634.526366    -0.793814   
3   1260.782708   101.754809    828.257891  392.354655     0.122483   
4    903.744741    30.628885   1650.039653  602.760187     6.470709   

   Intake pressure  Back pressure  Intake temperature  
0      1126.065139    3432.348069           73.435318  
1      2084.175034    1985.181081           42.863344  
2      1268.308573    3234.188679           50.848907  
3      2492.140669    1466.726258           72.014909  
4      2631.683432    3797.403335           56.749198  
[0 0 0 ... 0 0 0]
57478
57892


### Find significant data based on complete real data

In [35]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

# Schritt 1: Feature-Importance bestimmen
complData = data
X = complData[inputs]
y = complData[outputs]
#print(y.head())
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)
feature_importances = model.feature_importances_
print(feature_importances)

# Schritt 2: Datenpunkte basierend auf Feature-Wichtigkeiten anpassen
adjustedData = X * feature_importances

# Schritt 3: Clusteranalyse durchführen mit angepassten Daten
scaler = StandardScaler()
scaledAdjustedData = scaler.fit_transform(adjustedData)


# elbow method to determine number of clusters
# from matplotlib import pyplot as plt
# inertia = []
# for i in range(1, 10):
#     kmeans = KMeans(n_clusters=i)
#     kmeans.fit(scaledAdjustedData)
#     print(i, kmeans.inertia_)
#     inertia.append(kmeans.inertia_)
# plt.plot(range(1, 10), inertia)
# plt.show()


kmeans = KMeans(n_clusters=6)
clusterLabels = kmeans.fit_predict(scaledAdjustedData)

# Schritt 4: Repräsentative Datenpunkte auswählen
# (Beispiel zeigt, wie man die Clusterlabels zu den ursprünglichen Daten hinzufügt und repräsentative Punkte auswählt)
# freeData = submission_data.droprows(), wenn die Datenpunkte in data enthalten sind

# Angenommen, `inputs` ist eine Liste der Spaltennamen, die für den Vergleich verwendet werden sollen
# Erstelle eine temporäre Kopie von `submission_data`, um die Originaldaten nicht zu verändern
submission_data_filtered = submission_data.copy()

# Schritt 1: Finde Duplikate basierend auf den `inputs` Spalten
# Dieser Schritt erzeugt eine Maske (einen Boolean-Array), der für jede Zeile in `submission_data` True ist, wenn diese Zeile in `data` vorhanden ist
is_duplicate = submission_data_filtered[inputs].apply(tuple, 1).isin(data[inputs].apply(tuple, 1))

# Schritt 2: Lösche die Reihen aus `submission_data_filtered`, die in `data` vorhanden sind
submission_data_filtered = submission_data_filtered[~is_duplicate]

#print len of is duplicates True
#print(len(is_duplicate[is_duplicate == True]))

#print(len(submission_data_filtered), len(submission_data), len(data),len(is_duplicate[is_duplicate == True]), len(submission_data)- len(submission_data_filtered))

# `submission_data_filtered` enthält jetzt nur die Reihen, die nicht in `data[inputs]` vorhanden sind

submission_data_filtered['cluster'] = kmeans.predict(scaler.transform(submission_data_filtered[inputs] * feature_importances))
new_points = []
for cluster in range(6):
    cluster_data = submission_data_filtered[submission_data_filtered['cluster'] == cluster]
    cluster_center = kmeans.cluster_centers_[cluster]
    # Berechne die Distanz unter Berücksichtigung der Feature-Wichtigkeiten
    closest_points, _ = pairwise_distances_argmin_min(cluster_data[inputs] * feature_importances, [cluster_center])
    print(len(closest_points))
    # get the first 3 closest points
    if len(closest_points) > 0:
        new_points.append(cluster_data.iloc[closest_points[0]])
new_points = pd.DataFrame(new_points)
print(new_points)

new_data = pd.DataFrame(new_points, columns=inputs)

# check feasibleData safety via classifier
X = new_data
print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        #append the corrsponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        

print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

# print as formatted string seperated by semi-colon
def print_data(data):
    for i in range(len(data)):
        print(';'.join(map(str, data[i])))
    return

print_data(safeDataPoints)

[0.01989823 0.56142685 0.05476143 0.07431341 0.09229494 0.08564932
 0.03309019 0.07856563]
5663
3779
9435
16794
3036
19154
    Engine speed  Engine load  Railpressure  Air supply  Crank angle  \
12   1911.406720    97.893128    852.417958  958.113591    -0.131095   
11    988.948241    38.966357   1471.254208  179.178156     0.869117   
23   1414.956297    52.064432    637.906118  911.010384    -3.168024   
0    2079.242896    15.249091   1059.615682  330.175905    -6.221489   
39   1608.491692    18.610722   1134.982264  230.543761     2.512165   
3    1260.782708   101.754809    828.257891  392.354655     0.122483   

    Intake pressure  Back pressure  Intake temperature  cluster  
12      2238.063748    3652.007343           76.263746      0.0  
11      1186.477575    2334.776826           51.094320      1.0  
23      1726.091088    1635.553227           46.924010      2.0  
0       1126.065139    3432.348069           73.435318      3.0  
39      2015.323742    1137.073658        

In [36]:
# SVR from Philip
from SVRFromPhilip import getSVRPrediction

# print(getSVRPrediction(safeDataPoints))
print("Accuracy: ", getSVRPrediction(safeDataPoints)[0])
print("Predictions: ", getSVRPrediction(safeDataPoints)[1])

Accuracy:  [84.65667347794793, 36.49911892967837, 89.42462870367467, 96.96934705222473]
Predictions:  [[2.24641234e+02 1.80341203e-01 7.87108190e+01 1.06964842e+02]
 [1.51722806e+02 3.13486978e+00 4.09172491e+01 8.03351874e+01]
 [2.37802669e+02 2.23774121e+00 6.23715916e+01 9.31151664e+01]
 [1.82866074e+02 1.05547346e+00 6.04729580e+01 9.00591211e+01]
 [9.66161735e+01 3.39503337e+00 3.09684347e+01 6.11943280e+01]
 [2.04207379e+02 4.47766785e+00 7.20452881e+01 8.98503262e+01]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


In [40]:
# Model from Simon
from PLSFromSimon import getPLSPrediction

# print(getPLSPrediction(safeDataPoints))
print("Accuracy: ", getPLSPrediction(safeDataPoints)[0])
print("Predictions: ", getPLSPrediction(safeDataPoints)[1])


Accuracy:  [31.32496643115931, 34.21039679493548, 82.48276936715895, 97.39519012349669]
Predictions:  [[ 4.19432940e+02  5.76979012e+00  1.42162160e+02  1.59652917e+02]
 [ 2.82171700e+01 -2.97225243e+00  3.28750651e+00  1.29007215e+02]
 [ 3.35685066e+02  8.49724998e+00  1.18841583e+02  6.61667174e+01]
 [ 6.44267794e+01 -3.54980771e+00 -7.67290085e-01  1.37415811e+02]
 [ 1.70134523e+02 -5.82182533e+00 -4.59678677e+01  4.49158868e+01]
 [ 3.48697175e+02  1.90426326e-01  4.90260174e+01  6.26566158e+01]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


In [41]:
from polyReg import getPolyRegPrediction

print("Accuracy: ", getPolyRegPrediction(safeDataPoints)[0])
print("Predictions: ", getPolyRegPrediction(safeDataPoints)[1])



ImportError: cannot import name 'getPolyRegPrediction' from 'poly_reg' (c:\Users\boeke\Desktop\Datensicherheit\force-push\poly_reg.py)