# SAIL Hackathon 2024

Team: Force Push
Members: Martin Doppstadt, Simon Böke, Philip Holstein, Felix Schnüll, Laura Schöne

## Start: Import Data

In [1]:
# import initial_data.csv
import pandas as pd
import numpy as np
import csv
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler


# import data from csv file
def import_data():
    data = pd.read_csv('initial_data.csv')
    return data

data = import_data()

In [2]:
# import queried data

def read_csv_file(file_path):
    # Initialisiere eine leere Liste, um die bereinigten Daten zu speichern
    cleaned_data = []

    # Lese die CSV-Datei
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')  # Annahme: Semikolon als Trennzeichen
        for row in reader:
            # Verbinde die Zeilenelemente mit einem Komma, um das Trennzeichen zu vereinheitlichen
            unified_row = ','.join(row)
            # Ersetze mehrere aufeinander folgende Kommas durch ein einzelnes Komma
            unified_row = re.sub(r',+', ',', unified_row)
            # Teile die vereinheitlichte Zeile nach dem Komma auf
            split_row = unified_row.split(',')
            # Entferne die ersten zwei Parameter
            cleaned_row = split_row[1:]
            # Füge die bereinigte Zeile der Liste hinzu
            cleaned_data.append(cleaned_row)

    return cleaned_data

# Beispiel: Daten aus "input.csv" einlesen
input_file_path = "querys_ForcePush.csv"
cleaned_data = read_csv_file(input_file_path)

# Bereinigte Daten als pandas DataFrame speichern mit erster Zeile als Spaltennamen
df_queried = pd.DataFrame(cleaned_data[1:], columns=cleaned_data[0])

### Add initial_data and querys

In [3]:
def add_data(data, queried_data):
    # add queried data (without cost) to initial data
    data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
    return data

data = add_data(data, df_queried)
# print(data)

data = data.astype(float)
# print(data.columns)
# drop PM2 column
data = data.drop(columns=['PM 2'])
# print(data.columns)
data = data.reset_index(drop=True)

### Functions to handle data

In [4]:
def get_input_data(data, row):
    x1, x2, x3, x4, x5, x6, x7, x8 = data.iloc[row, 0], data.iloc[row, 1], data.iloc[row, 2], data.iloc[row, 3], data.iloc[row, 4], data.iloc[row, 5], data.iloc[row, 6], data.iloc[row, 7]
    return x1, x2, x3, x4, x5, x6, x7, x8

def get_critical_output_data(data, i):
    x1, x2 = data.iloc[i, 9], data.iloc[i, 11]
    return x1, x2

# print data column 9 and 11 (PM 1 and Pressure cylinder)
# print(data.iloc[:, 9])
# print(data.iloc[:, 11])

### Inputs and Outputs columns

In [5]:
# put column names into a list
column_names = data.columns.values
#print(column_names)
inputs = column_names[0:8]
print("Inputs: ", inputs)
outputs = column_names[8:12]
print("Outputs: ", outputs)

Inputs:  ['Engine speed' 'Engine load' 'Railpressure' 'Air supply' 'Crank angle'
 'Intake pressure' 'Back pressure' 'Intake temperature']
Outputs:  ['NOx' 'PM 1' 'CO2' 'Pressure cylinder']


### Classify safe data and add label column

In [6]:
# classify data safety
# Check if outputs are in safe range
# PM 1 < 6, (PM 2 < 16), Pressure cylinder < 160

def label_safe(data):
    safe = []
    for i in range(len(data)):
        x9, x10  = get_critical_output_data(data, i)
        if x9 < 6:
            if x10 < 160:
                safe.append(0)
            else:
                safe.append(2)
        else:
            if x10 < 160:
                safe.append(1)
            else:
                safe.append(2)
    data['safe'] = safe
    return data

data = label_safe(data)
#print(data['safe'])

### Train RandomForestClassifier to predict safe and unsafe data 

In [7]:
# train a model to predict unsafe output
X = data[inputs]
y = data['safe']

# split data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Number of unsafe in training: ",(len(y_train[y_train == 1])+len(y_train[y_train == 2])))
print("Number of unsafe in testing: ",(len(y_test[y_test == 1])+len(y_test[y_test == 2])))

# train a random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# predict on test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Number of unsafe in training:  39
Number of unsafe in testing:  9
Accuracy: 0.9113924050632911


## Import Submission-Data

In [8]:
# import submission data
submission_data = pd.read_csv('submission.csv')
# print(submission_data)
# rename columns to same columns as complete data
submission_data.columns = column_names[:8]
# print(submission_data.columns)
# print(submission_data.head())

### Check possible new data points from submissions for safety

In [9]:
new_data = pd.DataFrame(submission_data, columns=inputs)

# from feasibility import is_feasible
# Feasibility was checked beforehand for data in submission.csv --> all data points are feasible

# check newData safety via classifier
X = new_data
# print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
# print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        # append the corresponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        
print(len(y_pred[y_pred == 0]))
print(len(y_pred))
# print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

54075
57892


## Find significant data based on complete "real" data

In [10]:
# Schritt 1: Feature-Importance bestimmen
complData = data
X = complData[inputs]
y = complData[outputs]
#print(y.head())
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)
feature_importances = model.feature_importances_
# print(feature_importances)

# Schritt 2: Datenpunkte basierend auf Feature-Wichtigkeiten anpassen
adjustedData = X * feature_importances

# Schritt 3: Clusteranalyse durchführen mit angepassten Daten
scaler = StandardScaler()
scaledAdjustedData = scaler.fit_transform(adjustedData)


# Schritt 3.1: Elbow method to determine number of clusters

# from matplotlib import pyplot as plt
# inertia = []
# for i in range(1, 10):
#     kmeans = KMeans(n_clusters=i)
#     kmeans.fit(scaledAdjustedData)
#     print(i, kmeans.inertia_)
#     inertia.append(kmeans.inertia_)
# plt.plot(range(1, 10), inertia)
# plt.show()

kmeans = KMeans(n_clusters=6)
clusterLabels = kmeans.fit_predict(scaledAdjustedData)

# Schritt 4: Repräsentative Datenpunkte auswählen in der Nähe der Clusterzentren

# Schritt 4.1: Datenpunkte aus submission.csv auswählen, die noch nicht gequeriet wurden

submission_data_filtered = submission_data.copy()
is_duplicate = submission_data_filtered[inputs].apply(tuple, 1).isin(data[inputs].apply(tuple, 1))
submission_data_filtered = submission_data_filtered[~is_duplicate]

#print len of is duplicates True
#print(len(is_duplicate[is_duplicate == True]))

submission_data_filtered['cluster'] = kmeans.predict(scaler.transform(submission_data_filtered[inputs] * feature_importances))
new_points = []
for cluster in range(6):
    cluster_data = submission_data_filtered[submission_data_filtered['cluster'] == cluster]
    cluster_center = kmeans.cluster_centers_[cluster]
    # Berechne die Distanz unter Berücksichtigung der Feature-Wichtigkeiten
    closest_points, _ = pairwise_distances_argmin_min(cluster_data[inputs] * feature_importances, [cluster_center])
    # get the closest point
    if len(closest_points) > 0:
        new_points.append(cluster_data.iloc[closest_points[0]])
new_points = pd.DataFrame(new_points)
# print(new_points)

new_data = pd.DataFrame(new_points, columns=inputs)

# check newDataPoints safety via classifier
X = new_data
# print(X.head())
safeDataPoints = []
y_pred = clf.predict(X)
print(y_pred)

for i in range(len(y_pred)):
    if y_pred[i] == 0:
        # append the corrsponding data point to safeDataPoints
        safeDataPoints.append(X.iloc[i])
        

# print(len(safeDataPoints) == len(y_pred[y_pred == 0]))

# print as formatted string seperated by semi-colon (to ease copy-pasting for Queries)
def print_data(data):
    for i in range(len(data)):
        print(';'.join(map(str, data[i])))
    return

print_data(safeDataPoints)

[0 0 0 0 0 0]
1005.6904345750808;22.860694856250767;1537.7016626647037;463.57024063494424;0.4202272556722164;1805.398915270618;1508.9104433589675;59.62588668260171
1838.9744520187376;14.896431590448667;2241.7939017306735;664.9706615381898;9.731257443798269;2388.3050230270883;3199.2351766715774;67.83688051182946
1686.7507934570312;133.74633660763865;2363.175705237151;984.6811989482665;-7.850949778205514;1708.7789804889453;2960.2142682774747;52.002105463977834
2242.3091664910316;95.669230241097;2125.724965978517;1208.109805903974;-1.861551116707851;2994.345297157826;2919.615742479734;44.292533842106145
1248.1164753437042;67.74137770757079;1197.761941091458;696.0805674723846;-7.835074495524168;1476.4820741136205;1359.7904681209757;60.469301900945354
1254.9142211675644;4.751905631273985;2011.0473093833705;525.8536759920767;-2.8714774013496935;1932.0728100116123;3433.1271588530103;74.19617957680015


## SVR Model with hyperparameter Search, Training, Prediction (and Cross-Validation) for each output Parameter

In [11]:
from SVRModel import getSVRPrediction

np.set_printoptions(suppress=True)
for i in range(0,4):
    # feedback = getSVRPrediction(safeDataPoints, i, hyperParas[i])
    feedback = getSVRPrediction(safeDataPoints, i)
    print("Accuracy i: ", feedback[0])
    print("Predictions i: ", feedback[1])

# make predicition for all data points in submission.csv
# Outputs:  ['NOx' 'PM 1' 'CO2' 'Pressure cylinder']
nox = []
pm1 = []
co2 = []
pressure = []
predictions = [nox, pm1, co2, pressure]

submission_data = pd.read_csv('submission.csv')
# rename columns to same columns as complete data
submission_data.columns = column_names[:8]
submission_data = pd.DataFrame(submission_data, columns=inputs)

for i in range(len(predictions)):
    feedback = getSVRPrediction(submission_data, i)
    predictions[i] = feedback[1]
for i in range(len(predictions)):
    predictions[i] = pd.DataFrame(predictions[i], columns=[outputs[i]])

submission_data['NOx'] = predictions[0]
submission_data['PM 1'] = predictions[1]
submission_data['CO2'] = predictions[2]
submission_data['Pressure cylinder'] = predictions[3]

# write to csv
submission_data.to_csv('submission_final.csv', index=False)


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  0 {'estimator__C': 200, 'estimator__epsilon': 1, 'estimator__kernel': 'rbf'}
Bester Score: 0 -2249.8713465897
Accuracy i:  84.67072391084065
Predictions i:  [[114.24412075]
 [171.11336468]
 [204.65273225]
 [229.18417868]
 [140.63972541]
 [176.02921462]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  1 {'estimator__C': 10, 'estimator__epsilon': 0.1, 'estimator__kernel': 'rbf'}
Bester Score: 1 -0.36127943365884607
Accuracy i:  46.44478470811493
Predictions i:  [[1.7476009 ]
 [1.66733296]
 [1.86765163]
 [2.42161355]
 [2.08942984]
 [1.37685393]]
Beste Parameter:  2 {'estimator__C': 200, 'estimator__epsilon': 1, 'estimator__kernel': 'linear'}
Bester Score: 2 -15.386471572008166
Accuracy i:  85.00918056263451
Predictions i:  [[41.42653994]
 [57.42478728]
 [76.28162022]
 [80.42245231]
 [61.67628244]
 [60.49642592]]


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  3 {'estimator__C': 200, 'estimator__epsilon': 0.01, 'estimator__kernel': 'linear'}
Bester Score: 3 -7.7757278450491
Accuracy i:  97.39531667162007
Predictions i:  [[ 62.70414746]
 [148.70949498]
 [156.11861981]
 [130.98018116]
 [ 50.17602396]
 [145.83348296]]
Beste Parameter:  0 {'estimator__C': 200, 'estimator__epsilon': 1, 'estimator__kernel': 'rbf'}
Bester Score: 0 -2249.8713465897


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)
  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  1 {'estimator__C': 10, 'estimator__epsilon': 0.1, 'estimator__kernel': 'rbf'}
Bester Score: 1 -0.36127943365884607


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  2 {'estimator__C': 200, 'estimator__epsilon': 1, 'estimator__kernel': 'linear'}
Bester Score: 2 -15.386471572008166


  data = pd.concat([data, queried_data.iloc[:, :13]], axis=0)


Beste Parameter:  3 {'estimator__C': 200, 'estimator__epsilon': 0.01, 'estimator__kernel': 'linear'}
Bester Score: 3 -7.7757278450491




### Other models that did not make it to final

In [12]:
# Model from Simon
# from PLSFromSimon import getPLSPrediction

# # print(getPLSPrediction(safeDataPoints))
# print("Accuracy: ", getPLSPrediction(safeDataPoints)[0])
# print("Predictions: ", getPLSPrediction(safeDataPoints)[1])


In [13]:
# from tryRidge import getRidgePrediction

# print("Accuracy: ", getRidgePrediction(safeDataPoints)[0])
# print("Predictions: ", getRidgePrediction(safeDataPoints)[1])