In [1]:
# Importing the necessary modules
import pandas as pd
from os import listdir
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

In [2]:
files_dir = r"C:\Courses\Master_Degree\Universities\USA\UM_Dearborn\Subjects\ECE_528\Final_Project\OBD-II-Dataset\OBD-II-Dataset"

In [3]:
# Files' names manipulation to remove the unnecessaries
filenames = sorted(set(listdir(files_dir)))
files_names = []
road_types = []
indices_to_delete = [0, 1, 2, 3, 4]
for filename in filenames:
    file_label = filename
    file_label = file_label.lower().strip().split('_')
    if len(file_label) == 7:
        del file_label[0:6]
    else:
        del file_label[0:5]
    file_label = file_label[0]
    file_label = file_label.split('.')
    road_types.append(file_label[0])
    file_label = ".".join(file_label)
    files_names.append(file_label)

In [4]:
dfs = pd.DataFrame()  # create an empty datafrome
for file, road_type in zip(filenames, road_types):
    df = pd.read_csv(files_dir + '/' + file)
    df['road_type'] = road_type  # create a new column for road_type
    df.rename(columns={'Engine Coolant Temperature [Â°C]': 'Engine Coolant Temperature [°C]',
                       'Intake Air Temperature [Â°C]': 'Intake Air Temperature [°C]',
                       'Ambient Air Temperature [Â°C]': 'Ambient Air Temperature [°C]'}, inplace=True)
    dfs = pd.concat([dfs, df], ignore_index=True)

In [5]:
dfs.drop(['Time'], axis = 1, inplace= True)

In [6]:
dfs.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type
0,31.0,,,,,,,,,,stau
1,31.0,96.0,,,,,,,,,stau
2,31.0,96.0,0.0,,,,,,,,stau
3,31.0,96.0,0.0,0.0,,,,,,,stau
4,31.0,96.0,0.0,0.0,22.0,,,,,,stau


In [7]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2543293 entries, 0 to 2543292
Data columns (total 11 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   Engine Coolant Temperature [°C]            float64
 1   Intake Manifold Absolute Pressure [kPa]    float64
 2   Engine RPM [RPM]                           float64
 3   Vehicle Speed Sensor [km/h]                float64
 4   Intake Air Temperature [°C]                float64
 5   Air Flow Rate from Mass Flow Sensor [g/s]  float64
 6   Absolute Throttle Position [%]             float64
 7   Ambient Air Temperature [°C]               float64
 8   Accelerator Pedal Position D [%]           float64
 9   Accelerator Pedal Position E [%]           float64
 10  road_type                                  object 
dtypes: float64(10), object(1)
memory usage: 213.4+ MB


In [8]:
data = dfs.dropna()

In [9]:
data.to_csv(files_dir + '/' + 'combined_data_with_road_type.csv', index=False)

In [10]:
data.isna().sum()

Engine Coolant Temperature [°C]              0
Intake Manifold Absolute Pressure [kPa]      0
Engine RPM [RPM]                             0
Vehicle Speed Sensor [km/h]                  0
Intake Air Temperature [°C]                  0
Air Flow Rate from Mass Flow Sensor [g/s]    0
Absolute Throttle Position [%]               0
Ambient Air Temperature [°C]                 0
Accelerator Pedal Position D [%]             0
Accelerator Pedal Position E [%]             0
road_type                                    0
dtype: int64

In [11]:
# Normalize the numeric columns
# standardizes the values by subtracting the mean and dividing by the standard deviation, 
# making the values have zero mean and unit variance.
scaler = StandardScaler()
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
data.loc[:, numeric_columns] = scaler.fit_transform(data.loc[:, numeric_columns])

# Store the mean and standard deviation values
mean_values = scaler.mean_
std_values = scaler.scale_

print(type(mean_values))
print(type(std_values))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
print(mean_values[:20])
mean_values.size
print(std_values[:20])
std_values.size

[  81.27801826  125.20496485 1492.62302328   62.1840134    19.37915364
   22.6245401    81.15495026   10.20427343   21.5062732    21.82578027]
[ 18.36526746  31.23571697 530.76228503  45.63638498  19.48075114
  16.10097431  11.5088621   11.42887101  12.51006195  12.54795461]


10

In [18]:
data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type
9,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
10,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
11,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
12,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
13,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau


In [19]:
data.to_csv(files_dir + '/normalized_combined_data_with_road_type.csv', index=False)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2542592 entries, 9 to 2543292
Data columns (total 11 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   Engine Coolant Temperature [°C]            float64
 1   Intake Manifold Absolute Pressure [kPa]    float64
 2   Engine RPM [RPM]                           float64
 3   Vehicle Speed Sensor [km/h]                float64
 4   Intake Air Temperature [°C]                float64
 5   Air Flow Rate from Mass Flow Sensor [g/s]  float64
 6   Absolute Throttle Position [%]             float64
 7   Ambient Air Temperature [°C]               float64
 8   Accelerator Pedal Position D [%]           float64
 9   Accelerator Pedal Position E [%]           float64
 10  road_type                                  object 
dtypes: float64(10), object(1)
memory usage: 232.8+ MB


In [21]:
# One-hot encode the 'road_type' column
encoder = OneHotEncoder()  # Create an instance of OneHotEncoder
encoded_road_type = encoder.fit_transform(data[['road_type']])  # Fit and transform the 'road_type' column
feature_names = encoder.get_feature_names_out(['road_type'])  # Get the feature names from the encoder
encoded_df = pd.DataFrame(encoded_road_type.toarray(), columns=feature_names)  # Create a DataFrame from the encoded data with the feature names
data = pd.concat([data.drop('road_type', axis=1), encoded_df], axis=1)

In [22]:
data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
9,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
10,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
11,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
12,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
13,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0


In [23]:
data.to_csv(files_dir + '/one_hot_encoded_normalized_combined_data_with_road_type.csv', index=False)

In [24]:
data.isna().sum()

Engine Coolant Temperature [°C]              701
Intake Manifold Absolute Pressure [kPa]      701
Engine RPM [RPM]                             701
Vehicle Speed Sensor [km/h]                  701
Intake Air Temperature [°C]                  701
Air Flow Rate from Mass Flow Sensor [g/s]    701
Absolute Throttle Position [%]               701
Ambient Air Temperature [°C]                 701
Accelerator Pedal Position D [%]             701
Accelerator Pedal Position E [%]             701
road_type_frei                               701
road_type_normal                             701
road_type_stau                               701
dtype: int64

In [25]:
data = data.dropna()

In [None]:
# Split the data
train, test = train_test_split(data, test_size=0.2)
train.to_csv(files_dir + '/train_data.csv', index=False)
test.to_csv(files_dir + '/test_data.csv', index=False)

In [23]:
'''
# KMeans clustering
n_clusters = 5  # صث can choose a different number based on domain knowledge or use methods like the Elbow method
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(data)
labels = kmeans.labels_

# Add cluster labels to original data
df["Cluster"] = labels

# Analyze clusters
for i in range(n_clusters):
    print(f"Cluster {i}:")
    print(df[df["Cluster"] == i].describe())
    print("--------------------")

# Visualization - just an example, you can visualize more features
for i in range(n_clusters):
    cluster_data = df[df["Cluster"] == i]
    plt.scatter(cluster_data["Engine RPM [RPM]"], cluster_data["Vehicle Speed"])
    plt.title(f"Cluster {i}")
    plt.xlabel("Engine RPM [RPM]")
    plt.ylabel("Vehicle Speed")
    plt.show()
    '''

'\n# KMeans clustering\nn_clusters = 5  # صث can choose a different number based on domain knowledge or use methods like the Elbow method\nkmeans = KMeans(n_clusters=n_clusters)\nkmeans.fit(data)\nlabels = kmeans.labels_\n\n# Add cluster labels to original data\ndf["Cluster"] = labels\n\n# Analyze clusters\nfor i in range(n_clusters):\n    print(f"Cluster {i}:")\n    print(df[df["Cluster"] == i].describe())\n    print("--------------------")\n\n# Visualization - just an example, you can visualize more features\nfor i in range(n_clusters):\n    cluster_data = df[df["Cluster"] == i]\n    plt.scatter(cluster_data["Engine RPM [RPM]"], cluster_data["Vehicle Speed"])\n    plt.title(f"Cluster {i}")\n    plt.xlabel("Engine RPM [RPM]")\n    plt.ylabel("Vehicle Speed")\n    plt.show()\n    '

In [26]:
# Train the model
clf = IsolationForest(contamination=0.01)  # we can adjust contamination based on expected percentage of anomalies
labels = clf.fit_predict(data)

# Anomalies are labeled as -1
anomalies = data[labels == -1]

In [27]:
print(type(anomalies))

<class 'pandas.core.frame.DataFrame'>


In [41]:
anomalies.head(10)

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
2311,-1.97536,2.650653,0.238105,-0.398454,0.59653,1.880971,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2312,-1.97536,2.650653,0.622458,-0.398454,0.59653,1.880971,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2313,-1.97536,2.650653,0.622458,-0.288893,0.59653,1.880971,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2314,-1.97536,2.650653,0.622458,-0.288893,0.647862,1.880971,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2315,-1.97536,2.650653,0.622458,-0.288893,0.647862,2.413858,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2316,-1.97536,2.650653,0.622458,-0.288893,0.647862,2.413858,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2317,-1.97536,2.650653,0.622458,-0.288893,0.647862,2.413858,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0
2318,-1.97536,2.650653,0.622458,-0.288893,0.647862,2.413858,0.20376,0.682108,2.165755,2.731459,0.0,0.0,1.0
2326,-1.97536,1.402082,0.726082,-0.201243,0.647862,0.514594,-4.357942,0.682108,2.165755,2.01421,0.0,0.0,1.0
2327,-1.97536,1.402082,0.726082,-0.201243,0.647862,0.514594,-4.357942,0.682108,2.165755,2.01421,0.0,0.0,1.0


In [43]:
columns_to_denormalize = anomalies.columns[:10]  # Select the first 10 columns

subset_data = anomalies[columns_to_denormalize]  # Create a subset DataFrame with only the columns to denormalize

scaler = StandardScaler()  # Create a scaler object

# Set the mean and standard deviation values
scaler.mean_ = mean_values[:10]
scaler.scale_ = std_values[:10]

# Denormalize the subset of data
denormalized_subset = pd.DataFrame(scaler.inverse_transform(subset_data), columns=columns_to_denormalize)

# Combine the denormalized subset with the remaining columns
Final_data = pd.concat([denormalized_subset, anomalies.iloc[:, 10:]], axis=1)

In [45]:
Final_data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
0,45.0,208.0,1619.0,44.0,31.0,52.91,83.5,18.0,55.3,56.1,,,
1,45.0,208.0,1823.0,44.0,31.0,52.91,83.5,18.0,55.3,56.1,,,
2,45.0,208.0,1823.0,49.0,31.0,52.91,83.5,18.0,55.3,56.1,,,
3,45.0,208.0,1823.0,49.0,32.0,52.91,83.5,18.0,55.3,56.1,,,
4,45.0,208.0,1823.0,49.0,32.0,61.49,83.5,18.0,55.3,56.1,,,
