<a href="https://colab.research.google.com/github/maboulsaad82/ece528/blob/main/ece528_by_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''Project Description
The Preventive Maintenance Warning in Vehicles project aims to leverage AI technology to detect potential issues in vehicles before they become major problems.
By analyzing various sensor data and vehicle performance metrics, the AI system can identify patterns and anomalies that indicate the need for maintenance or repairs.
This proactive approach helps vehicle owners and service providers to take timely actions, reducing breakdowns and improving overall vehicle reliability.'''



In [2]:
# Importing the necessary modules
import pandas as pd
from os import listdir
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

In [3]:
!rm -rf ece528

In [4]:
!git clone https://github.com/maboulsaad82/ece528.git

Cloning into 'ece528'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 88 (delta 0), reused 0 (delta 0), pack-reused 81[K
Receiving objects: 100% (88/88), 13.22 MiB | 7.40 MiB/s, done.
Updating files: 100% (79/79), done.


In [5]:
files_dir = "/content/ece528/OBD-II-Dataset/"
output_dir = "/content/ece528/output_files/"

In [6]:
# Files' names manipulation to remove the unnecessaries
filenames = sorted(set(listdir(files_dir)))
files_names = []
road_types = []
indices_to_delete = [0, 1, 2, 3, 4]
for filename in filenames:
    file_label = filename
    file_label = file_label.lower().strip().split('_')
    if len(file_label) == 7:
        del file_label[0:6]
    else:
        del file_label[0:5]
    file_label = file_label[0]
    file_label = file_label.split('.')
    road_types.append(file_label[0])
    file_label = ".".join(file_label)
    files_names.append(file_label)

In [7]:
dfs = pd.DataFrame()  # create an empty datafrome
for file, road_type in zip(filenames, road_types):
    df = pd.read_csv(files_dir + '/' + file)
    df['road_type'] = road_type  # create a new column for road_type
    df.rename(columns={'Engine Coolant Temperature [Â°C]': 'Engine Coolant Temperature [°C]',
                       'Intake Air Temperature [Â°C]': 'Intake Air Temperature [°C]',
                       'Ambient Air Temperature [Â°C]': 'Ambient Air Temperature [°C]'}, inplace=True)
    dfs = pd.concat([dfs, df], ignore_index=True)

In [8]:
dfs.drop(['Time'], axis = 1, inplace= True)

In [9]:
dfs.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type
0,31.0,,,,,,,,,,stau
1,31.0,96.0,,,,,,,,,stau
2,31.0,96.0,0.0,,,,,,,,stau
3,31.0,96.0,0.0,0.0,,,,,,,stau
4,31.0,96.0,0.0,0.0,22.0,,,,,,stau


In [10]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2543293 entries, 0 to 2543292
Data columns (total 11 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   Engine Coolant Temperature [°C]            float64
 1   Intake Manifold Absolute Pressure [kPa]    float64
 2   Engine RPM [RPM]                           float64
 3   Vehicle Speed Sensor [km/h]                float64
 4   Intake Air Temperature [°C]                float64
 5   Air Flow Rate from Mass Flow Sensor [g/s]  float64
 6   Absolute Throttle Position [%]             float64
 7   Ambient Air Temperature [°C]               float64
 8   Accelerator Pedal Position D [%]           float64
 9   Accelerator Pedal Position E [%]           float64
 10  road_type                                  object 
dtypes: float64(10), object(1)
memory usage: 213.4+ MB


In [11]:
data = dfs.dropna()

In [12]:
data.to_csv(output_dir + '/' + 'combined_data_with_road_type.csv', index=False)

In [13]:
!git branch

fatal: not a git repository (or any of the parent directories): .git


In [14]:
!git remote -v


fatal: not a git repository (or any of the parent directories): .git


# New Section

In [15]:
data.isna().sum()

Engine Coolant Temperature [°C]              0
Intake Manifold Absolute Pressure [kPa]      0
Engine RPM [RPM]                             0
Vehicle Speed Sensor [km/h]                  0
Intake Air Temperature [°C]                  0
Air Flow Rate from Mass Flow Sensor [g/s]    0
Absolute Throttle Position [%]               0
Ambient Air Temperature [°C]                 0
Accelerator Pedal Position D [%]             0
Accelerator Pedal Position E [%]             0
road_type                                    0
dtype: int64

In [16]:
# Normalize the numeric columns
# standardizes the values by subtracting the mean and dividing by the standard deviation,
# making the values have zero mean and unit variance.
scaler = StandardScaler()
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
data.loc[:, numeric_columns] = scaler.fit_transform(data.loc[:, numeric_columns])

# Store the mean and standard deviation values
mean_values = scaler.mean_
std_values = scaler.scale_

print(type(mean_values))
print(type(std_values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, numeric_columns] = scaler.fit_transform(data.loc[:, numeric_columns])


<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
print(mean_values[:20])
mean_values.size
print(std_values[:20])
std_values.size

[  81.27801826  125.20496485 1492.62302328   62.1840134    19.37915364
   22.6245401    81.15495026   10.20427343   21.5062732    21.82578027]
[ 18.36526746  31.23571697 530.76228503  45.63638498  19.48075114
  16.10097431  11.5088621   11.42887101  12.51006195  12.54795461]


10

In [18]:
data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type
9,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
10,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
11,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
12,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau
13,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,stau


In [19]:
data.to_csv(files_dir + '/normalized_combined_data_with_road_type.csv', index=False)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2542592 entries, 9 to 2543292
Data columns (total 11 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   Engine Coolant Temperature [°C]            float64
 1   Intake Manifold Absolute Pressure [kPa]    float64
 2   Engine RPM [RPM]                           float64
 3   Vehicle Speed Sensor [km/h]                float64
 4   Intake Air Temperature [°C]                float64
 5   Air Flow Rate from Mass Flow Sensor [g/s]  float64
 6   Absolute Throttle Position [%]             float64
 7   Ambient Air Temperature [°C]               float64
 8   Accelerator Pedal Position D [%]           float64
 9   Accelerator Pedal Position E [%]           float64
 10  road_type                                  object 
dtypes: float64(10), object(1)
memory usage: 232.8+ MB


In [21]:
# One-hot encode the 'road_type' column
encoder = OneHotEncoder()  # Create an instance of OneHotEncoder
encoded_road_type = encoder.fit_transform(data[['road_type']])  # Fit and transform the 'road_type' column
feature_names = encoder.get_feature_names_out(['road_type'])  # Get the feature names from the encoder
encoded_df = pd.DataFrame(encoded_road_type.toarray(), columns=feature_names)  # Create a DataFrame from the encoded data with the feature names
data = pd.concat([data.drop('road_type', axis=1), encoded_df], axis=1)

In [22]:
data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
9,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
10,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
11,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
12,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0
13,-2.737669,-0.934986,-2.812225,-1.362597,0.134535,-1.348648,0.681653,0.944601,-0.592025,-0.583823,0.0,0.0,1.0


In [23]:
data.to_csv(output_dir + '/one_hot_encoded_normalized_combined_data_with_road_type.csv', index=False)

In [24]:
data.isna().sum()

Engine Coolant Temperature [°C]              701
Intake Manifold Absolute Pressure [kPa]      701
Engine RPM [RPM]                             701
Vehicle Speed Sensor [km/h]                  701
Intake Air Temperature [°C]                  701
Air Flow Rate from Mass Flow Sensor [g/s]    701
Absolute Throttle Position [%]               701
Ambient Air Temperature [°C]                 701
Accelerator Pedal Position D [%]             701
Accelerator Pedal Position E [%]             701
road_type_frei                               701
road_type_normal                             701
road_type_stau                               701
dtype: int64

In [25]:
data = data.dropna()

In [26]:
# Split the data
train, test = train_test_split(data, test_size=0.2)
train.to_csv(output_dir + '/train_data.csv', index=False)
test.to_csv(output_dir + '/test_data.csv', index=False)

In [27]:
'''
# KMeans clustering
n_clusters = 5  # can choose a different number based on domain knowledge or use methods like the Elbow method
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(data)
labels = kmeans.labels_

# Add cluster labels to original data
df["Cluster"] = labels

# Analyze clusters
for i in range(n_clusters):
    print(f"Cluster {i}:")
    print(df[df["Cluster"] == i].describe())
    print("--------------------")

# Visualization - just an example, we can visualize more features
for i in range(n_clusters):
    cluster_data = df[df["Cluster"] == i]
    plt.scatter(cluster_data["Engine RPM [RPM]"], cluster_data["Vehicle Speed"])
    plt.title(f"Cluster {i}")
    plt.xlabel("Engine RPM [RPM]")
    plt.ylabel("Vehicle Speed")
    plt.show()
    '''

'\n# KMeans clustering\nn_clusters = 5  # can choose a different number based on domain knowledge or use methods like the Elbow method\nkmeans = KMeans(n_clusters=n_clusters)\nkmeans.fit(data)\nlabels = kmeans.labels_\n\n# Add cluster labels to original data\ndf["Cluster"] = labels\n\n# Analyze clusters\nfor i in range(n_clusters):\n    print(f"Cluster {i}:")\n    print(df[df["Cluster"] == i].describe())\n    print("--------------------")\n\n# Visualization - just an example, you can visualize more features\nfor i in range(n_clusters):\n    cluster_data = df[df["Cluster"] == i]\n    plt.scatter(cluster_data["Engine RPM [RPM]"], cluster_data["Vehicle Speed"])\n    plt.title(f"Cluster {i}")\n    plt.xlabel("Engine RPM [RPM]")\n    plt.ylabel("Vehicle Speed")\n    plt.show()\n    '

In [28]:
# Train the model
clf = IsolationForest(contamination=0.01)  # we can adjust contamination based on expected percentage of anomalies
labels = clf.fit_predict(data)

# Anomalies are labeled as -1
anomalies = data[labels == -1]



In [29]:
print(type(anomalies))

<class 'pandas.core.frame.DataFrame'>


In [30]:
anomalies.head(10)

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
1136,-2.628768,-0.19865,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.857104,0.910765,-0.392556,0.0,0.0,1.0
1137,-2.628768,-0.19865,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.910765,-0.392556,0.0,0.0,1.0
1138,-2.628768,-0.19865,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.319241,-0.392556,0.0,0.0,1.0
1139,-2.628768,-0.19865,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
1140,-2.628768,-0.19865,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
1141,-2.628768,-0.967001,0.741155,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
1142,-2.628768,-0.967001,-0.577703,-0.946263,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
1143,-2.628768,-0.967001,-0.577703,-0.92435,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
1144,-2.628768,-0.967001,-0.577703,-0.92435,0.750528,0.245666,-4.357942,0.769606,0.319241,0.420325,0.0,0.0,1.0
2311,-1.97536,2.650653,0.238105,-0.398454,0.59653,1.880971,0.20376,0.682108,2.701324,2.731459,0.0,0.0,1.0


In [38]:
columns_to_denormalize = anomalies.columns[:10]  # Select the first 10 columns

subset_data = anomalies[columns_to_denormalize]  # Create a subset DataFrame with only the columns to denormalize


scaler = StandardScaler()  # Create a scaler object

# Set the mean and standard deviation values
scaler.mean_ = mean_values[:10]
scaler.scale_ = std_values[:10]

# Denormalize the subset of data
denormalized_subset = pd.DataFrame(scaler.inverse_transform(subset_data), columns=columns_to_denormalize, index=subset_data.index)

# Combine the denormalized subset with the remaining columns
Final_data = pd.concat([denormalized_subset, subset_data_2], axis=1)

In [39]:
Final_data.head()

Unnamed: 0,Engine Coolant Temperature [°C],Intake Manifold Absolute Pressure [kPa],Engine RPM [RPM],Vehicle Speed Sensor [km/h],Intake Air Temperature [°C],Air Flow Rate from Mass Flow Sensor [g/s],Absolute Throttle Position [%],Ambient Air Temperature [°C],Accelerator Pedal Position D [%],Accelerator Pedal Position E [%],road_type_frei,road_type_normal,road_type_stau
1136,33.0,119.0,1886.0,19.0,34.0,26.58,31.0,20.0,32.9,16.9,0.0,0.0,1.0
1137,33.0,119.0,1886.0,19.0,34.0,26.58,31.0,19.0,32.9,16.9,0.0,0.0,1.0
1138,33.0,119.0,1886.0,19.0,34.0,26.58,31.0,19.0,25.5,16.9,0.0,0.0,1.0
1139,33.0,119.0,1886.0,19.0,34.0,26.58,31.0,19.0,25.5,27.1,0.0,0.0,1.0
1140,33.0,119.0,1886.0,19.0,34.0,26.58,31.0,19.0,25.5,27.1,0.0,0.0,1.0
