In [None]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Add the parent directory to the import path so the local `cool_train`
# package can be imported below (this must run before the import).
sys.path.append('../')

from cool_train.data_loading import correct_column_types

# Alternative input kept for reference (presumably an already-cleaned export — TODO confirm):
# df = pd.read_csv('../dataset/data_cleaned.csv', index_col=[0])
# Load the train_181 CSV; fields are separated by ';'.
df = pd.read_csv('../dataset/train_181/train_181.csv',sep=';')

# Project helper; presumably casts columns such as the timestamps to their
# proper dtypes — verify in cool_train.data_loading.
df = correct_column_types(df)
df


In [None]:
# Sensor families to compare; each one has a `_PC1` and a `_PC2` column.
sensor_prefixes = ['RS_E_InAirTemp', 'RS_E_OilPress', 'RS_E_RPM',
                   'RS_E_WatTemp', 'RS_T_OilTemp']

# Expand to the ten column names, PC1 before PC2 within each family.
selected_cols = [f'{prefix}_PC{unit}' for prefix in sensor_prefixes for unit in (1, 2)]

# Pairwise correlation (pandas default method) between the selected signals.
sensor_frame = df[selected_cols]
correlation_matrix = sensor_frame.corr()
correlation_matrix

In [None]:
# Get the top-N correlated variable pairs.
# A single constant drives both the selection and the printed label so the two
# cannot drift apart.
TOP_N = 10

# Keep only the strict upper triangle of the symmetric matrix: this removes the
# diagonal self-correlations and the mirrored (B, A) copy of every (A, B) pair,
# without discarding distinct pairs that merely share the same coefficient.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)

# Masked-out cells become NaN and are dropped. The ordering is by signed value,
# so strongly negative correlations rank last.
top_correlations = (
    correlation_matrix.where(upper_triangle)
    .unstack()
    .dropna()
    .sort_values(ascending=False)
    .head(TOP_N)
)

# Display the result
print(f"Top {TOP_N} correlated variable pairs:")
print(top_correlations)

In [None]:
# Overlay the distributions of the two RPM signals on one figure.
plt.figure(figsize=(10, 6))

# Same histogram settings for both signals; only the column and colour differ.
for rpm_col, shade in (('RS_E_RPM_PC2', 'skyblue'), ('RS_E_RPM_PC1', 'salmon')):
    sns.histplot(data=df, x=rpm_col, bins=50, kde=True, color=shade, label=rpm_col)

plt.title('Histogram of RS_E_RPM_PC2 and RS_E_RPM_PC1')
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# The pair of signals compared here; later cells reuse col1 / col2.
col1, col2 = 'RS_E_RPM_PC2', 'RS_E_RPM_PC1'

# One marker per record: col1 on the x-axis, col2 on the y-axis.
plt.figure(figsize=(8, 6))
plt.scatter(
    df[col1],
    df[col2],
    color='blue',
    marker='o',
    label='Data Points',
)

plt.title(f'Scatter Plot of {col1} vs {col2}')
plt.xlabel(col1)
plt.ylabel(col2)
plt.legend()
plt.grid(True)

plt.show()


# Kernel Density Plot
This takes quite some time

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Two panels side by side: raw scatter on the left, 2D density on the right.
panel_fig, (ax_scatter, ax_density) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: one marker per record.
ax_scatter.scatter(df[col1], df[col2], color='blue', marker='o', label='Data Points')
ax_scatter.set_title(f'Scatter Plot of {col1} vs {col2}')
ax_scatter.set_xlabel(col1)
ax_scatter.set_ylabel(col2)
ax_scatter.grid(True)
ax_scatter.legend()

# Right panel: filled 2D kernel density estimate of the same pair of signals.
sns.kdeplot(x=df[col1], y=df[col2], cmap='viridis', fill=True, ax=ax_density)
ax_density.set_title('2D Kernel Density Estimation')
ax_density.set_xlabel(col1)
ax_density.set_ylabel(col2)
ax_density.grid(True)

# Avoid overlapping titles and axis labels between the two panels.
plt.tight_layout()
plt.show()


# Plotly

In [None]:
import plotly.express as px
import pandas as pd

# Interactive scatter plot of the two RPM signals using Plotly Express.
# Hovering a point shows both values and the record's 'timestamps_UTC'.
fig = px.scatter(
    df,
    x=col1,
    y=col2,
    hover_data=[col1, col2, 'timestamps_UTC'],
    # `labels` keys must name columns the figure actually uses. The previous key
    # ('timestamps_local') matched none of them, so the label was silently ignored.
    labels={'timestamps_UTC': 'Timestamp '},
    title=f'Scatter Plot of {col1} vs {col2}',
)

fig.update_layout(width=800, height=800)  # Square canvas
fig.show()

## Basic Clustering of RPM phases
- From the scatter plot of RPM1 vs RPM2, I saw different patterns:
  - Highest correlation
  - Nominal speed: 800 RPM (see the 2 histograms)
  - Many values close to 0 for one or the other
  - Start-up (transition) phases, where RPMs < 650
  - Running phase, where RPMs > 650 (note: the clustering cell below uses a threshold of 750)
  - 2 unusual points
  - To test: the same plot for another train


- The goal of this subpart is to plot the different phases on the time series 
- We expect:
    - phase_demarrage to be at the start of a run
    - phase_marche to be in the middle of a run
    - phase_zero to be ?

In [None]:
# RPM threshold separating the 'phase_demarrage' and 'phase_marche' labels.
th_marche = 750

# Define clustering conditions
condition2 = (df[col2] >= th_marche) & (df[col1] >= th_marche)  # both signals at/above the threshold
condition1 = (df[col2] < th_marche) & (df[col1] < th_marche)    # both signals below the threshold
condition3 = (df[col2] == 0) | (df[col1] == 0)                  # at least one signal exactly 0


# Create a new column 'cluster' based on conditions.
# Each assignment overrides the previous ones, and condition3 overlaps
# condition1 (0 is below th_marche). 'phase_zero' is therefore written LAST;
# writing it first let 'phase_demarrage' overwrite almost every zero row, so
# the column disagreed with the scatter plot that draws condition3 on top.
df['cluster'] = 'Other'
df.loc[condition1, 'cluster'] = 'phase_demarrage'
df.loc[condition2, 'cluster'] = 'phase_marche'
df.loc[condition3, 'cluster'] = 'phase_zero'


In [None]:
# Number of records assigned to each cluster label.
df['cluster'].value_counts()

Show it in a scatter plot

In [None]:
# Show the scatter plot with colored clusters.
# Points are selected through the 'cluster' column rather than the raw boolean
# masks: the masks overlap (a 0 RPM record is also below the threshold), so
# plotting them drew some points twice and could disagree with the labels
# actually stored in the frame and used by the time-series plot.
cluster_colors = {
    'phase_demarrage': 'orange',
    'phase_marche': 'green',
    'phase_zero': 'red',
    'Other': 'black',
}

for cluster_name, cluster_color in cluster_colors.items():
    cluster_points = df[df['cluster'] == cluster_name]
    # col1 / col2 are the same columns the axis labels below refer to.
    plt.scatter(cluster_points[col1], cluster_points[col2],
                color=cluster_color, marker='o', label=cluster_name, s=5)

plt.title('Scatter Plot with Phases')
plt.xlabel(col1)
plt.ylabel(col2)
plt.legend()
plt.grid(True)
plt.show()

## Show the timeseries

In [None]:
# Order the records chronologically; rebinds `df` to the sorted copy.
df = df.sort_values(by='timestamps_UTC')

In [None]:
import plotly.graph_objects as go
import datetime

# Signal shown on the y-axis
var = 'RS_E_RPM_PC2'

# Time window to display: 1 February 2023 to 15 March 2023
start_date = datetime.datetime(2023, 2, 1)
end_date = datetime.datetime(2023, 3, 15)

# Keep only the records whose timestamp falls inside the window
in_window = df['timestamps_UTC'].between(start_date, end_date)
df_to_plot = df[in_window]

# Marker colour per cluster label, in the order the traces are added
phase_colors = {
    'phase_demarrage': 'orange',
    'Other': 'black',
    'phase_marche': 'green',
    'phase_zero': 'red',
}

# One marker trace per cluster so each gets its own colour and legend entry
fig = go.Figure()
for phase, shade in phase_colors.items():
    phase_rows = df_to_plot[df_to_plot['cluster'] == phase]
    trace = go.Scatter(
        x=phase_rows['timestamps_UTC'],
        y=phase_rows[var],
        mode='markers',
        marker=dict(color=shade),
        name=phase,
    )
    fig.add_trace(trace)

fig.update_layout(
    title='Time Series Plot with Cluster Colors (Subset)',
    xaxis_title='Date',
    yaxis_title=var,
)

fig.show()

## KDE of cluster vs Hour of the Day
- Is the phase related to the hour of the day? We expect the transition phases to be in the morning or the evening, and the runtime/normal phase to be during the full day

In [None]:
# Hour of the day of each record, extracted with the vectorized `.dt` accessor
# instead of a per-row Python lambda.
# NOTE(review): requires 'timestamps_UTC' to be a datetime64 column — presumably
# guaranteed by correct_column_types; confirm.
df['hour_day'] = df['timestamps_UTC'].dt.hour

In [None]:
# Variable to plot (not used by this cell's figure; kept so the name stays defined)
var = 'RS_E_RPM_PC2'

# Time window to analyse: 1 February 2023 to 15 March 2023
start_date = datetime.datetime(2023,2,1)
end_date = datetime.datetime(2023,3,15)

# Filter rows between the specified dates
df_to_plot = df[df['timestamps_UTC'].between(start_date, end_date)]

# Histogram (with KDE overlay) of the hour of the day for each cluster
plt.figure(figsize=(10, 6))
for cluster, color in zip(['phase_demarrage', 'Other', 'phase_marche', 'phase_zero'], ['orange', 'black','green','red']):
    cluster_data = df_to_plot[df_to_plot['cluster'] == cluster]
    # A fixed binrange gives every cluster the same 24 one-hour bins. Without it
    # the edges are derived from each subset's own min/max hour, so clusters that
    # do not span all hours get misaligned, non-integer-width bins.
    sns.histplot(data=cluster_data, x='hour_day', bins=24, binrange=(0, 24), kde=True,
                 stat='count', label=f'Cluster {cluster}', color=color)

plt.title('Density Function for Each Cluster (Hour of the Day)')
plt.xlabel('Hour of the Day')
# stat='count' plots record counts, so the axis is labelled accordingly.
plt.ylabel('Count')
plt.legend()
plt.show()

-> No clear link with the hour of the day