In [None]:
import pandas as pd
import pytz
import plotly.graph_objects as go
import datetime
import plotly.express as px

In [None]:
# Data source switch: True reads a sample of the fleet-wide export,
# False reads the single-train extract.
full_dataset = False

if not full_dataset:
    # Train 181
    df_raw = pd.read_csv('./dataset/train_181/train_181.csv', sep=';')
else:
    # Only the first 100000 rows are read; the first column is dropped and the
    # vehicle id column is renamed to the name used in the rest of the notebook.
    fleet_sample = pd.read_csv('./dataset/ar41_for_ulb.csv', nrows=100000, sep=';')
    df_raw = fleet_sample.iloc[:, 1:].rename(columns={'mapped_veh_id': 'vehicle_id'})

df_raw

# Data Cleaning

In [None]:
# Drop duplicates
df = df_raw.drop_duplicates()
df

## Format dtype columns

In [None]:
def correct_column_types(dataframe, column_types):
    """
    Corrects the type of each column in a DataFrame based on the specified types in the dictionary.

    Every requested column except those mapped to 'datetime64[ns]' is first passed
    through ``pd.to_numeric(errors='coerce')``, so values that cannot be parsed as
    numbers become NaN. Columns named in the dictionary but absent from the
    DataFrame are skipped. The input DataFrame is not modified.

    If an integer type is requested and the coerced column contains missing values,
    the matching pandas nullable integer type ('Int32' for 'int32', ...) is used,
    because a plain NumPy integer column cannot hold NaN and the cast would raise.

    Parameters:
    - dataframe: pandas DataFrame
    - column_types: dictionary where keys are column names and values are the desired types

    Returns:
    - corrected_dataframe: pandas DataFrame with corrected column types
    """
    corrected_dataframe = dataframe.copy()

    for column, desired_type in column_types.items():
        if column not in corrected_dataframe.columns:
            continue

        values = corrected_dataframe[column]
        target_type = desired_type

        if desired_type != 'datetime64[ns]':
            # Unparseable entries are turned into NaN rather than raising
            values = pd.to_numeric(values, errors='coerce')

            # NaN cannot be stored in a NumPy integer column: fall back to the
            # nullable extension dtype of the same width and signedness.
            if values.isna().any() and pd.api.types.is_integer_dtype(desired_type):
                resolved = pd.api.types.pandas_dtype(desired_type)
                if not isinstance(resolved, pd.api.extensions.ExtensionDtype):
                    prefix = 'UInt' if resolved.kind == 'u' else 'Int'
                    target_type = f'{prefix}{resolved.itemsize * 8}'

        # Final cast to the requested (or nullable-equivalent) dtype
        corrected_dataframe[column] = values.astype(target_type)

    return corrected_dataframe

In [None]:
# Target dtype of every column that gets converted.
# All engine/transmission sensor readings are stored as float64.
sensor_columns = [
    'RS_E_InAirTemp_PC1',
    'RS_E_InAirTemp_PC2',
    'RS_E_OilPress_PC1',
    'RS_E_OilPress_PC2',
    'RS_E_RPM_PC1',
    'RS_E_RPM_PC2',
    'RS_E_WatTemp_PC1',
    'RS_E_WatTemp_PC2',
    'RS_T_OilTemp_PC1',
    'RS_T_OilTemp_PC2',
]

type_dict = {
    'vehicle_id': 'int32',
    'timestamps_UTC': 'datetime64[ns]',
    'lat': 'float64',
    'lon': 'float64',
}
type_dict.update({sensor: 'float64' for sensor in sensor_columns})

# Apply the conversion to the de-duplicated frame
df_corrected = correct_column_types(df, type_dict)
df_corrected

In [None]:
# Remove NaNs
df_corrected = df_corrected.dropna()
df_corrected

In [None]:
# Add an extra column with Brussels local time.
# The Europe/Brussels zone is used, so the UTC offset follows the time-zone rules
# (CET/CEST) instead of being a fixed +2 h.
brussels_tz = pytz.timezone('Europe/Brussels')
# Make the UTC column timezone-aware (utc=True) so it can be converted
df_corrected['timestamps_UTC'] = pd.to_datetime(df_corrected['timestamps_UTC'], utc=True)
# New column: the same instants expressed in the Brussels zone
df_corrected['timestamps_local'] = df_corrected['timestamps_UTC'].dt.tz_convert(brussels_tz)

In [None]:
# Uniform resampling: put the rows on a regular 5-minute grid, taking the mean of
# every column inside each bin. '5min' replaces the alias '5T', which recent pandas
# versions deprecate.
# NOTE(review): bins are averaged over all rows regardless of vehicle_id, so this is
# only meaningful while the frame holds a single train — confirm before using the full dataset.
resampling_period = '5min'
resmapling_period = resampling_period  # original (misspelled) name kept for any cell that still reads it
df_corrected = df_corrected.set_index('timestamps_local').resample(resampling_period).mean().reset_index()
df_corrected

In [None]:
df_corrected.sort_values(by='timestamps_local')

# Data Exploration

In [None]:
df.columns 

In [None]:
df_corrected.vehicle_id.unique()

# Visualizations

In [None]:
df_corrected.RS_E_InAirTemp_PC1.hist(bins=30)

In [None]:
df_181 = df_corrected[df_corrected['vehicle_id'] == 181].sort_values(by='timestamps_UTC')
df_181

In [None]:
df_corrected.lon.hist(bins=30)

## Longitude vs Latitude

In [175]:
# Scatter plot of every recorded position (x = longitude, y = latitude)
fig = go.Figure()

# One marker per row of the resampled frame
fig.add_trace(go.Scatter(
    x=df_corrected['lon'],
    y=df_corrected['lat'],
    mode='markers',
    marker=dict(size=6),
    name='',
))

# Reference cities as [name, latitude, longitude]
cities = [
    ['Brussels', 50.85045, 4.34878],
    ['Gand', 51.05, 3.71667],
    ['Antwerpen', 51.21989, 4.40346],
    ['Namur', 50.4669, 4.86746],
    ['Charleroi', 50.41136, 4.44448],
    ['Liege', 50.63373, 5.56749],
    ['Hasselt', 50.93106, 5.33781],
]
colors = px.colors.qualitative.Plotly

for jj, city in enumerate(cities):
    city_name, city_lat, city_lon = city

    fig.add_trace(go.Scatter(
        x=[city_lon],  # longitude on the x-axis
        y=[city_lat],  # latitude on the y-axis
        # 'markers+text' is required for `text`/`textposition` to be drawn on the plot
        mode='markers+text',
        # Offset by one to skip the first palette colour; the modulo keeps the
        # index valid if more cities than palette colours are listed
        marker=dict(size=15, color=colors[(jj + 1) % len(colors)]),
        name=city_name,
        text=[city_name],
        textposition='top center',
    ))

# Axis titles
fig.update_layout(
    title='Latitude vs Longitude',
    xaxis=dict(title='Longitude'),
    yaxis=dict(title='Latitude'),
)

# Show the Plotly Graph Objects figure
fig.show()

In [None]:
# Centre of the map view (mean recorded position) and zoom factor
center_lat = df_corrected['lat'].mean()  # Latitude of the center point
center_lon = df_corrected['lon'].mean()  # Longitude of the center point
zoom_level = 1  # Projection scale; 1 keeps the default zoom, larger values zoom in

fig = px.scatter_geo(df_corrected, lat="lat", lon="lon")

# The centre and zoom values above were previously computed but never applied;
# they are now passed to the geo layout.
fig.update_layout(
        title = 'Train trajectories in Belgium',
        geo_scope='europe',
        geo_center=dict(lat=center_lat, lon=center_lon),
        geo_projection_scale=zoom_level,
    )
# Customize marker size
fig.update_traces(marker=dict(size=10))

fig.show()

### Plot Variables
Basic plot for selected variable(s) and mapped_veh_id

In [None]:
df_corrected.columns

In [174]:
# Select Data
cols_to_plot = ['RS_E_InAirTemp_PC1','RS_E_InAirTemp_PC2']

# Select vehicle
df_to_plot = df_corrected[df_corrected['vehicle_id'] == 181].sort_values(by='timestamps_local')
# The 'cluster' column is only created by the DBSCAN cell further down; filter on
# it when it exists so this cell also runs on a fresh kernel (top-to-bottom run).
if 'cluster' in df_to_plot.columns:
    df_to_plot = df_to_plot[df_to_plot['cluster'] == 1]

# Select range
start_date = pd.Timestamp('2023-01-01',tz='Europe/Brussels')
end_date = pd.Timestamp('2023-08-30',tz='Europe/Brussels')

# Filter rows between the specified dates (both bounds included)
df_to_plot = df_to_plot[df_to_plot['timestamps_local'].between(start_date, end_date)]

# Plotly Figure
fig = go.Figure()
colors = px.colors.qualitative.Plotly

# Add line plot for each variable
line_flag = True
if line_flag:
    for jj, column in enumerate(cols_to_plot):
        fig.add_trace(go.Scatter(x=df_to_plot['timestamps_local'], y=df_to_plot[column],
                                 mode='lines', name=column, marker_color=colors[jj]))

# Add point for each variable (takes more memory)
points_flag = False
if points_flag:
    for jj, col in enumerate(cols_to_plot):
        # name=col: the loop variable of THIS loop. The previous code used `column`,
        # left over from the line loop, which mislabelled every trace and raised
        # NameError when line_flag was False.
        fig.add_trace(go.Scatter(x=df_to_plot['timestamps_local'], y=df_to_plot[col],
                                 textposition='top left',
                                 textfont=dict(color='#233a77'),
                                 mode='markers+text', name=col,
                                 marker=dict(color=colors[jj], size=6)))

fig.update_layout({"title": 'General Plot',
                   "xaxis": {"title":"Time"},
                   "yaxis": {"title":"Variable"},
                   "showlegend": True})

fig.show()

In [None]:
df_to_plot

In [None]:
df_181.plot(x='timestamps_UTC',y=['RS_E_InAirTemp_PC1','RS_E_InAirTemp_PC2'],rot=45,subplots=False)

In [None]:
# Feature engineering
'''
- speed of the train
- is/is not in workshop
- inbound and outbound trajectories
- uphill/downhill
- use ARMA or ARIMA/prophet model to predict the next values -> select anomalies 
- weather
'''

## Test: Find the train trajectories with DBSCAN
Here I want to find an automatic method that determines, for each point, which trajectory it belongs to.
Clustering methods tried:
- DBSCAN: finds some trajectories, but not all of them correctly
- KNN: does not find the correct trajectories
- OPTICS: does not work

In [None]:
df_corrected = df_corrected.dropna(subset=['lon','lat'])
df_corrected

In [None]:
from sklearn.neighbors import NearestNeighbors
import plotly.express as px

# k-distance curve: used to read off a suitable neighbourhood radius for DBSCAN
X = df_corrected[['lon', 'lat']]

neighbors = 6
# Nearest-neighbour model fitted on the (lon, lat) positions
nbrs = NearestNeighbors(n_neighbors=neighbors).fit(X)
distances, indices = nbrs.kneighbors(X)

# Last column of the distance matrix, sorted from largest to smallest
kth_column = neighbors - 1
distance_desc = sorted(distances[:, kth_column], reverse=True)

# Rank (1..n) on x, distance on y
ranks = list(range(1, len(distance_desc) + 1))
px.line(x=ranks, y=distance_desc)

--> Knee-point is around y = 0.025

In [None]:
from sklearn.cluster import DBSCAN

# Create features for clustering
# NOTE(review): lon/lat are passed as plain 2-D coordinates, so `eps` below is
# expressed in degrees — confirm this is intended.
X = df_corrected[['lon', 'lat']]

# Perform DBSCAN clustering
epsilon = 0.025 # Radius for neighborhood (knee-point value noted after the k-distance plot)
min_samples = 10  # Minimum number of samples in a cluster
dbscan = DBSCAN(eps=epsilon, min_samples=min_samples)
# One label per row, stored as a new 'cluster' column on the shared frame
df_corrected['cluster'] = dbscan.fit_predict(X)
# Number of rows per cluster label
df_corrected['cluster'].value_counts()

In [170]:
# Test: find
import matplotlib.pyplot as plt 
import plotly.express as px

# Scatter plot of the positions, coloured by DBSCAN cluster label.
# Longitude goes on x and latitude on y so the figure matches the title and the
# orientation of the earlier longitude/latitude plot (the axes were swapped before).
fig = px.scatter(df_corrected, x='lon', y='lat', title='Latitude vs Longitude', color='cluster')

# Customize the marker size (you can adjust the 'size' parameter)
fig.update_traces(marker=dict(size=6))

In [171]:
df_corrected[df_corrected['cluster']==1]

Unnamed: 0,timestamps_local,id,vehicle_id,lat,lon,RS_E_InAirTemp_PC1,RS_E_InAirTemp_PC2,RS_E_OilPress_PC1,RS_E_OilPress_PC2,RS_E_RPM_PC1,...,RS_E_WatTemp_PC2,RS_T_OilTemp_PC1,RS_T_OilTemp_PC2,Sensors_NB_Fail,OutOfRange_NB,IsInvalidLine,IsTreated,ID_Org,mapped_veh_id_Org,cluster
59396,2023-08-17 16:05:00+02:00,1.093224e+07,181.0,50.940160,5.290017,42.000000,36.600000,265.900000,240.800000,939.600000,...,83.800000,82.200000,86.400000,0.0,0.0,0.0,0.0,1.085430e+07,181.0,1
59397,2023-08-17 16:10:00+02:00,8.447037e+06,181.0,50.937517,5.299740,45.111111,38.333333,129.111111,125.555556,446.444444,...,87.333333,81.222222,86.111111,0.0,0.0,0.0,0.0,8.404084e+06,181.0,1
59671,2023-08-18 15:00:00+02:00,8.720115e+06,181.0,50.936759,5.302489,26.500000,24.500000,558.250000,557.875000,800.125000,...,29.500000,22.000000,23.750000,0.0,0.0,0.0,0.0,8.684234e+06,181.0,1
59672,2023-08-18 15:05:00+02:00,7.780378e+06,181.0,50.936743,5.302485,28.600000,26.000000,515.200000,517.100000,799.000000,...,38.800000,24.700000,27.000000,0.0,0.0,0.0,0.0,7.864446e+06,181.0,1
59673,2023-08-18 15:10:00+02:00,1.275899e+07,181.0,50.936746,5.302486,30.833333,27.000000,489.500000,480.000000,807.000000,...,44.666667,30.333333,30.333333,0.0,0.0,0.0,0.0,1.268605e+07,181.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61709,2023-08-25 16:50:00+02:00,1.060337e+07,181.0,50.935593,5.308928,37.000000,38.000000,235.000000,207.750000,799.250000,...,87.500000,77.500000,85.000000,0.0,0.0,0.0,0.0,1.055511e+07,181.0,1
61710,2023-08-25 16:55:00+02:00,1.144022e+07,181.0,50.935584,5.308998,37.000000,39.142857,235.714286,207.857143,799.571429,...,87.285714,76.285714,85.428571,0.0,0.0,0.0,0.0,1.133694e+07,181.0,1
61711,2023-08-25 17:00:00+02:00,1.043273e+07,181.0,50.936644,5.304483,37.333333,39.666667,233.333333,206.583333,798.583333,...,85.416667,79.416667,87.083333,0.0,0.0,0.0,0.0,1.044886e+07,181.0,1
61712,2023-08-25 17:05:00+02:00,6.889240e+06,181.0,50.936369,5.309212,34.888889,33.777778,212.666667,184.222222,728.888889,...,77.444444,79.888889,87.333333,0.0,0.0,0.0,0.0,6.815691e+06,181.0,1


In [None]:
# Oil temperature (PC1) of train 181 over time.
# Colour by 'vehicle_id', the column name used everywhere else in this notebook.
# 'mapped_veh_id' is renamed to 'vehicle_id' when the full dataset is loaded, and it
# does not appear among the columns shown in the cluster output table above.
fig = px.scatter(df_181, x='timestamps_UTC', y='RS_T_OilTemp_PC1', color='vehicle_id')
fig.show()

In [None]:
'''
mapped_veh_id
lat
lon
RS_E_InAirTemp_PC1
RS_E_InAirTemp_PC2
RS_E_OilPress_PC1
RS_E_OilPress_PC2
RS_E_RPM_PC1
RS_E_RPM_PC2
RS_E_WatTemp_PC1
RS_E_WatTemp_PC2
RS_T_OilTemp_PC1
RS_T_OilTemp_PC2
'''

In [None]:
import requests

def get_current_weather(api_key, location, timeout=10):
    """
    Fetch the current temperature and weather description from OpenWeatherMap.

    Parameters:
    - api_key: OpenWeatherMap API key (sent as the `appid` query parameter)
    - location: place name sent as the `q` query parameter (e.g. "Brussels")
    - timeout: seconds to wait for the server before giving up (default 10)

    Returns:
    - dict with `temperature` (metric units are requested) and `weather_description`
      on success, otherwise a dict with a single `error` message. Nothing is raised:
      network and parsing failures are reported through the `error` key.
    """
    # HTTPS keeps the API key out of clear-text traffic
    base_url = "https://api.openweathermap.org/data/2.5/weather?"

    # Prepare the parameters for the API request
    params = {
        "q": location,
        "appid": api_key,
        "units": "metric",  # You can change this to "imperial" for Fahrenheit
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failure
        if response.status_code != 200:
            return {
                "error": "Failed to fetch weather data. Check your location or API key."
            }

        data = response.json()

        # Extract relevant weather information
        return {
            "temperature": data["main"]["temp"],
            "weather_description": data["weather"][0]["description"],
        }

    except Exception as e:
        # Best-effort helper: any failure (network, JSON, missing key) becomes an error dict
        return {
            "error": str(e)
        }

# Example usage:
if __name__ == "__main__":
    import os

    # The key is read from the environment instead of being committed in the notebook.
    # The key that used to be hard-coded here has been published with the file and
    # should be revoked.
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    location = "Brussels"  # Replace with the location you want to check

    if not api_key:
        print("Error: set the OPENWEATHERMAP_API_KEY environment variable first")
    else:
        weather_data = get_current_weather(api_key, location)

        if "error" in weather_data:
            print("Error:", weather_data["error"])
        else:
            temperature = weather_data["temperature"]
            description = weather_data["weather_description"]
            print(f"Current temperature in {location}: {temperature}°C")
            print(f"Weather: {description}")


In [None]:
import requests
import datetime

def get_historical_weather(api_key, location, timestamp, timeout=10):
    """
    Fetch the temperature 60 days before `timestamp` from the OpenWeatherMap
    One Call 3.0 "timemachine" endpoint.

    NOTE: this notebook defines several functions with this name; the definition in
    the most recently executed cell is the one that is called.

    Parameters:
    - api_key: OpenWeatherMap API key (sent as `appid`)
    - location: dict with `lat` and `lon` keys
    - timestamp: Unix timestamp in seconds; the request is made for 60 days earlier
    - timeout: seconds to wait for the server before giving up (default 10)

    Returns:
    - {"temperature": value} (metric units are requested) on success, otherwise
      {"error": message}. Nothing is raised: failures are reported through `error`.
    """
    # Calculate the timestamp for 2 months (60 days) ago
    two_months_ago = datetime.datetime.fromtimestamp(timestamp) - datetime.timedelta(days=60)
    timestamp_two_months_ago = int(two_months_ago.timestamp())

    # Define the base URL for the OpenWeatherMap historical data API
    base_url = "https://api.openweathermap.org/data/3.0/onecall/timemachine?"

    # Prepare the parameters for the API request
    params = {
        "lat": location["lat"],
        "lon": location["lon"],
        "dt": timestamp_two_months_ago,
        "appid": api_key,
        "units": "metric",  # You can change this to "imperial" for Fahrenheit
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failure
        if response.status_code != 200:
            return {
                "error": "Failed to fetch historical weather data. Check your location or API key."
            }

        data = response.json()

        # The 3.0 timemachine endpoint returns its records under "data";
        # "hourly" (the key read previously) is kept as a fallback.
        records = data.get("data") or data.get("hourly")
        return {
            "temperature": records[0]["temp"]
        }

    except Exception as e:
        # Best-effort helper: any failure (network, JSON, missing key) becomes an error dict
        return {
            "error": str(e)
        }

# Example usage:
if __name__ == "__main__":
    import os

    # The key is read from the environment instead of being committed in the notebook.
    # The key that used to be hard-coded here has been published with the file and
    # should be revoked.
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    location = {
        "lat": 45,  # Replace with the latitude of your location
        "lon": 54,  # Replace with the longitude of your location
    }
    timestamp = 1629986400  # Reference Unix timestamp; the query is made for 60 days before it

    if not api_key:
        print("Error: set the OPENWEATHERMAP_API_KEY environment variable first")
    else:
        weather_data = get_historical_weather(api_key, location, timestamp)

        if "error" in weather_data:
            print("Error:", weather_data["error"])
        else:
            temperature = weather_data["temperature"]
            print(f"Temperature at {location} 2 months ago: {temperature}°C")


In [None]:
import requests
import datetime

def get_historical_weather(api_key, location, timestamp, timeout=10):
    """
    Fetch the temperature at the start of the 24-hour window that begins 60 days
    before `timestamp`, using the OpenWeatherMap History API.

    NOTE: this notebook defines several functions with this name; the definition in
    the most recently executed cell is the one that is called.

    Parameters:
    - api_key: OpenWeatherMap API key (sent as `appid`)
    - location: dict with `lat` and `lon` keys
    - timestamp: Unix timestamp in seconds; the window starts 60 days earlier
    - timeout: seconds to wait for the server before giving up (default 10)

    Returns:
    - {"temperature": value} (metric units are requested) on success, otherwise
      {"error": message}. Nothing is raised: failures are reported through `error`.
    """
    # Window: [timestamp - 60 days, timestamp - 59 days]
    two_months_ago = datetime.datetime.fromtimestamp(timestamp) - datetime.timedelta(days=60)
    two_months_ago_end = two_months_ago + datetime.timedelta(days=1)
    timestamp_two_months_ago = int(two_months_ago.timestamp())
    timestamp_two_months_ago_end = int(two_months_ago_end.timestamp())

    # Define the base URL for the OpenWeatherMap historical data API
    base_url = 'https://history.openweathermap.org/data/2.5/history/city?'
    #base_url = "https://api.openweathermap.org/data/3.0/onecall/timemachine?"

    # Prepare the parameters for the API request
    params = {
        "lat": location["lat"],
        "lon": location["lon"],
        "start": timestamp_two_months_ago,
        "end": timestamp_two_months_ago_end,
        "appid": api_key,
        'type':'hour',
        "units": "metric",  # You can change this to "imperial" for Fahrenheit
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failure
        if response.status_code != 200:
            return {
                "error": "Failed to fetch historical weather data. Check your location or API key."
            }

        data = response.json()

        # The History API returns its records under "list", with the temperature
        # nested in "main"; the previously read "hourly" layout is kept as a fallback.
        if "list" in data:
            temperature = data["list"][0]["main"]["temp"]
        else:
            temperature = data["hourly"][0]["temp"]

        return {
            "temperature": temperature
        }

    except Exception as e:
        # Best-effort helper: any failure (network, JSON, missing key) becomes an error dict
        return {
            "error": str(e)
        }

# Example usage:
if __name__ == "__main__":
    import os

    # The key is read from the environment instead of being committed in the notebook.
    # The key that used to be hard-coded here has been published with the file and
    # should be revoked.
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    location = {
        "lat": 45,  # Replace with the latitude of your location
        "lon": 54,  # Replace with the longitude of your location
    }
    timestamp = 1629986400  # Reference Unix timestamp; the query is made for 60 days before it

    if not api_key:
        print("Error: set the OPENWEATHERMAP_API_KEY environment variable first")
    else:
        weather_data = get_historical_weather(api_key, location, timestamp)

        if "error" in weather_data:
            print("Error:", weather_data["error"])
        else:
            temperature = weather_data["temperature"]
            print(f"Temperature at {location} 2 months ago: {temperature}°C")


In [None]:
import requests
import datetime

def get_historical_weather(api_key, location, timestamp, timeout=10):
    """
    Fetch the first hourly temperature returned by the OpenWeatherMap One Call 3.0
    endpoint for a location.

    NOTE: despite the name, no date is sent with the request — the date parameters
    were commented out — so `timestamp` has no effect on the result. It is kept only
    so existing calls keep working.
    NOTE: this notebook defines several functions with this name; the definition in
    the most recently executed cell is the one that is called.

    Parameters:
    - api_key: OpenWeatherMap API key (sent as `appid`)
    - location: dict with `lat` and `lon` keys
    - timestamp: unused (see note above)
    - timeout: seconds to wait for the server before giving up (default 10)

    Returns:
    - {"temperature": value} (metric units are requested) on success, otherwise
      {"error": message}. Nothing is raised: failures are reported through `error`.
    """
    # Define the base URL for the OpenWeatherMap One Call API
    base_url = "https://api.openweathermap.org/data/3.0/onecall?"

    # Prepare the parameters for the API request
    params = {
        "lat": location["lat"],
        "lon": location["lon"],
        "appid": api_key,
        "units": "metric",  # You can change this to "imperial" for Fahrenheit
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failure
        if response.status_code != 200:
            return {
                "error": "Failed to fetch historical weather data. Check your location or API key."
            }

        data = response.json()

        # First entry of the hourly series
        return {
            "temperature": data["hourly"][0]["temp"]
        }

    except Exception as e:
        # Best-effort helper: any failure (network, JSON, missing key) becomes an error dict
        return {
            "error": str(e)
        }

# Example usage:
if __name__ == "__main__":
    import os

    # The key is read from the environment instead of being committed in the notebook.
    # The key that used to be hard-coded here has been published with the file and
    # should be revoked.
    api_key = os.environ.get("OPENWEATHERMAP_API_KEY", "")
    location = {
        "lat": 45,  # Replace with the latitude of your location
        "lon": 54,  # Replace with the longitude of your location
    }
    timestamp = 1629986400  # Passed for signature compatibility; the function in this cell sends no date

    if not api_key:
        print("Error: set the OPENWEATHERMAP_API_KEY environment variable first")
    else:
        weather_data = get_historical_weather(api_key, location, timestamp)

        if "error" in weather_data:
            print("Error:", weather_data["error"])
        else:
            temperature = weather_data["temperature"]
            # The request in this cell carries no date, so the value is the first
            # hourly entry of the response, not a reading from two months ago.
            print(f"Temperature at {location} (first hourly entry): {temperature}°C")


In [None]:
import requests

def get_historical_weather(api_key, location, timestamp, timeout=10):
    """
    Fetch the first hourly temperature for a date from the Weatherbit history API.

    NOTE: this notebook defines several functions with this name; the definition in
    the most recently executed cell is the one that is called.

    Parameters:
    - api_key: Weatherbit API key (sent as `key`)
    - location: dict with `lat` and `lon` keys
    - timestamp: date string (YYYY-MM-DD) sent as both `start_date` and `end_date`
    - timeout: seconds to wait for the server before giving up (default 10)

    Returns:
    - {"temperature": value} (metric units are requested) on success, otherwise
      {"error": message}. Nothing is raised: failures are reported through `error`.
    """
    # Define the base URL for the Weatherbit historical data API
    base_url = "https://api.weatherbit.io/v2.0/history/hourly"

    # Prepare the parameters for the API request
    # NOTE(review): the same date is sent as start and end; verify against the
    # Weatherbit documentation that a zero-length range is accepted.
    params = {
        "lat": location["lat"],
        "lon": location["lon"],
        "start_date": timestamp,  # The date for which you want historical data (YYYY-MM-DD)
        "end_date": timestamp,    # Use the same date for start and end for hourly data
        "key": api_key,
        "units": "M",  # You can change this to "I" for Imperial units (Fahrenheit)
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely
        response = requests.get(base_url, params=params, timeout=timeout)

        # Anything other than HTTP 200 is reported as a failure
        if response.status_code != 200:
            return {
                "error": "Failed to fetch historical weather data. Check your location or API key."
            }

        data = response.json()

        # First record of the returned series
        return {
            "temperature": data["data"][0]["temp"]
        }

    except Exception as e:
        # Best-effort helper: any failure (network, JSON, missing key) becomes an error dict
        return {
            "error": str(e)
        }

# Example usage (the key below is a placeholder, not a real credential):
if __name__ == "__main__":
    api_key = "YOUR_WEATHERBIT_API_KEY"
    # Coordinates of the place to query
    location = {"lat": 40.7128, "lon": -74.0060}
    # Day to query, formatted YYYY-MM-DD
    date = "2023-08-15"

    weather_data = get_historical_weather(api_key, location, date)

    if "error" not in weather_data:
        temperature = weather_data["temperature"]
        print(f"Temperature at {location} on {date}: {temperature}°C")
    else:
        print("Error:", weather_data["error"])


In [None]:
df_corrected

In [None]:
# Index the train-181 frame by its UTC timestamp.
# Rebinding the name (instead of inplace=True) and checking for the column keeps
# the cell safe to re-run: a second in-place call raised KeyError because the
# column had already been moved into the index.
if 'timestamps_UTC' in df_181.columns:
    df_181 = df_181.set_index('timestamps_UTC')

In [None]:
# Half-hourly mean of the train-181 frame (the variable is named df_1h, but the
# bins are 30 minutes wide). '30min' replaces '0.5H', whose 'H' alias is
# deprecated in recent pandas versions.
df_1h = df_181.sort_values(by='timestamps_UTC').resample('30min').mean()


In [None]:
import plotly.express as px 
# Plot the resampled data using Plotly.
# df_1h (the half-hourly means computed in the previous cell) is plotted; the
# un-resampled df_181 was being plotted under the 'Resampled Data Plot' title.
fig = px.line(df_1h, x=df_1h.index, y=['RS_E_InAirTemp_PC1', 'RS_E_InAirTemp_PC2'], title='Resampled Data Plot')
fig.show()

# Testing Dask

In [None]:
import dask.dataframe as dd
import dask
import pandas as pd

# Define a Dask dataframe from the CSV file.
# The file is ';'-separated (the pandas loader at the top of the notebook reads the
# same file with sep=';'); without it every row would be parsed as a single column.
ddf = dd.read_csv('./dataset/ar41_for_ulb.csv', sep=';')
ddf

In [None]:
import dask.dataframe as dd
import dask
import pandas as pd

# Define a Dask dataframe from the CSV file.
# The file is ';'-separated (the pandas loader at the top of the notebook reads the
# same file with sep=';').
ddf = dd.read_csv('./dataset/ar41_for_ulb.csv', sep=';')

# Convert the 'timestamps_UTC' column to datetime.
# NOTE(review): unit='s' was removed. The pandas path of this notebook casts the same
# column with astype('datetime64[ns]') and gets 2023 dates, which indicates the values
# are not epoch seconds — confirm against the raw file.
ddf['timestamps_UTC'] = dd.to_datetime(ddf['timestamps_UTC'])

# Set the 'timestamps_UTC' column as the index
ddf = ddf.set_index('timestamps_UTC')

# Resample the data to an hourly frequency and calculate the mean
# NOTE(review): the 'H' alias is deprecated in recent pandas versions.
ddf_resampled = ddf.resample('H').mean()

# Compute the result (this is where the computation actually happens).
# dask.compute returns a tuple holding the already-computed results.
result = dask.compute(ddf_resampled)

# The first element is the computed (pandas) frame; calling .compute() on it again,
# as the previous code did, fails because it is no longer a Dask object.
df_resampled = result[0]

# Now you can plot df_resampled using your preferred plotting library (e.g., Matplotlib or Plotly)

In [None]:
ddf.map_partitions(len).compute()

In [None]:
ddf

In [None]:
ddf