<a href="https://colab.research.google.com/github/miguelcasadinho/colab/blob/main/weather_station.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Weather Station Data

Part 1 - Data cleaning

In [1]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [46]:
# Load the weather readings from the CSV file into a DataFrame
dataset = pd.read_csv('weather.csv')

# Show (rows, columns) as a quick sanity check on what was loaded
loaded_shape = dataset.shape
print(loaded_shape)

(119372, 9)


In [47]:
# Round to the nearest 5 minutes
def round_to_nearest_5_minutes(timestamp):
    """Return ``timestamp`` rounded to the nearest 5-minute mark.

    Seconds and microseconds take part in the rounding decision, so
    12:02:45 becomes 12:05 instead of being cut down to 12:02 first and then
    rounded to 12:00. An exact tie (2 min 30 s past a mark) rounds up.

    Args:
        timestamp: A ``datetime.datetime``-like object (for example a
            ``pandas.Timestamp``) exposing ``minute``, ``second``,
            ``microsecond`` and ``replace``.

    Returns:
        A timestamp on a 5-minute boundary with seconds and microseconds
        set to zero. Rounding up past minute 55 rolls over into the next
        hour (and the next day, month or year when needed).
    """
    microseconds_per_second = 1000000
    step_microseconds = 5 * 60 * microseconds_per_second

    hour_start = timestamp.replace(minute=0, second=0, microsecond=0)
    offset_microseconds = (
        (timestamp.minute * 60 + timestamp.second) * microseconds_per_second
        + timestamp.microsecond
    )

    # Integer half-up rounding to a whole number of 5-minute steps (0..12).
    steps = (offset_microseconds + step_microseconds // 2) // step_microseconds

    # Adding a timedelta (rather than replace(hour=hour + 1)) makes step 12
    # roll over correctly at 23:xx and at month/year boundaries.
    return hour_start + timedelta(minutes=5 * steps)

In [48]:
# Parse the 'date' column into datetime values
parsed_dates = pd.to_datetime(dataset['date'])

# Snap every timestamp onto the 5-minute grid and store it back
dataset['date'] = parsed_dates.apply(round_to_nearest_5_minutes)

# The row/column count is unchanged by this step; print it as a check
print(dataset.shape)

(119372, 9)


In [49]:
# Missing-value handling
# Flag every row that has at least one missing value and keep a copy of
# those rows for inspection
has_missing_value = dataset.isna().any(axis='columns')
dataset_rows_with_nan = dataset.loc[has_missing_value]

# Drop the incomplete rows from the working frame (in place)
dataset.dropna(axis='index', how='any', inplace=True)

print(dataset.shape)

(119372, 9)


In [50]:
# Handling duplicate timestamps
# A row counts as a duplicate when its 'date' already appeared in an earlier row
is_repeated_timestamp = dataset.duplicated(subset=['date'])
dataset_duplicate_rows = dataset.loc[is_repeated_timestamp]
print("Number of duplicate rows in dataset:", dataset_duplicate_rows.shape[0])

# Keep only the first row for each timestamp (in place; the index labels of
# the surviving rows are left as they were)
dataset.drop_duplicates(subset=['date'], inplace=True)

print(dataset.shape)

Number of duplicate rows in dataset: 34
(119338, 9)


In [51]:
# Outlier detection and treatment using Z-score
from scipy.stats import zscore

# Set the Z-score threshold (e.g., 5 standard deviations)
threshold = 5


def _replace_outliers_by_interpolation(frame, column, z_threshold):
    """Replace Z-score outliers in ``frame[column]`` with interpolated values.

    Values whose absolute Z-score exceeds ``z_threshold`` are blanked to NaN,
    filled by linear interpolation, and any NaN still left is forward-filled.
    ``frame[column]`` is overwritten with the cleaned series.

    The outliers are selected with a positional boolean mask. The Z-scores
    come back as a plain array, so ``np.where`` on them yields row
    *positions*; passing those to the label-based ``.loc`` hits the wrong
    rows (or raises ``KeyError``) once earlier row drops have left gaps in
    the index labels.

    Args:
        frame: DataFrame to clean; modified through column assignment.
        column: Name of the numeric column to treat.
        z_threshold: Absolute Z-score above which a value is an outlier.

    Returns:
        Tuple ``(z_scores, outlier_positions)`` where ``z_scores`` is the
        array of Z-scores and ``outlier_positions`` is the ``np.where``
        result (row positions, suitable for ``.iloc``).
    """
    z_scores = np.asarray(zscore(frame[column].to_numpy(dtype=float)))
    is_outlier = np.abs(z_scores) > z_threshold

    # Series.mask with a boolean array works by position, not by label.
    blanked = frame[column].mask(is_outlier)
    frame[column] = blanked.interpolate(method='linear').ffill()

    return z_scores, np.where(is_outlier)


# Treat light_intensity, then uv_index. The two result names keep the values
# from the last column treated (uv_index).
z_scores_dataset, dataset_outliers = _replace_outliers_by_interpolation(
    dataset, 'light_intensity', threshold
)
z_scores_dataset, dataset_outliers = _replace_outliers_by_interpolation(
    dataset, 'uv_index', threshold
)

# Rows are never removed here, only repaired, so the shape stays the same
print(dataset.shape)

(119338, 9)


In [52]:
# Fill gaps in the time series
# Build the full 5-minute grid between the first and the last timestamp
dataset_complete_time_index = pd.date_range(
    start=dataset['date'].min(),
    end=dataset['date'].max(),
    freq='5min',
)

# Align the data to that grid (missing timestamps become all-NaN rows), then
# turn the index back into a regular 'date' column
dataset = (
    dataset.set_index('date')
    .reindex(dataset_complete_time_index)
    .reset_index()
    .rename(columns={'index': 'date'})
)

# Fill every measurement column: linear interpolation first, then a forward
# fill for anything the interpolation left empty
columns_to_fill = [
    'air_temperature',
    'air_humidity',
    'light_intensity',
    'uv_index',
    'wind_speed',
    'wind_direction',
    'rain_gauge',
    'barometric_pressure',
]
for measurement in columns_to_fill:
    interpolated = dataset[measurement].interpolate(method='linear')
    dataset[measurement] = interpolated.ffill()

print(dataset.shape)


(120879, 9)


In [53]:
# Use the timestamp as the index (in place)
dataset.set_index('date', inplace=True)

# Pull the measurement columns out as a NumPy array, in this column order
feature_columns = [
    'air_temperature',
    'air_humidity',
    'light_intensity',
    'uv_index',
    'wind_speed',
    'wind_direction',
    'rain_gauge',
    'barometric_pressure',
]
values = dataset[feature_columns].to_numpy()
print(dataset.shape)

(120879, 8)
