### EDA Lecture - Finals Assignment

#### Temperature Sensor Data Outlier Detection

##### Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

##### Load & prepare the data

In [None]:
# Read each CSV export under ./Data into its own DataFrame.
df_set_temp = pd.read_csv('./Data/Setpoint_LazienkaGorna.csv')
df_measured_temp = pd.read_csv('./Data/Temperatura_LazienkaGorna.csv')
df_outside_temp = pd.read_csv('./Data/TemperaturaZewnetrzna.csv')

In [None]:
# Preview the first rows of every frame, in the order they were loaded.
for preview_frame in (df_set_temp, df_measured_temp, df_outside_temp):
    print(preview_frame.head())

In [None]:
# Parse the setpoint frame's raw 'time' column into pandas datetimes.
# NOTE(review): pd.to_datetime is called without format/unit, so the result
# depends on how the CSV stores timestamps - confirm the parsed dates look right.

df_set_temp['time'] = df_set_temp['time'].pipe(pd.to_datetime)
df_set_temp.head()

In [None]:
# Parse each frame's OWN 'time' column.
# Assigning the setpoint frame's timestamps here would index-align them onto
# unrelated sensor rows: wrong times, and NaT wherever the indexes don't match.
df_measured_temp['time'] = pd.to_datetime(df_measured_temp['time'])
df_outside_temp['time'] = pd.to_datetime(df_outside_temp['time'])

##### Visualize the data

In [None]:
# One stacked panel per dataset; every panel plots 'value' against 'time'
# and uses the same axis labelling.
panels = [
    (df_set_temp, 'Set Temp'),
    (df_measured_temp, 'Measured Temp'),
    (df_outside_temp, 'Outside Temp'),
]

fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize=(15, 5))

for ax, (frame, title) in zip(axes, panels):
    ax.plot(frame['time'], frame['value'], label=title)
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Redraw the plots with a smaller timeframe

# Number of leading rows drawn from every dataset.
PREVIEW_ROWS = 8000

preview_panels = [
    (df_set_temp, 'Set Temp'),
    (df_measured_temp, 'Measured Temp'),
    (df_outside_temp, 'Outside Temp'),
]

fig, axes = plt.subplots(nrows=len(preview_panels), ncols=1, figsize=(15, 5))

for ax, (frame, title) in zip(axes, preview_panels):
    # Positional slice: the first PREVIEW_ROWS rows regardless of index labels.
    window = frame.iloc[:PREVIEW_ROWS]
    ax.plot(window['time'], window['value'], label=title)
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Standard deviation of each dataset's 'value' column (pandas Series.std defaults).
set_temp_deviation = df_set_temp['value'].std()
measured_temp_deviation = df_measured_temp['value'].std()
outside_temp_deviation = df_outside_temp['value'].std()

# Report in the same order the frames were loaded.
print(f'Set temp deviation: {set_temp_deviation}')

print(f'Measured temp deviation: {measured_temp_deviation}')

print(f'Outside temp deviation: {outside_temp_deviation}')

##### Find min / max / median / mean values and IQR outlier thresholds

In [None]:
# Frames to summarise, in load order.
df_sets = [df_set_temp, df_measured_temp, df_outside_temp]

# Print a small block of descriptive statistics per frame, followed by a blank line.
for dataset in df_sets:
    readings = dataset["value"]
    # The frame is identified by the 'name' entry stored under index label 0.
    print(f'Data: {dataset["name"][0]}')
    print(f'MIN: {readings.min()}')
    print(f'MAX: {readings.max()}')
    print(f'Median: {readings.median()}')
    print(f'Mean: {readings.mean()}')
    print()

In [None]:
def calculate_outlier_threshold(data, multiplier=1.5):
    """Return the IQR-based outlier fences for ``data``.

    Args:
        data: Object exposing ``quantile(q)``; the notebook passes a pandas
            Series of numeric readings.
        multiplier: How many interquartile ranges beyond the quartiles the
            fences are placed. Defaults to 1.5, the value the notebook
            originally hard-coded.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple where
        ``lower_bound = q1 - multiplier * iqr`` and
        ``upper_bound = q3 + multiplier * iqr``.

    Raises:
        ValueError: If ``multiplier`` is negative (the fences would cross
            inside the quartile range).
    """
    if multiplier < 0:
        raise ValueError(f'multiplier must be non-negative, got {multiplier}')

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1  # spread of the middle 50% of the data

    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    return lower_bound, upper_bound

In [None]:
# Report the IQR fences for each dataset in turn. lower_bound / upper_bound are
# rebound on every call, so after this cell they hold the outside-temp fences.
set_values = df_set_temp['value']
lower_bound, upper_bound = calculate_outlier_threshold(set_values)
print(f'Outlier Threshold for the set temp: below {lower_bound} or above {upper_bound}')

measured_values = df_measured_temp['value']
lower_bound, upper_bound = calculate_outlier_threshold(measured_values)
print(f'Outlier Threshold for the measured temp: below {lower_bound} or above {upper_bound}')

outside_values = df_outside_temp['value']
lower_bound, upper_bound = calculate_outlier_threshold(outside_values)
print(f'Outlier Threshold for the outside temp: below {lower_bound} or above {upper_bound}')

##### Look for outliers in data

In [None]:
from scipy.stats import zscore

# Standardise the measured values and keep the rows whose |z-score| exceeds 3.
# NOTE(review): zscore is called with its defaults; if 'value' can contain NaN,
# verify the scores before trusting an empty anomaly set.
df_measured_temp['z_score'] = zscore(df_measured_temp['value'])
is_extreme = df_measured_temp['z_score'].abs() > 3
anomalies = df_measured_temp[is_extreme]

In [None]:
# Overlay the z-score anomalies on the measured temperature series.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_measured_temp['time'], df_measured_temp['value'], label='Measured Temp')
# zorder=3 keeps the markers above the line (matplotlib draws lines over
# collections by default).
ax.scatter(anomalies['time'], anomalies['value'], color='red', label='Anomalies', zorder=3)
# Title and labels let this figure be told apart from the other anomaly plots.
ax.set_title('Measured Temp - Z-score anomalies (|z| > 3)')
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
# Reuse the notebook's calculate_outlier_threshold helper instead of repeating
# the quartile arithmetic, so both places always apply the same IQR rule.
measured_values = df_measured_temp['value']
lower_bound, upper_bound = calculate_outlier_threshold(measured_values)

# Keep rows strictly outside the fences.
is_outside_fences = (measured_values < lower_bound) | (measured_values > upper_bound)
anomalies = df_measured_temp[is_outside_fences]

In [None]:
# Overlay the IQR-rule anomalies on the measured temperature series.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_measured_temp['time'], df_measured_temp['value'], label='Measured Temp')
# zorder=3 keeps the markers above the line (matplotlib draws lines over
# collections by default).
ax.scatter(anomalies['time'], anomalies['value'], color='red', label='Anomalies', zorder=3)
# Title and labels let this figure be told apart from the other anomaly plots.
ax.set_title('Measured Temp - IQR anomalies')
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.legend()
plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Fixed seed: IsolationForest builds randomised trees, so without it the set of
# flagged rows changes on every Restart & Run All.
RANDOM_STATE = 42

# contamination=0.05 asks the model to flag roughly 5% of the rows.
model = IsolationForest(contamination=0.05, random_state=RANDOM_STATE)
# fit_predict labels each row 1 (inlier) or -1 (outlier).
df_measured_temp['anomaly'] = model.fit_predict(df_measured_temp[['value']])
anomalies = df_measured_temp[df_measured_temp['anomaly'] == -1]

In [None]:
# Overlay the Isolation Forest anomalies on the measured temperature series.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_measured_temp['time'], df_measured_temp['value'], label='Measured Temp')
# zorder=3 keeps the markers above the line (matplotlib draws lines over
# collections by default).
ax.scatter(anomalies['time'], anomalies['value'], color='red', label='Anomalies', zorder=3)
# Title and labels let this figure be told apart from the other anomaly plots.
ax.set_title('Measured Temp - Isolation Forest anomalies')
ax.set_xlabel('Time')
ax.set_ylabel('Value')
ax.legend()
plt.show()