## EDA Exam - Term 3

### Motion sensor analysis

#### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
from matplotlib.dates import DateFormatter

In [None]:
# Lift pandas' display limits so printed frames show every row and column

for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

#### Loading the data

In [None]:
# Read the sensor export into a DataFrame and preview the first rows
csv_path = './Data/Swiatlo_Schody.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Parse the raw `time` column into pandas datetimes and preview the result.
# NOTE(review): no `unit=` is passed, so if the CSV stores integer UNIX epochs
# they are interpreted as nanoseconds; confirm the column's encoding
# (epoch seconds/milliseconds would need unit='s' / unit='ms').

df['time'] = pd.to_datetime(df['time'])
df.head()

#### EDA

In [None]:
# Earliest and latest timestamps present in the data

time_column = df['time']
first_entry, last_entry = time_column.min(), time_column.max()

for boundary in (first_entry, last_entry):
    display(boundary)

In [None]:
# Store the hour-of-day of every sample in a new `hour` column, then
# summarise that column to see which part of the day the samples cover

df['hour'] = df['time'].dt.hour

hour_summary = df['hour'].describe()
hour_summary

In [None]:
# Check if each day contains 24hrs worth of data

# Raw preview of the first 1000 rows (kept from the original cell)
print(df.head(1000))

# The preview alone cannot answer the question, so count how many distinct
# hours-of-day have at least one reading on each calendar date.
hours_per_day = df['time'].dt.hour.groupby(df['time'].dt.date).nunique()

# Dates on which one or more hours have no reading at all
incomplete_days = hours_per_day[hours_per_day < 24]

print(f"{len(incomplete_days)} of {len(hours_per_day)} days have readings in fewer than 24 distinct hours")
print(incomplete_days)

In [None]:
# Look for long uninterrupted stretches with the light on

# Give every run of identical consecutive readings its own id: the running
# sum increases by one each time `value` differs from the previous row
value_changed = df['value'].shift() != df['value']
df['group'] = value_changed.cumsum()

# Keep only the "on" runs (value == 1) whose span from first to last sample
# is at least the threshold
min_cycle_span = timedelta(minutes=5)
on_rows = df[df['value'] == 1]
filtered_groups = on_rows.groupby('group').filter(
    lambda run: (run['time'].max() - run['time'].min()) >= min_cycle_span
)

# Report the start, end and length of each remaining run
for _, cycle in filtered_groups.groupby('group'):
    start_time, end_time = cycle['time'].min(), cycle['time'].max()
    print("Cycle:", start_time, "to", end_time, "- Duration:", end_time - start_time)

#### Plots and data visualization

##### EDA techniques and statistical analysis

##### Average amount of detections per hour

In [None]:
# Rows where the sensor reported value 1

is_active = df['value'] == 1
df_active = df[is_active]

# Per-hour detection totals divided by the number of distinct calendar dates
# in the whole dataset, giving an average count per hour of day

days_observed = df['time'].dt.date.nunique()
detections_per_hour = df_active.groupby('hour').size()
hourly_avg_counts = detections_per_hour / days_observed
hourly_avg_counts

##### Activation plots based on daily data

In [None]:
dates_to_plot = [
    '2020-10-01',
    '2021-01-01',
    '2021-03-01',
    '2021-06-01',
    '2021-10-01',
    '2022-01-01',
    '2022-03-01',
    '2022-06-01',
    '2022-10-01',
    '2023-01-01',
]

# Draw one figure per selected date showing the light state across that day

for specific_date in dates_to_plot:
    # Midnight at the start of the day and midnight 24 hours later
    day_start = pd.to_datetime(specific_date)
    day_end = day_start + pd.Timedelta(days=1)

    # Rows recorded on this calendar date
    filtered_df = df[df['time'].dt.date == day_start.date()]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(filtered_df['time'], filtered_df['value'], marker='o', linestyle='', color='blue', label='Light State')
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['Off', 'On'])
    ax.set_title('Light Status on ' + specific_date)
    ax.set_xlabel('Time (Hours)')
    ax.set_ylabel('Light Status')
    ax.legend()
    ax.grid(True)

    # Pin the x-axis to the full 24-hour window and label ticks by hour only
    ax.set_xlim(day_start, day_end)
    ax.xaxis.set_major_formatter(DateFormatter("%H"))

    plt.show()

##### Time Series Plot

In [None]:
# Count activations per hour of day.
# Only rows with value == 1 (light on) are counted: the plot is labelled
# "Activations", and counting every row would also include the value == 0
# (light off) readings.

activation_rows = df[df['value'] == 1]
hour_counts = activation_rows.groupby('hour').size()

# Line plot of the activation count against hour of day

plt.figure(figsize=(12, 6))
plt.plot(hour_counts.index, hour_counts.values, marker='o', linestyle='-')
plt.title('Time Series of Sensor Activations by Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Activations')
plt.grid(True)
plt.show()

##### Seasonal Time Series Plot

In [None]:
def get_season(timestamp):
    """Return the season name for the month of ``timestamp``.

    Args:
        timestamp: Any object exposing a ``month`` attribute.

    Returns:
        'Spring' for months 3-5, 'Summer' for 6-8, 'Autumn' for 9-11,
        and 'Winter' for every other month value.
    """
    season_ranges = (
        ('Spring', 3, 5),
        ('Summer', 6, 8),
        ('Autumn', 9, 11),
    )
    month = timestamp.month
    for season_name, first_month, last_month in season_ranges:
        if first_month <= month <= last_month:
            return season_name
    # Anything outside the March-November ranges counts as winter
    return 'Winter'

# Create a new column for the season

df['season'] = df['time'].apply(get_season)

# Count activations per (year, season, hour).
# Only rows with value == 1 (light on) are counted so the numbers match the
# "Number of Activations" axis label; value == 0 rows are light-off readings.

active_rows = df[df['value'] == 1]
seasonal_hour_counts = active_rows.groupby(
    [
        active_rows['time'].dt.year.rename('year'),
        'season',
        active_rows['time'].dt.hour.rename('hour'),
    ]
).size()

# NOTE(review): grouping is by calendar year, so a "Winter <year>" plot
# combines January/February and December of that same year - confirm this
# is the intended definition of a winter season.

# Create a plot for each (year, season) pair

for (year, season), season_profile in seasonal_hour_counts.groupby(level=['year', 'season']):
    # Drop the year/season levels so the remaining index is just the hour.
    # droplevel returns a new Series, so the groupby slice is not mutated.
    hourly_profile = season_profile.droplevel(['year', 'season'])

    plt.figure(figsize=(12, 6))
    plt.plot(hourly_profile.index, hourly_profile.values, marker='o', linestyle='-', label=season)
    plt.title(f'Time Series of Sensor Activations by Hour for {season} {year}')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Number of Activations')
    plt.legend()
    plt.grid(True)
    plt.show()

#### Model cluster detection

Apply clustering models to look for groups in the hourly motion-sensor light activation counts.

##### K-Means cluster

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Count activations per hour of day.
# Only value == 1 (light on) rows are counted so the numbers match the
# "Number of Activations" label; value == 0 rows are light-off readings.

hour_counts = df[df['value'] == 1].groupby('hour').size().reset_index(name='count')

# Visualize the data

plt.figure(figsize=(10, 6))
plt.plot(hour_counts['hour'], hour_counts['count'], marker='o', linestyle='-')
plt.title('Hourly Sensor Activation')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Activations')
plt.grid(True)
plt.show()

# Perform K-means clustering

# Standardise both features first: K-means uses Euclidean distance, so without
# scaling the feature with the larger numeric range would dominate the result.
# This also matches the preprocessing used in the OPTICS cell.
scaler = StandardScaler()
X = scaler.fit_transform(hour_counts[['hour', 'count']])

# random_state makes the clustering reproducible between runs; n_init is set
# explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
kmeans.fit(X)

# Get cluster centers, converted back to the original (hour, count) units so
# they can be drawn on the same axes as the data

cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Plot clusters

plt.figure(figsize=(10, 6))
plt.plot(hour_counts['hour'], hour_counts['count'], marker='o', linestyle='', label='Data')
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='red', marker='x', label='Cluster Centers')
plt.title('Hourly Sensor Activation with Clusters')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Activations')
plt.legend()
plt.grid(True)
plt.show()

##### OPTICS cluster

In [None]:
from sklearn.cluster import OPTICS
from sklearn.preprocessing import StandardScaler

# Count activations per hour of day.
# Only value == 1 (light on) rows are counted so the numbers match the
# "Number of Activations" label; value == 0 rows are light-off readings.

hour_counts = df[df['value'] == 1].groupby('hour').size().reset_index(name='count')

# Standardise hour and count so both features contribute on a comparable scale

scaler = StandardScaler()
X = scaler.fit_transform(hour_counts[['hour', 'count']])

# Apply the OPTICS clustering

optics = OPTICS(min_samples=2)
optics.fit(X)

# Get cluster labels (one per row of hour_counts)

labels = optics.labels_

# Plot the data in its original units, coloured by assigned cluster label

plt.figure(figsize=(10, 6))
plt.scatter(hour_counts['hour'], hour_counts['count'], c=labels, cmap='viridis', marker='o')
plt.title('Hourly Sensor Activation with OPTICS Clustering')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Activations')
plt.grid(True)
plt.colorbar(label='Cluster Label')
plt.show()