# In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load the dataset
# The CSV is read from the relative path 'data/', so the script must be run
# from the directory that contains that folder.
try:
    df = pd.read_csv('data/street_light_data_shuffled.csv')
except FileNotFoundError:
    print("Error: 'street_light_data_shuffled.csv' not found in data directory.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive sessions and is not
    # guaranteed to exist in every interpreter setup. A non-zero status also
    # lets callers (shells, CI jobs) detect that the run failed.
    raise SystemExit(1)

# --- Feature Engineering ---
# Convert the 'HH:MM:SS' timestamp strings to seconds elapsed since midnight
# so the model can use the time of day as a numeric feature.
# The strings are parsed once and both derived columns are computed from the
# parsed values with vectorised `.dt` accessors, rather than running a Python
# lambda on every row.
parsed_timestamps = pd.to_datetime(df['timestamp'], format='%H:%M:%S')

# Keep the 'timestamp' column as plain time-of-day objects (no date part).
df['timestamp'] = parsed_timestamps.dt.time
df['seconds_of_day'] = (
    parsed_timestamps.dt.hour * 3600
    + parsed_timestamps.dt.minute * 60
    + parsed_timestamps.dt.second
)

# Use 'ambience_lux' and the new 'seconds_of_day' as features.
X = df[['ambience_lux', 'seconds_of_day']]

# --- Target Variable Preparation ---
# Convert the 'light_needed' column from text ('yes'/'no') to numbers (1/0),
# which is the form the logistic regression model is trained on.
y = df['light_needed'].map({'yes': 1, 'no': 0})

# Series.map() turns every value missing from the mapping (typos, different
# casing, blanks) into NaN. Fail here with the offending values listed rather
# than letting the NaNs reach model fitting, where the cause is harder to see.
unmapped_rows = y.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(
        df.loc[unmapped_rows, 'light_needed'].astype(str).unique()
    )
    raise ValueError(
        "'light_needed' must contain only 'yes' or 'no'; "
        f"found unexpected value(s): {unexpected_labels}"
    )

# --- Data Splitting ---
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state makes the shuffle deterministic, so every run
# produces the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report how many rows ended up on each side of the split.
n_train, n_test = X_train.shape[0], X_test.shape[0]
print(
    "Data successfully split:",
    f" - Training set size: {n_train} samples",
    f" - Testing set size: {n_test} samples",
    "-" * 30,
    sep="\n",
)

# --- Model Training ---
print("Training the logistic regression model...")

# Create a logistic regression classifier with scikit-learn's default
# settings and fit it on the training split. fit() returns the fitted
# estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train, y_train)

print("Model training complete.", "-" * 30, sep="\n")


# --- Save the Model ---
# Persist the trained model to disk so it can be loaded later with joblib.
model_filename = 'models/street_light_model.joblib'

# joblib.dump() does not create missing parent directories. Make sure
# 'models/' exists so a fresh checkout does not fail here, after training
# has already been done.
Path(model_filename).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(model, model_filename)
print(f"Model has been exported to '{model_filename}'")
print("-" * 30)


# --- Model Evaluation ---
# Predict labels for the held-out test rows, which were not used for fitting.
print("Evaluating the model on the test set...")
y_pred = model.predict(X_test)

# Overall fraction of test rows whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}", "-" * 30, sep="\n")

# --- Detailed Performance Report ---
# Per-class breakdown produced by scikit-learn's classification_report;
# class 0 is shown as 'No Light Needed' and class 1 as 'Light Needed'.
print("Classification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['No Light Needed', 'Light Needed'],
)
print(report_text, "-" * 30, sep="\n")

# --- Visualization of Results ---
# Draw the confusion matrix as an annotated heatmap and save it as a PNG in
# the current working directory.
class_labels = ['No Light Needed', 'Light Needed']

# Pin the class order explicitly. Without `labels`, confusion_matrix() only
# includes classes that actually occur in y_test/y_pred, so a missing class
# would shrink the matrix and the two hard-coded tick labels below would no
# longer line up with its rows and columns.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('confusion_matrix.png')
print("A confusion matrix visualization has been saved as 'confusion_matrix.png'.")

# Plot the test data points with the two features.
# This helps visualize how the features separate the two classes.
plt.figure(figsize=(10, 7))

# Create a dataframe for plotting: the test features plus the true class.
# The 0/1 target is converted to 'No'/'Yes' here so that seaborn itself
# generates correctly labelled legend entries.
plot_df = X_test.copy()
plot_df['light_needed'] = y_test.map({0: 'No', 1: 'Yes'})

sns.scatterplot(
    data=plot_df,
    x='ambience_lux',
    y='seconds_of_day',
    hue='light_needed',
    hue_order=['No', 'Yes'],  # fixed order keeps class colours stable
    palette='viridis',
    alpha=0.7,
    s=50,
)
plt.title('Test Data Distribution: Ambience vs. Time of Day')
plt.xlabel('Ambience (Lux)')
plt.ylabel('Seconds of Day')
# Only set the legend title. Passing `labels=[...]` to plt.legend() rebuilds
# the legend from the axes' artists instead of seaborn's hue entries, so the
# text can end up attached to the wrong colour or an entry can go missing.
plt.legend(title='Light Needed')
plt.grid(True)
plt.savefig('feature_distribution.png')
print("A plot of the feature distribution has been saved as 'feature_distribution.png'.")