In [None]:
"""
environmental_eda.ipynb

Exploratory data analysis of environmental sensor data collected across
five commercial-style poultry barns over 22 consecutive weeks.

Input folder structure expected:
    Sample_Dataset/
    └── Environmental/
          └── *.csv  (one or more CSV files; each filename must contain
                      "Room<N>", e.g. Room1.csv, so the room can be identified)

This notebook automatically detects all CSV files in that folder, loads and
combines them into a single table, and then runs the cleaning and EDA steps
on the combined data.

Dependencies:
    pip install pandas matplotlib seaborn scipy
"""
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# =============================================================================
# LOAD — Automatically detect and load all CSV files in the folder
# =============================================================================

INPUT_DIR = os.path.join("Sample_Dataset", "Environmental")

# Find all CSV files in the folder regardless of filename.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them to
# keep the listing and the combined frame identical from run to run.
csv_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.csv")))

if not csv_files:
    # Stop here with a clear message; otherwise pd.concat below would fail on
    # an empty list with an unrelated "No objects to concatenate" error.
    raise FileNotFoundError(f"[ERROR] No CSV files found in: {INPUT_DIR}")

print(f"[INFO] Found {len(csv_files)} CSV file(s):")
for f in csv_files:
    print(f"  → {os.path.basename(f)}")

# Load every file and tag each row with the name of the file it came from,
# then stack everything into a single frame with a fresh 0..n-1 index.
dfs = []
for filepath in csv_files:
    df = pd.read_csv(filepath)
    df["Source_File"] = os.path.basename(filepath)
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)

print(f"\nTotal records loaded : {len(data)}")
print(f"Columns detected     : {list(data.columns)}")
print("\nFirst 5 rows:")
print(data.head())

In [None]:
# =============================================================================
# CLEANING — Parse dates, extract room, remove NAs
# =============================================================================

# Parse date column; values that cannot be parsed become NaT and are dropped below
data["date"] = pd.to_datetime(data["date"], errors="coerce")

# The load step stores the originating filename under "Source_File". Accept the
# lowercase spelling too, so this cell works whichever name the frame carries.
source_col = next(
    (name for name in ("source_file", "Source_File") if name in data.columns),
    None,
)
if source_col is None:
    raise KeyError(
        "No source filename column ('source_file' / 'Source_File') found in data"
    )

# Extract room number from source filename.
# expand=False returns a Series (one capture group) rather than a DataFrame.
room_ids = data[source_col].str.extract(r"Room(\d+)", expand=False)

# Fail with the offending filenames instead of an opaque NaN-to-int cast error
if room_ids.isna().any():
    unmatched = sorted(data.loc[room_ids.isna(), source_col].astype(str).unique())
    raise ValueError(
        f"Could not extract a room number (pattern 'Room<N>') from: {unmatched}"
    )
data["room"] = room_ids.astype(int)

# Drop rows where date could not be parsed
before = len(data)
data   = data.dropna(subset=["date"])
print(f"Rows dropped due to unparseable dates: {before - len(data)}")

# Drop fully duplicate rows
before = len(data)
data   = data.drop_duplicates()
print(f"Duplicate rows removed: {before - len(data)}")

# Sort by room and date
data = data.sort_values(["room", "date"]).reset_index(drop=True)

print(f"\nClean dataset: {len(data)} records")
print(f"Date range: {data['date'].min().date()} → {data['date'].max().date()}")
print(f"Rooms detected: {sorted(data['room'].unique())}")
print("\nMissing values per column:")
print(data.isnull().sum())

In [None]:
# =============================================================================
# TABLE 3 — Room-level AM and PM mean values for temperature and humidity
# =============================================================================

# Output column name -> (source column, aggregation) for the per-room means
room_mean_spec = {
    "Temp_AM_Min_Mean": ("temp_am_min", "mean"),
    "Temp_AM_Max_Mean": ("temp_am_max", "mean"),
    "Temp_PM_Mean":     ("temp_pm",     "mean"),
    "RH_AM_Mean":       ("rh_am",       "mean"),
    "RH_PM_Mean":       ("rh_pm",       "mean"),
}

# One row per room, values rounded to two decimals
table3 = data.groupby("room").agg(**room_mean_spec).round(2)

# Print the table framed by 60-character rules
rule = "=" * 60
print("\n".join([
    rule,
    "  TABLE 3 — Room-Level AM and PM Means",
    rule,
    table3.to_string(),
    rule,
]))

In [None]:
# =============================================================================
# TABLE 4 — Descriptive statistics by room
# =============================================================================

# Statistics reported for every variable: output column -> aggregation
summary_stats = {"Mean": "mean", "Std": "std", "Min": "min", "Max": "max"}

# Variables to summarise, paired with the label used in the printed heading
variables = [
    ("temp_am_min", "Temperature AM Min"),
    ("temp_am_max", "Temperature AM Max"),
    ("temp_pm",     "Temperature PM"),
    ("rh_am",       "Relative Humidity AM"),
    ("rh_pm",       "Relative Humidity PM"),
]

rule = "=" * 60
for col, label in variables:
    # Per-room mean / std / min / max of this variable, two decimals
    table4 = data.groupby("room")[col].agg(**summary_stats).round(2)

    print(rule)
    print(f"  TABLE 4 — {label} by Room")
    print(rule)
    print(table4.to_string())
    print()

In [None]:
# =============================================================================
# TABLE 5 — Aggregated descriptive statistics across all rooms
# =============================================================================

# Source column -> display name used as the table's column header
display_names = {
    "temp_am_min": "Temp AM Min",
    "temp_am_max": "Temp AM Max",
    "temp_pm":     "Temp PM",
    "rh_am":       "RH AM",
    "rh_pm":       "RH PM",
}

# Rows are the statistics, columns are the variables (all rooms pooled)
table5 = (
    data[list(display_names)]
    .agg(["mean", "std", "min", "max"])
    .round(2)
    .rename(columns=display_names)
)

rule = "=" * 60
print(rule)
print("  TABLE 5 — Aggregated Statistics Across All Rooms")
print(rule)
print(table5.to_string())
print(rule)

In [None]:
# =============================================================================
# FIGURE — Temperature trends over time by room
# =============================================================================

# One stacked panel per temperature variable, sharing the date axis
fig, axes = plt.subplots(3, 1, figsize=(14, 12), sharex=True)

for ax, col, label in zip(
    axes,
    ["temp_am_min", "temp_am_max", "temp_pm"],
    ["Temperature AM Min", "Temperature AM Max", "Temperature PM"]
):
    # One line per room; colours come from matplotlib's default cycle so the
    # rooms stay distinguishable within a panel
    for room in sorted(data["room"].unique()):
        subset = data[data["room"] == room]
        ax.plot(subset["date"], subset[col],
                label=f"Room {room}", linewidth=0.9, alpha=0.85)
    ax.set_title(label, fontsize=11)
    ax.set_ylabel("Temperature (°C)")
    # Legend sits just outside the right edge of the axes
    ax.legend(title="Room", bbox_to_anchor=(1.01, 1), loc="upper left", fontsize=8)
    ax.grid(True, linestyle="--", alpha=0.4)

axes[-1].set_xlabel("Date")
plt.suptitle("Temperature Trends Over Time by Room", fontsize=13, y=1.01)
plt.tight_layout()
# The suptitle is anchored at y=1.01, i.e. above the figure canvas;
# bbox_inches="tight" grows the saved area so it is not cropped in the PNG.
plt.savefig(os.path.join(INPUT_DIR, "temperature_trends.png"), dpi=150,
            bbox_inches="tight")
plt.show()

In [None]:
# =============================================================================
# FIGURE — Relative humidity trends over time by room
# =============================================================================

# One stacked panel per humidity variable, sharing the date axis
fig, axes = plt.subplots(2, 1, figsize=(14, 8), sharex=True)

for ax, col, label in zip(
    axes,
    ["rh_am", "rh_pm"],
    ["Relative Humidity AM", "Relative Humidity PM"]
):
    # One line per room, drawn in ascending room order
    for room in sorted(data["room"].unique()):
        subset = data[data["room"] == room]
        ax.plot(subset["date"], subset[col],
                label=f"Room {room}", linewidth=0.9, alpha=0.85)
    ax.set_title(label, fontsize=11)
    ax.set_ylabel("Relative Humidity (%)")
    # Legend sits just outside the right edge of the axes
    ax.legend(title="Room", bbox_to_anchor=(1.01, 1), loc="upper left", fontsize=8)
    ax.grid(True, linestyle="--", alpha=0.4)

axes[-1].set_xlabel("Date")
plt.suptitle("Relative Humidity Trends Over Time by Room", fontsize=13, y=1.01)
plt.tight_layout()
# The suptitle is anchored at y=1.01, i.e. above the figure canvas;
# bbox_inches="tight" grows the saved area so it is not cropped in the PNG.
plt.savefig(os.path.join(INPUT_DIR, "humidity_trends.png"), dpi=150,
            bbox_inches="tight")
plt.show()

In [None]:
# =============================================================================
# FIGURE — AM vs PM boxplot comparison by room
# =============================================================================

# Reshape to long format: one frame per (variable, period) with a common value
# column name and a "period" tag, then stack AM on top of PM.
temp_am = (
    data[["room", "date", "temp_am_max"]]
    .rename(columns={"temp_am_max": "temperature"})
    .assign(period="AM")
)
temp_pm = (
    data[["room", "date", "temp_pm"]]
    .rename(columns={"temp_pm": "temperature"})
    .assign(period="PM")
)
rh_am = (
    data[["room", "date", "rh_am"]]
    .rename(columns={"rh_am": "humidity"})
    .assign(period="AM")
)
rh_pm = (
    data[["room", "date", "rh_pm"]]
    .rename(columns={"rh_pm": "humidity"})
    .assign(period="PM")
)

temp_long = pd.concat([temp_am, temp_pm], ignore_index=True)
rh_long   = pd.concat([rh_am,   rh_pm],   ignore_index=True)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Same AM/PM colours in both panels
period_palette = {"AM": "steelblue", "PM": "coral"}

# (axes, long-format frame, value column, panel title, y-axis label)
panels = [
    (axes[0], temp_long, "temperature",
     "Temperature — AM vs PM by Room", "Temperature (°C)"),
    (axes[1], rh_long, "humidity",
     "Relative Humidity — AM vs PM by Room", "Relative Humidity (%)"),
]

for ax, frame, value_col, title, ylabel in panels:
    sns.boxplot(data=frame, x="room", y=value_col, hue="period",
                palette=period_palette, ax=ax)
    ax.set_title(title, fontsize=12)
    ax.set_xlabel("Room")
    ax.set_ylabel(ylabel)
    ax.grid(True, linestyle="--", alpha=0.4)

plt.tight_layout()
plt.savefig(os.path.join(INPUT_DIR, "am_pm_comparison.png"), dpi=150)
plt.show()

In [None]:
# =============================================================================
# STATISTICAL TEST — Independent t-test AM vs PM
# =============================================================================

rule = "=" * 60
print(rule)
print("  AM vs PM T-Test Results")
print(rule)

# (heading, AM column, PM column) for each comparison
comparisons = [
    ("Temperature", "temp_am_max", "temp_pm"),
    ("Relative Humidity", "rh_am", "rh_pm"),
]

for label, am_col, pm_col in comparisons:
    # Missing readings are dropped separately for each column
    am = data[am_col].dropna()
    pm = data[pm_col].dropna()

    # NOTE(review): ttest_ind treats AM and PM as independent samples, although
    # both columns come from the same rows — confirm an unpaired test is intended.
    t_stat, p_value = stats.ttest_ind(am, pm)

    if p_value < 0.05:
        sig = "significant"
    else:
        sig = "not significant"

    print(f"\n  {label}")
    print(f"    AM mean     : {am.mean():.4f}")
    print(f"    PM mean     : {pm.mean():.4f}")
    print(f"    t-statistic : {t_stat:.4f}")
    print(f"    p-value     : {p_value:.4f}  ({sig} at α = 0.05)")

print("\n" + rule)