In [31]:
import os
import pandas as pd
import json
from io import StringIO



data_dir = './'

# These are the standard names we will use for the DataFrame columns.
standard_columns = ['timestamp', 'co2', 'humidity', 'temperature']


def room_name_from_filename(filename):
    """Derive a display room name from a data file name.

    Takes the text before the first '.', replaces underscores with spaces,
    keeps only the last six characters and title-cases them, e.g.
    'sensor_room_1.ndjson' -> 'Room 1'.

    NOTE(review): the fixed six-character slice is kept unchanged so existing
    room labels stay the same; a longer suffix such as 'room_10' is truncated
    to 'Oom 10'. Confirm this matches the actual file naming scheme.
    """
    return filename.split('.')[0].replace('_', ' ')[-6:].title()


def load_sensor_records(directory):
    """Read every '.ndjson' file in `directory` into a list of record dicts.

    Each non-blank line must be a JSON object with exactly
    len(standard_columns) fields. The values are mapped positionally (in key
    insertion order) onto `standard_columns`, and a 'room' key derived from the
    file name is added. Lines that cannot be used are reported with a warning
    and skipped; a file that cannot be read or decoded is reported and the
    records collected from it so far are kept.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of dicts with the keys of `standard_columns` plus 'room'.

    Raises:
        FileNotFoundError: If `directory` is not an existing directory.
    """
    if not os.path.isdir(directory):
        # Fail with a clear message instead of printing and then crashing in os.listdir.
        raise FileNotFoundError(f"Data directory '{directory}' not found.")

    expected_fields = len(standard_columns)
    records = []

    # os.listdir returns entries in arbitrary order; sort so row order is reproducible.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.ndjson'):
            continue

        room_name = room_name_from_filename(filename)
        file_path = os.path.join(directory, filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                for line_no, line in enumerate(f, start=1):
                    # Skip empty or whitespace-only lines
                    if not line.strip():
                        continue

                    try:
                        payload = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Warning: Skipping corrupted line {line_no} in {filename}. Error: {e}")
                        continue

                    # Valid JSON that is not an object (list, number, string, ...) has no
                    # .values(); skip just this line rather than abandoning the whole file.
                    if not isinstance(payload, dict):
                        print(f"Warning: Skipping malformed line {line_no} in {filename} "
                              f"(expected a JSON object, got {type(payload).__name__}).")
                        continue

                    # Values in insertion order (guaranteed for dicts in Python 3.7+)
                    values = list(payload.values())
                    if len(values) != expected_fields:
                        print(f"Warning: Skipping malformed line {line_no} in {filename} "
                              f"(expected {expected_fields} fields, got {len(values)}).")
                        continue

                    record = dict(zip(standard_columns, values))
                    record['room'] = room_name
                    records.append(record)

        except (OSError, UnicodeError) as e:
            # Per-line problems are handled above; only I/O and decoding errors end a file early.
            print(f"Error reading file {filename}: {e}")
            continue

    return records


def build_sensor_frame(records):
    """Build a typed DataFrame from the records produced by `load_sensor_records`.

    The frame always has the columns `standard_columns + ['room']`, even when
    `records` is empty. 'timestamp' is converted with `pd.to_datetime` and the
    remaining standard columns with `pd.to_numeric`, both with
    errors='coerce'; rows where any of those conversions produced a missing
    value are dropped.

    Args:
        records: List of dicts keyed by `standard_columns` plus 'room'.

    Returns:
        A new DataFrame containing only the fully converted rows.
    """
    # Passing columns explicitly keeps the schema stable for an empty record list,
    # so the conversions below do not fail with KeyError.
    frame = pd.DataFrame(records, columns=[*standard_columns, 'room'])

    frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')
    numeric_columns = [name for name in standard_columns if name != 'timestamp']
    for col in numeric_columns:
        frame[col] = pd.to_numeric(frame[col], errors='coerce')

    # Drop any rows where critical data conversion failed
    return frame.dropna(subset=standard_columns)


standardized_data = load_sensor_records(data_dir)

if not standardized_data:
    print("Error: No data could be loaded. Check file contents and paths.")

# Create the typed DataFrame from our list of standardized records
df = build_sensor_frame(standardized_data)



In [32]:
# Select the readings that belong to one room, then average that room's CO2 values.
room = "Room 1"
room_mask = df['room'].eq(room)
df_room1 = df.loc[room_mask]
average_co2 = df_room1.loc[:, 'co2'].mean()

In [33]:
# Display the mean CO2 value computed for the selected room.
average_co2

np.float64(701.2419330855018)