In [19]:
import pandas as pd
import sqlite3

# Function to process a CSV and return a cleaned DataFrame
def process_csv(file_path, columns_to_keep) -> pd.DataFrame:
    """Load a CSV, keep selected columns and split 'timestamp' into date/time.

    Parameters
    ----------
    file_path
        Anything ``pd.read_csv`` accepts as its first argument.
    columns_to_keep
        Column labels to retain, in the desired order. Must include
        ``'timestamp'``. If it also names ``'date'`` or ``'time'``, those
        columns are overwritten (in their existing position) by the values
        derived from ``'timestamp'``.

    Returns
    -------
    pd.DataFrame
        The selected columns without ``'timestamp'``, plus ``'date'``
        (``datetime.date`` objects) and ``'time'`` (``'HH:MM:SS'`` strings).
        Every missing value, including the date/time of timestamps that
        could not be parsed, is replaced with the string ``'missing'``.

    Raises
    ------
    KeyError
        If a requested column (or ``'timestamp'``) is not present.
    """
    raw = pd.read_csv(file_path)

    # Work on an explicit copy of the selection: assigning new columns to a
    # frame derived from another one triggers SettingWithCopyWarning on pandas
    # versions without Copy-on-Write. list() lets callers pass any sequence of
    # labels (a tuple would otherwise be looked up as a single key).
    df = raw[list(columns_to_keep)].copy()

    # Unparseable timestamps become NaT instead of raising.
    parsed = pd.to_datetime(df['timestamp'], errors='coerce')

    # Separate the timestamp into 'date' and 'time' columns
    df['date'] = parsed.dt.date
    df['time'] = parsed.dt.strftime('%H:%M:%S')

    # The original 'timestamp' column is no longer needed
    df = df.drop(columns=['timestamp'])

    # Fill every remaining missing value (NaN / NaT) with 'missing'
    return df.fillna('missing')

# Columns to carry over from each CSV.
# NOTE: both 'timestamp' and 'time' are listed. process_csv derives a 'time'
# column from 'timestamp', so the values of any 'time' column read from the
# CSV are overwritten by the derived ones.
columns_to_keep = [
    'Overall ID', 'Experiment ID', 'Location ID', 'Measurement ID',
    'Experiment Condition', 'timestamp', 'time',
    'Temperature (Weather)', 'Humidity (Weather)', 'Wind Speed (Weather)',
    'AQI (Pollution)', 'CO Concentration', 'NO2 Concentration'
]

# Update the database path (new database name)
db_path = '/Users/stephaniebellew/Desktop/Metadata_Ventilation_Experiment.db'

# List of file paths for the four CSVs
file_paths = [
    '/Users/stephaniebellew/Desktop/Experiment1_Location1.csv',
    '/Users/stephaniebellew/Desktop/Experiment1_Location2.csv',
    '/Users/stephaniebellew/Desktop/Experiment2_location1_.csv',
    '/Users/stephaniebellew/Desktop/Experiment2_location2_.csv'
]

# Connect to SQLite database (it will create the file if it doesn't exist)
conn = sqlite3.connect(db_path)
try:
    # Process each CSV file and append the data to the database.
    # NOTE: if_exists='append' adds rows to an existing table, so running this
    # cell again inserts the same rows a second time.
    for file_path in file_paths:
        df = process_csv(file_path, columns_to_keep)
        df.to_sql('Metadata_Ventilation_Experiment', conn, if_exists='append', index=False)

    # Verify by fetching data back from the database (at most 100000 rows)
    query = "SELECT * FROM Metadata_Ventilation_Experiment LIMIT 100000;"
    result = pd.read_sql(query, conn)
finally:
    # Always release the connection, even if loading or querying failed
    conn.close()

# Display the fetched rows (pandas abbreviates long frames when printing)
print(result)


      Overall ID  Experiment ID  Location ID  Measurement ID  \
0          1_1_1              1            1               1   
1          1_1_2              1            1               2   
2          1_1_3              1            1               3   
3          1_1_4              1            1               4   
4          1_1_5              1            1               5   
...          ...            ...          ...             ...   
99995  2_2_14415              2            2           14415   
99996  2_2_14416              2            2           14416   
99997  2_2_14417              2            2           14417   
99998  2_2_14418              2            2           14418   
99999  2_2_14419              2            2           14419   

      Experiment Condition      time  Temperature (Weather)  \
0            Window Closed  22:34:26                   1.29   
1            Window Closed  22:34:27                   1.29   
2            Window Closed  22:34:29      