In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
def drop_columns_if_exist(df, columns_to_drop = ["latitude","longitude","elevation","status","STATE", "ID", "entry_id","UNIT", "CO", "field7","field8"]):
    """Return ``df`` without any of the columns named in ``columns_to_drop``.

    Names that are not present in ``df`` are skipped silently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to strip. It is NOT modified; a new frame is returned.
    columns_to_drop : list of str, optional
        Column names to remove when present. The default list is only read,
        never mutated, so sharing it between calls is safe.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns.
    """
    # errors="ignore" drops only the labels that exist, and returning the
    # result (instead of inplace=True) leaves the caller's frame intact.
    return df.drop(columns=list(columns_to_drop), errors="ignore")

In [3]:
def group_and_average_by_minute(df, datetime_column_name):
    """Average the columns of ``df`` over one-minute bins.

    The frame is first passed through ``drop_columns_if_exist`` (default
    column list), the timestamp column is parsed with ``pd.to_datetime``,
    and the remaining columns are resampled to a 1-minute mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input readings. It is NOT modified; all work happens on a copy.
    datetime_column_name : str
        Name of the column holding the timestamps.

    Returns
    -------
    pandas.DataFrame
        One row per minute with ``datetime_column_name`` as a regular column.
        Rows containing any NaN (e.g. minutes with no samples) are removed
        and the index is renumbered from 0.
    """
    # Copy first: the steps below drop columns and re-index, and doing that
    # on the caller's frame would make a second call on it fail with KeyError.
    df = drop_columns_if_exist(df.copy())
    df[datetime_column_name] = pd.to_datetime(df[datetime_column_name])

    # 'min' is the supported spelling of the one-minute frequency; the old
    # 'T' alias is deprecated since pandas 2.2. The bin labels produced by
    # resample already sit on whole minutes, so no extra rounding is needed.
    df_avg_per_minute = (
        df.set_index(datetime_column_name)
        .resample('min')
        .mean()
        .reset_index()
    )

    return df_avg_per_minute.dropna().reset_index(drop=True)


In [4]:
import pandas as pd

def merge_three_dataframes_by_datetime(df1, df2, df3, datetime_col_df1, datetime_col_df2, datetime_col_df3, timezone=None):
    """Inner-join three frames on their timestamp columns.

    ``df1``'s timestamps are shifted by +1 hour (UTC -> WAT) before joining.
    All three timestamp columns are parsed with ``pd.to_datetime`` and made
    timezone-naive so they compare on wall-clock time.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Frames to join. None of them is modified.
    datetime_col_df1, datetime_col_df2, datetime_col_df3 : str
        Name of the timestamp column in the corresponding frame.
    timezone : optional
        Currently unused; kept so existing call signatures stay valid.

    Returns
    -------
    pandas.DataFrame
        Rows whose timestamp is present in all three frames, keyed by
        ``datetime_col_df1``; the join-key columns of ``df2`` and ``df3``
        are removed.
    """
    # Work on copies so the callers' frames keep their original columns.
    left = df1.copy()
    middle = df2.copy()
    right = df3.copy()

    # Apply the UTC -> WAT shift to the column the caller named (previously
    # "created_at" was hard-coded), then strip any timezone information.
    left[datetime_col_df1] = (
        pd.to_datetime(left[datetime_col_df1]) + pd.Timedelta(hours=1)
    ).dt.tz_localize(None)
    middle[datetime_col_df2] = pd.to_datetime(middle[datetime_col_df2]).dt.tz_localize(None)
    right[datetime_col_df3] = pd.to_datetime(right[datetime_col_df3]).dt.tz_localize(None)

    merged_df = left
    for other, other_key in ((middle, datetime_col_df2), (right, datetime_col_df3)):
        merged_df = pd.merge(merged_df, other, left_on=datetime_col_df1, right_on=other_key, how='inner')
        # When both sides use the same key name pandas keeps a single key
        # column; dropping it would throw away the join key itself.
        if other_key != datetime_col_df1:
            merged_df = merged_df.drop(columns=[other_key])

    return merged_df


In [5]:
# Build the Alpha dataset: for each of the nine recording days, average the
# Alpha sensor log and the CO2 / CO reference logs per minute, join them on
# the timestamp, and stack the daily results into one frame.
data_length_calib_device = []      # raw row count of each day's CO.csv
data_length_low_cost_sensors = []  # raw row count of each day's Alpha.csv
alpha_daily_frames = []

for day in range(1, 10):
    print("Processing ################# Day {} ############################".format(day))

    # Days 1-6 keep the reference CSVs directly in the day folder;
    # days 7-9 nest them inside CO2/ and CO/ sub-folders.
    if day <= 6:
        co2_path = "Day 0{}/CO2.csv".format(day)
        co_path = "Day 0{}/CO.csv".format(day)
    else:
        co2_path = "Day 0{}/CO2/CO2.csv".format(day)
        co_path = "Day 0{}/CO/CO.csv".format(day)

    alpha = pd.read_csv("Day 0{}/Alpha.csv".format(day))
    co2 = pd.read_csv(co2_path)
    co = pd.read_csv(co_path)
    data_length_calib_device.append(co.shape[0])
    data_length_low_cost_sensors.append(alpha.shape[0])

    co2 = co2.rename(columns = {"SHOW":"CO2"})
    co = co.rename(columns = {"SHOW":"CO"})
    # NOTE(review): "CO" is in drop_columns_if_exist's default list, so this
    # column is discarded inside group_and_average_by_minute and the saved
    # dataset has no CO column - confirm that this is intended.
    co["CO"] = co["CO"].fillna(0)

    alpha = group_and_average_by_minute(alpha, 'created_at')
    co2 = group_and_average_by_minute(co2, 'TIME')
    co = group_and_average_by_minute(co, 'TIME')

    merged_data = merge_three_dataframes_by_datetime(alpha, co2, co, 'created_at', 'TIME', 'TIME')
    # Skip the first 30 merged minutes of every day.
    # NOTE(review): presumably a sensor warm-up window - confirm.
    merged_data = merged_data.iloc[30:].reset_index(drop=True)
    alpha_daily_frames.append(merged_data)

# DataFrame.append was removed in pandas 2.0 (and copies the whole frame on
# every call); concatenate the collected daily frames once instead.
alpha_dataset = pd.concat(alpha_daily_frames, ignore_index=True)
alpha_dataset = alpha_dataset.rename(columns={'SHOW': 'CO2',"field1":"Temperature", "field2":"Humidity", "field3":"MQ7_analog", "field4":"MQ9_analog", "field5":"MG811_analog", "field6":"MQ135_analog", "field7":"MG811_Digital", "field8":"MQ7_Digital"})
alpha_dataset.to_csv("alpha.csv", index=False)
alpha_dataset.tail()

Processing ################# Day 1 ############################
Processing ################# Day 2 ############################
Processing ################# Day 3 ############################
Processing ################# Day 4 ############################
Processing ################# Day 5 ############################
Processing ################# Day 6 ############################
Processing ################# Day 7 ############################
Processing ################# Day 8 ############################
Processing ################# Day 9 ############################


Unnamed: 0,created_at,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
2809,2023-06-29 14:43:00,30.64,67.47,2395.5,3520.0,1698.5,2071.5,583.583333
2810,2023-06-29 14:44:00,30.665,67.985,2399.5,3503.5,1697.5,2076.5,584.5
2811,2023-06-29 14:45:00,30.76,70.285,2413.5,3541.5,1692.5,2084.0,583.416667
2812,2023-06-29 14:46:00,30.92,69.88,2409.5,3515.0,1692.0,2083.0,585.6
2813,2023-06-29 15:40:00,31.62,70.53,2380.0,3460.0,1731.0,2009.0,573.166667


In [6]:
# Tally of the raw rows counted while the Alpha dataset was built (the two
# length lists are filled only in that cell).
# NOTE(review): the low-cost sensor row count is scaled by 4; the notebook
# does not say what the factor stands for - confirm.
low_cost_points = sum(data_length_low_cost_sensors)*4
calibration_points = sum(data_length_calib_device)

print("sensors", low_cost_points)
print("Calibration device", calibration_points)

print("Estimated Total data points gathered", low_cost_points + calibration_points)

sensors 30336
Calibration device 12923
Estimated Total data points gathered 43259


In [7]:
# Build the Beta dataset: for each of the nine recording days, average the
# Beta sensor log and the CO2 / CO reference logs per minute, join them on
# the timestamp, and stack the daily results into one frame.
beta_daily_frames = []

for day in range(1, 10):
    print("Processing ################# Day {} ############################".format(day))

    # Days 1-6 keep the reference CSVs directly in the day folder;
    # days 7-9 nest them inside CO2/ and CO/ sub-folders.
    if day <= 6:
        co2_path = "Day 0{}/CO2.csv".format(day)
        co_path = "Day 0{}/CO.csv".format(day)
    else:
        co2_path = "Day 0{}/CO2/CO2.csv".format(day)
        co_path = "Day 0{}/CO/CO.csv".format(day)

    Beta = pd.read_csv("Day 0{}/Beta.csv".format(day))
    co2 = pd.read_csv(co2_path)
    co = pd.read_csv(co_path)

    co2 = co2.rename(columns = {"SHOW":"CO2"})
    co = co.rename(columns = {"SHOW":"CO"})
    # NOTE(review): "CO" is in drop_columns_if_exist's default list, so this
    # column is discarded inside group_and_average_by_minute and the saved
    # dataset has no CO column - confirm that this is intended.
    co["CO"] = co["CO"].fillna(0)

    Beta = group_and_average_by_minute(Beta, 'created_at')
    co2 = group_and_average_by_minute(co2, 'TIME')
    co = group_and_average_by_minute(co, 'TIME')

    merged_data = merge_three_dataframes_by_datetime(Beta, co2, co, 'created_at', 'TIME', 'TIME')
    # Skip the first 30 merged minutes of every day.
    # NOTE(review): presumably a sensor warm-up window - confirm.
    merged_data = merged_data.iloc[30:].reset_index(drop=True)
    beta_daily_frames.append(merged_data)

# DataFrame.append was removed in pandas 2.0 (and copies the whole frame on
# every call); concatenate the collected daily frames once instead.
Beta_dataset = pd.concat(beta_daily_frames, ignore_index=True)
Beta_dataset = Beta_dataset.rename(columns={'SHOW': 'CO2',"field1":"Temperature", "field2":"Humidity", "field3":"MQ7_analog", "field4":"MQ9_analog", "field5":"MG811_analog", "field6":"MQ135_analog", "field7":"MG811_Digital", "field8":"MQ7_Digital"})
Beta_dataset.to_csv("Beta.csv", index=False)
Beta_dataset.tail()

Processing ################# Day 1 ############################
Processing ################# Day 2 ############################
Processing ################# Day 3 ############################
Processing ################# Day 4 ############################
Processing ################# Day 5 ############################
Processing ################# Day 6 ############################
Processing ################# Day 7 ############################
Processing ################# Day 8 ############################
Processing ################# Day 9 ############################


Unnamed: 0,created_at,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
2919,2023-06-29 14:43:00,29.315,68.575,3251.0,4808.0,3515.5,3567.0,583.583333
2920,2023-06-29 14:44:00,29.345,70.175,3232.5,4784.5,3499.0,3555.5,584.5
2921,2023-06-29 14:45:00,29.39,72.345,3247.5,4787.0,3511.5,3560.5,583.416667
2922,2023-06-29 14:46:00,29.47,71.06,3215.0,4746.0,3497.0,3540.0,585.6
2923,2023-06-29 15:40:00,30.29,69.66,3143.5,4711.0,3554.5,3587.0,573.166667


In [8]:
# Build the Charlie dataset: for each of the nine recording days, average the
# Charlie sensor log and the CO2 / CO reference logs per minute, join them on
# the timestamp, and stack the daily results into one frame.
charlie_daily_frames = []

for day in range(1, 10):
    print("Processing ################# Day {} ############################".format(day))

    # Days 1-6 keep the reference CSVs directly in the day folder;
    # days 7-9 nest them inside CO2/ and CO/ sub-folders.
    if day <= 6:
        co2_path = "Day 0{}/CO2.csv".format(day)
        co_path = "Day 0{}/CO.csv".format(day)
    else:
        co2_path = "Day 0{}/CO2/CO2.csv".format(day)
        co_path = "Day 0{}/CO/CO.csv".format(day)

    Charlie = pd.read_csv("Day 0{}/Charlie.csv".format(day))
    co2 = pd.read_csv(co2_path)
    co = pd.read_csv(co_path)

    co2 = co2.rename(columns = {"SHOW":"CO2"})
    co = co.rename(columns = {"SHOW":"CO"})
    # NOTE(review): "CO" is in drop_columns_if_exist's default list, so this
    # column is discarded inside group_and_average_by_minute and the saved
    # dataset has no CO column - confirm that this is intended.
    co["CO"] = co["CO"].fillna(0)

    Charlie = group_and_average_by_minute(Charlie, 'created_at')
    co2 = group_and_average_by_minute(co2, 'TIME')
    co = group_and_average_by_minute(co, 'TIME')

    merged_data = merge_three_dataframes_by_datetime(Charlie, co2, co, 'created_at', 'TIME', 'TIME')
    # Skip the first 30 merged minutes of every day.
    # NOTE(review): presumably a sensor warm-up window - confirm.
    merged_data = merged_data.iloc[30:].reset_index(drop=True)
    charlie_daily_frames.append(merged_data)

# DataFrame.append was removed in pandas 2.0 (and copies the whole frame on
# every call); concatenate the collected daily frames once instead.
Charlie_dataset = pd.concat(charlie_daily_frames, ignore_index=True)
Charlie_dataset = Charlie_dataset.rename(columns={'SHOW': 'CO2',"field1":"Temperature", "field2":"Humidity", "field3":"MQ7_analog", "field4":"MQ9_analog", "field5":"MG811_analog", "field6":"MQ135_analog", "field7":"MG811_Digital", "field8":"MQ7_Digital"})
Charlie_dataset.to_csv("Charlie.csv", index=False)
Charlie_dataset.tail()

Processing ################# Day 1 ############################
Processing ################# Day 2 ############################
Processing ################# Day 3 ############################
Processing ################# Day 4 ############################
Processing ################# Day 5 ############################
Processing ################# Day 6 ############################
Processing ################# Day 7 ############################
Processing ################# Day 8 ############################
Processing ################# Day 9 ############################


Unnamed: 0,created_at,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
2856,2023-06-29 14:43:00,30.42,66.32,3374.0,3243.0,2931.0,2481.0,583.583333
2857,2023-06-29 14:44:00,30.275,68.545,3281.5,3210.5,2885.5,2439.5,584.5
2858,2023-06-29 14:45:00,30.476667,69.843333,3386.333333,3257.333333,2913.333333,2474.0,583.416667
2859,2023-06-29 14:46:00,30.67,69.19,3337.0,3232.0,2905.0,2461.0,585.6
2860,2023-06-29 15:40:00,31.42,67.975,3395.5,3299.5,2882.0,2461.5,573.166667


In [9]:
# Column-wise minimum of the merged Alpha dataset (sanity check of value ranges).
alpha_dataset.min()

created_at      2023-05-01 03:29:00
Temperature                  27.665
Humidity                       52.2
MQ7_analog                   2380.0
MQ9_analog                   1098.5
MG811_analog                 1353.0
MQ135_analog                 1186.5
CO2                      573.166667
dtype: object

In [10]:
# Column-wise minimum of the merged Beta dataset (sanity check of value ranges).
Beta_dataset.min()

created_at      2023-05-01 03:30:00
Temperature                   23.44
Humidity                  59.903333
MQ7_analog                   3109.5
MQ9_analog                   3555.0
MG811_analog                 2412.5
MQ135_analog                 3156.5
CO2                      573.166667
dtype: object

In [11]:
# Column-wise minimum of the merged Charlie dataset (sanity check of value ranges).
Charlie_dataset.min()

created_at      2023-05-01 03:29:00
Temperature                   23.93
Humidity                     60.935
MQ7_analog                   3128.0
MQ9_analog                   3210.5
MG811_analog                 2479.0
MQ135_analog                 2381.0
CO2                      573.166667
dtype: object

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Alpha dataset.
alpha_dataset.describe()

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
count,2814.0,2814.0,2814.0,2814.0,2814.0,2814.0,2814.0
mean,31.830263,62.862624,3630.742715,2203.750118,4279.351042,2548.040393,611.46564
std,1.233422,5.447757,1082.643577,985.415917,1337.378345,727.726906,16.391828
min,27.665,52.2,2380.0,1098.5,1353.0,1186.5,573.166667
25%,31.715,58.905,2863.625,1498.541667,3171.125,2072.625,600.0
50%,31.91,61.455,3341.0,1729.25,4425.166667,2682.0,608.0
75%,32.54,66.875,4023.875,2810.875,5184.0,2935.5,621.0
max,33.85,77.395,9545.5,7823.0,7919.0,5594.0,677.0


In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Beta dataset.
Beta_dataset.describe()

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
count,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0
mean,28.244193,74.063788,3983.875456,4497.156065,3407.310078,4000.155894,611.855441
std,1.38156,5.494902,610.322973,1058.452935,643.839651,518.073859,16.673599
min,23.44,59.903333,3109.5,3555.0,2412.5,3156.5,573.166667
25%,27.775,71.17375,3455.375,3905.5,2794.25,3589.0,600.0
50%,28.4,74.185,3867.75,4085.5,3246.25,3819.25,608.0
75%,29.1,75.31,4424.625,4832.0,4067.0,4369.5,621.5
max,32.305,93.525,8021.0,10433.0,5119.0,6285.0,677.0


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the Charlie dataset.
Charlie_dataset.describe()

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,CO2
count,2861.0,2861.0,2861.0,2861.0,2861.0,2861.0,2861.0
mean,29.039685,71.57465,5054.418968,5163.769253,4292.091926,3738.785564,611.916367
std,1.301354,5.351279,951.077221,924.096242,949.952667,644.314595,16.798683
min,23.93,60.935,3128.0,3210.5,2479.0,2381.0,573.166667
25%,28.69,68.715,4316.0,4358.0,3196.0,3199.5,600.0
50%,29.23,71.25,4982.0,5366.5,4471.5,3778.0,608.0
75%,29.57,73.975,5826.0,5693.5,4981.5,4214.5,622.0
max,32.72,90.55,8136.0,8141.0,6052.0,5105.0,677.0
