In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Load the ozone dataset; the relative path is resolved against the
# notebook's current working directory.
source_csv_path = 'updated_ozone.csv'
data = pd.read_csv(source_csv_path)

In [13]:
# Parse 'Date' into datetimes and make it the index (the time-based
# interpolation further down relies on a DatetimeIndex).
# The isinstance guard keeps this cell safe to re-run: once 'Date' has become
# the index it is no longer a column, so repeating the conversion would raise
# KeyError. On a fresh frame the guard is always true, so a genuinely missing
# 'Date' column still fails loudly.
if not isinstance(data.index, pd.DatetimeIndex):
    data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

In [14]:
# Column of interest: the ozone reading that the cleaning steps below operate on
column = 'Daily Max 8-hour Ozone Concentration'

# Fail fast with a clear message if the column is absent. The later steps index
# data[column] unconditionally, so silently skipping here would only defer the
# failure to a less obvious place.
if column not in data.columns:
    raise KeyError(f"Column {column!r} not found in the dataset")

# Round the readings to 3 decimal places
data[column] = data[column].round(3)

In [15]:
# Step 1: Blank out near-zero readings so they are handled as missing values
threshold = 0.001  # Readings must be strictly greater than this to be kept
is_valid_reading = data[column] > threshold
# Entries failing the check (existing NaNs also compare False) are replaced
data[column] = data[column].where(is_valid_reading, None)

In [16]:
# Step 2: Fill missing readings by interpolating against the index
# (method='time' spaces the estimates by the time between observations).
# No `limit` is passed, so the length of a filled gap is not capped.
ozone_series = data[column]
data[column] = ozone_series.interpolate(method='time')

In [17]:
# Step 3: Fall back to calendar-month averages for anything interpolation left empty
if data[column].isna().any():
    # For every row, the mean of all readings that share its calendar month
    # (grouping is on the index month number, so all years are pooled)
    monthly_avg = data[column].groupby(data.index.month).transform('mean')
    # Assign the result back instead of calling fillna(inplace=True) on
    # data[column]: the in-place form operates on a temporary Series, which
    # raises a chained-assignment warning in pandas 2.x and leaves `data`
    # unchanged under Copy-on-Write.
    data[column] = data[column].fillna(monthly_avg)

In [18]:
# Step 4: Discard rows whose ozone reading is still missing
required_columns = [column]
data = data.dropna(subset=required_columns)

In [20]:
# Number of missing values in each column (sanity check after cleaning)
data.isna().sum()

Source                                  0
Site ID                                 0
POC                                     0
Daily Max 8-hour Ozone Concentration    0
Units                                   0
Daily AQI Value                         0
Local Site Name                         0
Daily Obs Count                         0
Percent Complete                        0
AQS Parameter Code                      0
AQS Parameter Description               0
Method Code                             0
CBSA Code                               0
CBSA Name                               0
State FIPS Code                         0
State                                   0
County FIPS Code                        0
County                                  0
Site Latitude                           0
Site Longitude                          0
dtype: int64

In [21]:
# Persist the cleaned frame; the index is written out as the first CSV column
cleaned_file_path = 'cleaned_ozone_data.csv'
data.to_csv(cleaned_file_path, index=True)

print(f"Cleaned dataset saved to: {cleaned_file_path}")

Cleaned dataset saved to: cleaned_ozone_data.csv


In [22]:
# Reload the cleaned file to check the round trip. The path variable from the
# save step is reused rather than repeating the literal, and 'Date' is parsed
# so it comes back as datetimes instead of plain strings (it stays a regular
# column, not the index).
df1 = pd.read_csv(cleaned_file_path, parse_dates=['Date'])
df1

Unnamed: 0,Date,Source,Site ID,POC,Daily Max 8-hour Ozone Concentration,Units,Daily AQI Value,Local Site Name,Daily Obs Count,Percent Complete,...,AQS Parameter Description,Method Code,CBSA Code,CBSA Name,State FIPS Code,State,County FIPS Code,County,Site Latitude,Site Longitude
0,2018-03-01,AQS,371010002.0,1.0,0.037,ppm,34.0,West Johnston Co.,17.0,100.0,...,Ozone,47.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
1,2018-03-02,AQS,371010002.0,1.0,0.046,ppm,43.0,West Johnston Co.,17.0,100.0,...,Ozone,47.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
2,2018-03-03,AQS,371010002.0,1.0,0.047,ppm,44.0,West Johnston Co.,17.0,100.0,...,Ozone,47.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
3,2018-03-04,AQS,371010002.0,1.0,0.045,ppm,42.0,West Johnston Co.,17.0,100.0,...,Ozone,47.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
4,2018-03-05,AQS,371010002.0,1.0,0.046,ppm,43.0,West Johnston Co.,17.0,100.0,...,Ozone,47.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2418,2024-10-13,AirNow,371010002.0,1.0,0.050,ppm,46.0,West Johnston Co.,24.0,100.0,...,Ozone,19.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
2419,2024-10-14,AirNow,371010002.0,1.0,0.050,ppm,46.0,West Johnston Co.,24.0,100.0,...,Ozone,19.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
2420,2024-10-15,AirNow,371010002.0,1.0,0.036,ppm,33.0,West Johnston Co.,24.0,100.0,...,Ozone,19.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
2421,2024-10-16,AirNow,371010002.0,1.0,0.031,ppm,29.0,West Johnston Co.,24.0,100.0,...,Ozone,19.0,39580.0,"Raleigh, NC",37.0,North Carolina,101.0,Johnston,35.59095,-78.4622
