### Initializing the Notebook

In [5]:
import pandas as pd
import os

# Respect a `silent` flag that already exists in the global namespace;
# fall back to False (i.e. print/display the reports) when none was set.
if "silent" not in globals():
    silent = False


### Load the Data

In [6]:
from config import DATA_PATH

# Read the raw air-quality CSV from the configured data directory.
# os.path.join is used instead of manual "/" concatenation so the path is built
# correctly whether or not DATA_PATH ends with a separator.
df = pd.read_csv(os.path.join(DATA_PATH, "Air_Quality.csv"))

# Preview the first rows unless output has been silenced.
if not silent:
    display(df.head())

Unnamed: 0,Unique ID,Indicator ID,Name,Measure,Measure Info,Geo Type Name,Geo Join ID,Geo Place Name,Time Period,Start_Date,Data Value,Message
0,336867,375,Nitrogen dioxide (NO2),Mean,ppb,CD,407,Flushing and Whitestone (CD7),Winter 2014-15,12/01/2014,23.97,
1,336741,375,Nitrogen dioxide (NO2),Mean,ppb,CD,107,Upper West Side (CD7),Winter 2014-15,12/01/2014,27.42,
2,550157,375,Nitrogen dioxide (NO2),Mean,ppb,CD,414,Rockaway and Broad Channel (CD14),Annual Average 2017,01/01/2017,12.55,
3,412802,375,Nitrogen dioxide (NO2),Mean,ppb,CD,407,Flushing and Whitestone (CD7),Winter 2015-16,12/01/2015,22.63,
4,412803,375,Nitrogen dioxide (NO2),Mean,ppb,CD,407,Flushing and Whitestone (CD7),Summer 2016,06/01/2016,14.0,


# Data preparation

Data Explanation

The dataset provides air quality measurements for nitrogen dioxide (NO2) across different locations and time periods. Here's a brief overview of each column:

- **Unique ID**: A unique identifier for each record.
- **Indicator ID**: Numeric code for the environmental indicator; it identifies the indicator given in the **Name** column.
- **Name**: The indicator being measured (e.g., NO2).
- **Measure**: Type of measurement (e.g., "Mean").
- **Measure Info**: Unit of measurement (e.g., "ppb").
- **Geo Type Name**: Type of geographical area (e.g., "CD" for Community District).
- **Geo Join ID**: Geographic area identifier.
- **Geo Place Name**: Name of the area (e.g., "Flushing and Whitestone (CD7)").
- **Time Period**: The period when data was recorded (e.g., "Winter 2014-15").
- **Start_Date**: Start date of the data (MM/DD/YYYY).
- **Data Value**: The recorded NO2 value (e.g., 23.97 ppb).
- **Message**: Additional notes (if any).


In [None]:
# Find the columns in which every value is missing and remove them.
colsAllMissing = df.columns[df.isna().all()]
df = df.drop(columns=colsAllMissing)

# Report which columns were dropped, followed by the dtypes of the remaining
# columns before they are renamed and cast.
if not silent:
    print(f"The Data contains the following collumns which have no values: {colsAllMissing.values}")
    print(f"\nBefore the column name transformation and the casting the dataframe had the following structure:\n{df.dtypes.to_frame(name='Data Type')}")

def toCamelCase(s):
    """Convert a space-separated string to camelCase.

    The text is split on single spaces. The first piece is lowercased as a
    whole; every following piece is capitalized (first character upper case,
    the rest lower case) and appended without a separator.

    Example: "Geo Type Name" -> "geoTypeName".
    """
    # str.split(' ') always yields at least one element, so `first` is defined.
    first, *rest = s.split(' ')
    camel = first.lower()
    for piece in rest:
        camel += piece.capitalize()
    return camel

# Change column names to camel case (underscores are treated like spaces).
# NOTE: this step is not re-runnable on an already renamed frame: toCamelCase
# lowercases a name that contains no spaces entirely ('startDate' ->
# 'startdate'), so reload the data before executing this cell again.
df.columns = [toCamelCase(col.replace('_', ' ')) for col in df.columns]
if not silent:
    print(f"\nDataframe now has the columns: {df.columns.values}")

# cast values
# The dates are written as MM/DD/YYYY (e.g. 12/01/2014). Passing the format
# explicitly keeps pandas from guessing the day/month order and makes it raise
# on values that do not match instead of parsing them differently.
df['startDate'] = pd.to_datetime(df['startDate'], format='%m/%d/%Y')
df['name'] = df['name'].astype('str')
# Store the label-like columns as pandas categoricals.
df['measure'] = df['measure'].astype('category')
df['measureInfo'] = df['measureInfo'].astype('category')
df['geoTypeName'] = df['geoTypeName'].astype('category')

# Honour the `silent` flag here as well, like every other report in this cell.
if not silent:
    print(f"\nThe dataframe has the following structure:\n{df.dtypes.to_frame(name='Data Type')}")



The Data contains the following collumns which have no values: ['Message']
Before the column name transformation the dataframe had the following structure:
               Data Type
Unique ID          int64
Indicator ID       int64
Name              object
Measure           object
Measure Info      object
Geo Type Name     object
Geo Join ID        int64
Geo Place Name    object
Time Period       object
Start_Date        object
Data Value       float64
Dataframe now has the columns: ['uniqueId' 'indicatorId' 'name' 'measure' 'measureInfo' 'geoTypeName'
 'geoJoinId' 'geoPlaceName' 'timePeriod' 'startDate' 'dataValue']
The dataframe has the following structure:
                   Data Type
uniqueId               int64
indicatorId            int64
name                  object
measure             category
measureInfo         category
geoTypeName         category
geoJoinId              int64
geoPlaceName          object
timePeriod            object
startDate     datetime64[ns]
dataValue     