# 2. Data Explorations

## Import libraries and load datasets

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load data
# Both CSV files are read from ../data, resolved relative to the notebook's
# working directory. Each read produces an independent raw DataFrame.
df_weather = pd.read_csv('../data/hcmc_weather_data.csv')
df_aq = pd.read_csv('../data/hcmc_air_quality_data.csv')

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which would
# append a stray "None" line to the output).
print("Weather Data Info:")
df_weather.info()

Weather Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date_time                768 non-null    object 
 1   temperature_2m           768 non-null    float64
 2   relative_humidity_2m     768 non-null    float64
 3   dew_point_2m             768 non-null    float64
 4   apparent_temperature     768 non-null    float64
 5   precipitation            768 non-null    float64
 6   cloud_cover              768 non-null    float64
 7   vapour_pressure_deficit  768 non-null    float64
 8   wind_speed_10m           768 non-null    float64
 9   wind_direction_10m       768 non-null    float64
 10  weather_code             768 non-null    float64
dtypes: float64(10), object(1)
memory usage: 66.1+ KB
None


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly instead of being wrapped in print() (which would
# append a stray "None" line to the output).
print("\nAir Quality Data Info:")
df_aq.info()


Air Quality Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date_time         768 non-null    object 
 1   pm10              768 non-null    float64
 2   pm2_5             768 non-null    float64
 3   carbon_monoxide   768 non-null    float64
 4   nitrogen_dioxide  768 non-null    float64
 5   sulphur_dioxide   768 non-null    float64
 6   ozone             768 non-null    float64
 7   us_aqi            768 non-null    float64
dtypes: float64(7), object(1)
memory usage: 48.1+ KB
None


In [5]:
# Preview the first rows of the raw weather data.
df_weather.head()

Unnamed: 0,date_time,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,cloud_cover,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,weather_code
0,2024-10-01 00:00:00+00:00,26.661001,92.32073,25.311,32.43071,0.0,96.0,0.268107,8.049845,206.56499,3.0
1,2024-10-01 01:00:00+00:00,28.211,85.82722,25.611,34.623367,0.0,96.0,0.541705,5.004518,232.3057,3.0
2,2024-10-01 02:00:00+00:00,29.511,79.128654,25.511,35.912186,0.0,96.0,0.86001,4.863332,231.009,3.0
3,2024-10-01 03:00:00+00:00,30.761,70.64291,24.811,37.899853,0.1,97.0,1.299534,5.081613,247.06787,51.0
4,2024-10-01 04:00:00+00:00,31.511,65.69293,24.311,38.29917,0.1,100.0,1.584881,7.928178,267.3975,51.0


In [6]:
# Preview the first rows of the raw air-quality data.
df_aq.head()

Unnamed: 0,date_time,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,us_aqi
0,2024-10-01 00:00:00+00:00,32.8,22.1,443.0,29.9,46.1,16.0,77.73936
1,2024-10-01 01:00:00+00:00,30.5,20.2,398.0,26.1,41.3,26.0,77.60638
2,2024-10-01 02:00:00+00:00,27.7,18.3,335.0,20.8,34.6,40.0,77.85461
3,2024-10-01 03:00:00+00:00,28.6,19.1,286.0,16.4,29.3,57.0,77.969864
4,2024-10-01 04:00:00+00:00,31.5,20.9,265.0,13.5,26.7,79.0,78.01418


## 2.2. Data Preprocessing

### Data combination

In [7]:
# Combine data into one dataframe
# An outer join on the shared `date_time` column keeps rows that appear in
# either source; the result is then ordered by `date_time`.
df = (
    df_weather
    .merge(df_aq, on='date_time', how='outer')
    .sort_values(by='date_time')
)
df

Unnamed: 0,date_time,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,cloud_cover,vapour_pressure_deficit,wind_speed_10m,wind_direction_10m,weather_code,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,us_aqi
0,2024-10-01 00:00:00+00:00,26.661001,92.320730,25.311000,32.430710,0.0,96.0,0.268107,8.049845,206.564990,3.0,32.8,22.1,443.0,29.9,46.1,16.0,77.739360
1,2024-10-01 01:00:00+00:00,28.211000,85.827220,25.611000,34.623367,0.0,96.0,0.541705,5.004518,232.305700,3.0,30.5,20.2,398.0,26.1,41.3,26.0,77.606380
2,2024-10-01 02:00:00+00:00,29.511000,79.128654,25.511000,35.912186,0.0,96.0,0.860010,4.863332,231.009000,3.0,27.7,18.3,335.0,20.8,34.6,40.0,77.854610
3,2024-10-01 03:00:00+00:00,30.761000,70.642910,24.811000,37.899853,0.1,97.0,1.299534,5.081613,247.067870,51.0,28.6,19.1,286.0,16.4,29.3,57.0,77.969864
4,2024-10-01 04:00:00+00:00,31.511000,65.692930,24.311000,38.299170,0.1,100.0,1.584881,7.928178,267.397500,51.0,31.5,20.9,265.0,13.5,26.7,79.0,78.014180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,2024-11-01 19:00:00+00:00,24.711000,95.318750,23.911001,29.149628,0.2,100.0,0.145639,11.032987,123.690100,51.0,46.2,32.2,520.0,33.9,32.2,55.0,90.992900
764,2024-11-01 20:00:00+00:00,24.361000,95.594310,23.611000,28.979694,0.1,100.0,0.134234,8.538150,145.304780,51.0,36.7,25.4,506.0,27.3,25.3,68.0,91.755320
765,2024-11-01 21:00:00+00:00,23.961000,96.742645,23.411001,28.327835,0.0,98.0,0.096899,9.437563,124.902565,3.0,29.0,20.0,558.0,22.6,20.1,77.0,92.012405
766,2024-11-01 22:00:00+00:00,23.861000,96.157630,23.211000,28.166320,0.0,92.0,0.113621,9.037721,134.193160,3.0,25.0,17.2,725.0,22.0,18.0,75.0,91.781920


The following tasks need to be done in this phase:
1. Since every column has `768 non-null entries` out of 768 rows, there is no need to handle missing values.
2. A numeric `weather_code` is not meaningful to the reader, who cannot tell which type of weather it stands for, so we will map it to a descriptive `weather_status`.
3. Detect and handle invalid values.
    * `temperature_2m`, `apparent_temperature` at HCMC should be between 10°C and 50°C
    * `relative_humidity_2m` should be between 0% and 100%
    * `precipitation`, `cloud_cover`, `wind_speed_10m`, `pm10`, `pm2_5`, `carbon_monoxide`, `nitrogen_dioxide`, `sulphur_dioxide` and `ozone` should not be negative
4. Check the continuity of time.
5. Detect and handle outliers.
6. Check the validity of the relationship between the variables.

### Map `weather_code` to `weather_status`

In [8]:
# WMO Weather interpretation codes (WW) mapped to readable labels.
# Defined once at module level so the table is not rebuilt on every call.
_WMO_WEATHER_CODES = {
    0: 'Clear Sky',
    1: 'Mainly Clear',
    2: 'Partly Cloudy',
    3: 'Overcast',
    45: 'Foggy',
    48: 'Depositing Rime Fog',
    51: 'Light Drizzle',
    53: 'Moderate Drizzle',
    55: 'Dense Drizzle',
    56: 'Light Freezing Drizzle',
    57: 'Dense Freezing Drizzle',
    61: 'Light Rain',
    63: 'Moderate Rain',
    65: 'Heavy Rain',
    66: 'Light Freezing Rain',
    67: 'Heavy Freezing Rain',
    71: 'Light Snow',
    73: 'Moderate Snow',
    75: 'Heavy Snow',
    77: 'Snow Grains',
    80: 'Light Rain Showers',
    81: 'Moderate Rain Showers',
    82: 'Violent Rain Showers',
    85: 'Light Snow Showers',
    86: 'Heavy Snow Showers',
    # Thunderstorm codes were previously missing and fell through to 'Unknown'.
    95: 'Thunderstorm',
    96: 'Thunderstorm with Slight Hail',
    99: 'Thunderstorm with Heavy Hail',
}


def map_weather_code(code):
    '''
    Mapping weather code to weather status based on WMO Weather interpretation codes (WW)
    
    -------------
    Parameters:
        code (int or float): Weather code. Whole-number floats such as 3.0
            match the integer keys because they compare and hash equal.

    -------------
    Returns:
        str: Weather status label, or 'Unknown' when the code is not in the
            table (this includes NaN).

    '''
    return _WMO_WEATHER_CODES.get(code, 'Unknown')

# Map weather code to weather description
# Add the readable `weather_status` column and retire the numeric
# `weather_code` column in a single chained expression.
df = (
    df
    .assign(weather_status=df['weather_code'].map(map_weather_code))
    .drop(columns=['weather_code'])
)

# Check result
print("Unique weather statuses: ", df['weather_status'].unique(), sep='')

Unique weather statuses: ['Overcast' 'Light Drizzle' 'Partly Cloudy' 'Moderate Rain'
 'Moderate Drizzle' 'Light Rain' 'Dense Drizzle' 'Mainly Clear'
 'Clear Sky' 'Heavy Rain']


### Detect invalid values

In [9]:
# Range rules as (column, lowest valid value, highest valid value):
# `temperature_2m`, `apparent_temperature` at HCMC should be between 10°C and 50°C,
# `relative_humidity_2m` should be between 0% and 100%.
bounded_columns = [
    ('temperature_2m', 10, 50),
    ('apparent_temperature', 10, 50),
    ('relative_humidity_2m', 0, 100),
]

invalid_values = False

for column_name, lowest, highest in bounded_columns:
    column_values = df[column_name]
    if column_values.min() < lowest or column_values.max() > highest:
        print(f"Invalid {column_name} values")
        invalid_values = True

# `precipitation`, `cloud_cover`, `wind_speed_10m`, `pm10`, `pm2_5`, `carbon_monoxide`, 
# `nitrogen_dioxide`, `sulphur_dioxide` and `ozone` should not be negative
non_negative_columns = ['precipitation', 'cloud_cover', 'wind_speed_10m', 'pm10', 'pm2_5', 
                        'carbon_monoxide', 'nitrogen_dioxide', 'sulphur_dioxide', 'ozone']
has_negative = (df[non_negative_columns] < 0).any().any()
if has_negative:
    print("Invalid negative values")
    invalid_values = True

if not invalid_values:
    print("Valid values")

Valid values


### Check the continuity of time

In [10]:
# Parse the timestamps first, so that min/max and the hourly range below are
# computed on real datetimes rather than on the raw strings (string min/max
# is lexicographic and only matches time order for ISO-formatted text).
df['date_time'] = pd.to_datetime(df['date_time'], utc=True)

# Every hour between the first and last observation is expected to be present.
full_range = pd.date_range(start=df['date_time'].min(), end=df['date_time'].max(), freq='h')
missing_times = full_range[~full_range.isin(df['date_time'])]

if len(missing_times) == 0:
    print("No missing time")
else:
    print("Missing times:")
    print(missing_times)

No missing time


### Detect and handle outliers

In [11]:
def analyze_outliers(df, columns):
    '''
    Summarizes the outliers of each column using the IQR method.

    A value counts as an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, where Q1/Q3 are the 25th/75th percentiles of the column.
    
    -------------
    Parameters:
        df (pandas.DataFrame): The DataFrame containing the data to analyze.
        columns (list of str): The list of column names to check for outliers.
    
    -------------
    Returns:
        dict: A dictionary keyed by column name. Each value is a dict with:
            'total_outliers' (int): number of outlier values,
            'percentage' (float): share of rows that are outliers, in percent
                (0.0 when the DataFrame has no rows),
            'min' / 'max': smallest / largest outlier value, or None when
                the column has no outliers.
    '''
    outlier_info = {}
    n_rows = len(df)

    for column in columns:
        values = df[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers = values[(values < lower_bound) | (values > upper_bound)]
        n_outliers = len(outliers)

        outlier_info[column] = {
            'total_outliers': n_outliers,
            # Guard against ZeroDivisionError when the DataFrame is empty.
            'percentage': (n_outliers / n_rows) * 100 if n_rows else 0.0,
            'min': outliers.min() if n_outliers > 0 else None,
            'max': outliers.max() if n_outliers > 0 else None
        }
    
    return outlier_info

# Analyze outliers
# Restrict the analysis to the numeric columns of the combined frame.
df_numeric = df.select_dtypes(include=[np.number])
outlier_info = analyze_outliers(df_numeric, df_numeric.columns)

print("Outlier Information:")
for column, info in outlier_info.items():
    # One report per column, followed by a blank separator line.
    report_lines = [
        f"\tOutliers in {column}:",
        f"\tTotal outliers: {info['total_outliers']} ({info['percentage']:.2f}%)",
        f"\tMin: {info['min']}",
        f"\tMax: {info['max']}",
        "",
    ]
    print("\n".join(report_lines))

Outlier Information:
	Outliers in temperature_2m:
	Total outliers: 0 (0.00%)
	Min: None
	Max: None

	Outliers in relative_humidity_2m:
	Total outliers: 3 (0.39%)
	Min: 54.9907
	Max: 56.56801

	Outliers in dew_point_2m:
	Total outliers: 24 (3.12%)
	Min: 21.861
	Max: 26.061

	Outliers in apparent_temperature:
	Total outliers: 19 (2.47%)
	Min: 38.869396
	Max: 41.303314

	Outliers in precipitation:
	Total outliers: 112 (14.58%)
	Min: 0.6
	Max: 13.4

	Outliers in cloud_cover:
	Total outliers: 148 (19.27%)
	Min: 1.0
	Max: 87.0

	Outliers in vapour_pressure_deficit:
	Total outliers: 17 (2.21%)
	Min: 1.709928
	Max: 2.1882005

	Outliers in wind_speed_10m:
	Total outliers: 11 (1.43%)
	Min: 16.029099
	Max: 21.599998

	Outliers in wind_direction_10m:
	Total outliers: 0 (0.00%)
	Min: None
	Max: None

	Outliers in pm10:
	Total outliers: 15 (1.95%)
	Min: 81.7
	Max: 101.0

	Outliers in pm2_5:
	Total outliers: 15 (1.95%)
	Min: 56.0
	Max: 69.8

	Outliers in carbon_monoxide:
	Total outliers: 46 (5.99%)
	

Based on the outliers above, I suggest splitting the columns into 2 groups:
* Kept outliers:
    * `precipitation`   : High precipitation is a real natural phenomenon
    * `cloud_cover`     : Cloud cover of 95-100% is normal
    * `wind_speed_10m`  : Strong winds are a real weather phenomenon
    * `pm10`            : High air pollution is a real phenomenon
    * `pm2_5`           : High air pollution is a real phenomenon
    * `carbon_monoxide` : High CO levels may be due to actual pollution
    * `nitrogen_dioxide`: High NO2 levels may be due to actual pollution
    * `ozone`           : High ozone levels may be due to actual weather conditions
* Handled outliers:
    * the others

In [12]:
def handle_outliers(df, columns_handle, method='iqr'):
    '''
    Handles outliers in specified columns of a DataFrame by clipping values
    to a lower/upper bound. The input DataFrame is not modified.

    -------------
    Parameters:
        df (pandas.DataFrame): The input DataFrame.
        columns_handle (list or dict): Columns to handle outliers for.
                    * list-like of column names: every column uses `method`.
                    * dict mapping column name -> method: each column uses
                      its own method and `method` is ignored.
        method (str): The method to use for handling outliers. 
                    Options are 'iqr' (clip to Q1 - 1.5 * IQR and
                    Q3 + 1.5 * IQR) and 'percentile' (clip to the 5th and
                    95th percentiles). 
                    Default is 'iqr'.
                
    -------------
    Returns:
        pandas.DataFrame: A copy of `df` with outliers clipped in the specified columns.

    -------------
    Raises:
        ValueError: If a requested method is neither 'iqr' nor 'percentile'.
    '''
    supported_methods = ('iqr', 'percentile')

    # Normalize both accepted input shapes to a column -> method mapping.
    if isinstance(columns_handle, dict):
        column_methods = dict(columns_handle)
    else:
        column_methods = {column: method for column in columns_handle}

    # Fail loudly up front: an unknown method used to be a silent no-op.
    for column, column_method in column_methods.items():
        if column_method not in supported_methods:
            raise ValueError(
                f"Unsupported method '{column_method}' for column '{column}'; "
                f"expected one of {supported_methods}"
            )

    df_cleaned = df.copy()

    for column, column_method in column_methods.items():
        values = df[column]

        if column_method == 'iqr':
            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
        else:
            # using percentile 5 and 95
            lower_bound = values.quantile(0.05)
            upper_bound = values.quantile(0.95)

        df_cleaned[column] = values.clip(lower_bound, upper_bound)

    return df_cleaned

# Handle outliers
# Each column is paired with the clipping method that should be applied to it.
columns_to_handle = {
    'temperature_2m': 'percentile', 
    'relative_humidity_2m': 'iqr',
    'dew_point_2m': 'iqr',
    'apparent_temperature': 'percentile',
    'vapour_pressure_deficit': 'iqr',
    'sulphur_dioxide': 'iqr',
    'us_aqi': 'iqr'
}

# Apply each column's own method. Calling handle_outliers once with
# method='iqr' for all keys would silently ignore the per-column choices
# declared above. Columns are clipped independently, so handling them one at
# a time yields the same bounds as handling them together.
df_cleaned = df.copy()
for column, column_method in columns_to_handle.items():
    df_cleaned = handle_outliers(df_cleaned, [column], method=column_method)

### Check the validity of the relationship between the variables

In [13]:
def check_data_relationships(df):
    '''
    Validates physical consistency rules between related columns.

    Each rule is evaluated only when all of the columns it needs are present.

    -------------
    Parameters:
        df (pandas.DataFrame): The input DataFrame.

    -------------
    Returns:
        list: Messages describing every violated rule (empty when none are violated).
    '''
    available = set(df.columns)
    issues = []

    # Rule 1: dew point must not exceed the air temperature.
    if {'temperature_2m', 'dew_point_2m'} <= available:
        dew_above_air = df['dew_point_2m'].gt(df['temperature_2m'])
        if dew_above_air.any():
            issues.append("Dew point temperature is greater than air temperature")

    # Rule 2: precipitation requires some cloud cover.
    if {'cloud_cover', 'precipitation'} <= available:
        rain_without_cloud = df['cloud_cover'].eq(0) & df['precipitation'].gt(0)
        if rain_without_cloud.any():
            issues.append("Cloud cover is 0 but there is precipitation")

    return issues

# Check data relationships
# Run the checks on the outlier-handled frame; an empty list means no rule
# was violated.
relationship_issues = check_data_relationships(df_cleaned)
print("Relationship Issues:", relationship_issues)

Relationship Issues: []


### Check constraints of project and save to file

In [15]:
# Check constraints
# The data must have at least 5 attributes and at least 1000 records.
# Validate the cleaned frame (the one that gets saved) and report its size
# before asserting, so that a failure shows the actual numbers.
n_records, n_attributes = df_cleaned.shape
print(f"Number of attributes: {n_attributes}")
print(f"Number of records: {n_records}")
assert n_attributes >= 5 and n_records >= 1000, "Data does not meet the constraints"
print("Data meets the constraints")

AssertionError: Data does not meet the constraints

### Change timezone to UTC+7 (HoChiMinh)

In [16]:
def change_timezone(data, column='date_time', timezone='Asia/Ho_Chi_Minh'):
    '''
    Converts a timestamp column to naive local time in the given timezone.

    Values are parsed as UTC (naive inputs are treated as UTC), converted to
    `timezone`, and then stripped of timezone information. The input
    DataFrame is left untouched; a new DataFrame is returned.

    -------------
    Parameters:
        data (pandas.DataFrame): The input DataFrame.
        column (str): Name of the timestamp column. Default is 'date_time'.
        timezone (str): Target timezone name. Default is 'Asia/Ho_Chi_Minh'.

    -------------
    Returns:
        pandas.DataFrame: A copy of `data` whose `column` holds timezone-naive
            local timestamps.
    '''
    local_time = (
        pd.to_datetime(data[column], utc=True)
        .dt.tz_convert(timezone)
        .dt.tz_localize(None)
    )
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    return data.assign(**{column: local_time})

# NOTE(review): `df_cleaned` is rebound to its own converted version, so this
# cell is not safe to re-run on its own. change_timezone parses with
# utc=True, so the already-converted naive local times would presumably be
# read as UTC and shifted again. Re-run from the outlier-handling cell.
df_cleaned = change_timezone(df_cleaned)

### Save cleaned data

In [None]:
# Save cleaned data
# index=False keeps the DataFrame index out of the written CSV.
df_cleaned.to_csv('../data/clean_hcmc_waq.csv', index=False)