## Початкове завантаження бібліотек та набору даних

In [1]:
# Завантажуємо початкові модулі
import pandas as pd
import numpy as np

In [2]:
# Read the entire available data set from the CSV file into a DataFrame
air_data = pd.read_csv("city_day.csv")

In [4]:
# Display the data set
air_data

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.00,0.02,0.00,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.50,3.77,,
2,Ahmedabad,2015-01-03,,,17.40,19.30,29.70,,17.40,29.07,30.70,6.80,16.40,2.25,,
3,Ahmedabad,2015-01-04,,,1.70,18.48,17.97,,1.70,18.59,36.08,4.43,10.14,1.00,,
4,Ahmedabad,2015-01-05,,,22.10,21.42,37.76,,22.10,39.33,39.31,7.01,18.89,2.78,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


## Перше ознайомлення з даними

In [5]:
# The first five rows of "city_day.csv" already show several missing values
air_data.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


## Визначення кількості відсутніх даних

In [6]:
# Count, column by column, how many entries are missing
missing_values_count = air_data.isna().sum()
# Show the per-column totals
missing_values_count

City              0
Date              0
PM2.5          4598
PM10          11140
NO             3582
NO2            3585
NOx            4185
NH3           10328
CO             2059
SO2            3854
O3             4022
Benzene        5623
Toluene        8041
Xylene        18109
AQI            4681
AQI_Bucket     4681
dtype: int64

In [7]:
# Total number of cells in the table (rows * columns).
# np.prod is used because the np.product alias was removed in NumPy 2.0.
total_cells = np.prod(air_data.shape)
# Total number of missing cells, summed over all columns
total_missing = missing_values_count.sum()
# Share of missing cells, expressed in percent
percent_missing = (total_missing / total_cells) * 100
# Display the percentage of missing cells
percent_missing

18.727777589651552

## Визначення причини відсутності елементів даних

In [8]:
# Pick the rows in which every column other than 'City' and 'Date' is missing
missing_data_rows = air_data.loc[
    air_data.drop(columns=['City', 'Date']).isna().all(axis=1)
]
# Some dates carry no measurements at all, which suggests that no data
# was collected on those days
missing_data_rows

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
56,Ahmedabad,2015-02-26,,,,,,,,,,,,,,
57,Ahmedabad,2015-02-27,,,,,,,,,,,,,,
58,Ahmedabad,2015-02-28,,,,,,,,,,,,,,
64,Ahmedabad,2015-03-06,,,,,,,,,,,,,,
262,Ahmedabad,2015-09-20,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29226,Visakhapatnam,2019-09-01,,,,,,,,,,,,,,
29227,Visakhapatnam,2019-09-02,,,,,,,,,,,,,,
29228,Visakhapatnam,2019-09-03,,,,,,,,,,,,,,
29229,Visakhapatnam,2019-09-04,,,,,,,,,,,,,,


## Обробка пропущених даних

In [9]:
# Drop the rows that have no value in any column after the first two
# ('City' and 'Date')
air_data_cleaned = air_data.dropna(subset=air_data.columns[2:], how='all')

# Report the original row count and how many rows were dropped
print("Кількість рядків: %d \n" % len(air_data))
print("Кількість видалених рядків: %d" % abs(len(air_data_cleaned) - len(air_data)))

Кількість рядків: 29531 

Кількість видалених рядків: 1374


In [10]:
# Fill every remaining missing value with zero.
# NOTE: the fill is applied to all columns, so the text column AQI_Bucket
# receives the integer 0 as well.
air_data_cleaned = air_data_cleaned.fillna(0)
air_data_cleaned

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,0.00,0.00,0.92,18.22,17.15,0.00,0.92,27.64,133.36,0.00,0.02,0.00,0.0,0
1,Ahmedabad,2015-01-02,0.00,0.00,0.97,15.69,16.46,0.00,0.97,24.55,34.06,3.68,5.50,3.77,0.0,0
2,Ahmedabad,2015-01-03,0.00,0.00,17.40,19.30,29.70,0.00,17.40,29.07,30.70,6.80,16.40,2.25,0.0,0
3,Ahmedabad,2015-01-04,0.00,0.00,1.70,18.48,17.97,0.00,1.70,18.59,36.08,4.43,10.14,1.00,0.0,0
4,Ahmedabad,2015-01-05,0.00,0.00,22.10,21.42,37.76,0.00,22.10,39.33,39.31,7.01,18.89,2.78,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [11]:
# Порахуємо AQI (https://www.kaggle.com/rohanrao/calculating-aqi-air-quality-index)
def get_PM25_subindex(x):
    """Return the AQI sub-index for a PM2.5 reading ``x``.

    The mapping is piecewise linear with breakpoints at 30, 60, 90, 120
    and 250; readings above 250 continue on the slope of the last segment.
    A reading that satisfies none of the comparisons (NaN) maps to 0.
    """
    if x <= 30:
        return x * 50 / 30
    if x <= 60:
        return 50 + (x - 30) * 50 / 30
    if x <= 90:
        return 100 + (x - 60) * 100 / 30
    if x <= 120:
        return 200 + (x - 90) * 100 / 30
    if x <= 250:
        return 300 + (x - 120) * 100 / 130
    if x > 250:
        return 400 + (x - 250) * 100 / 130
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0
    
def get_PM10_subindex(x):
    """Return the AQI sub-index for a PM10 reading ``x``.

    Readings up to 100 are returned unchanged; above that the mapping is
    piecewise linear with breakpoints at 250, 350 and 430, and readings
    above 430 continue on the slope of the last segment.
    A reading that satisfies none of the comparisons (NaN) maps to 0.
    """
    if x <= 100:
        # The sub-index equals the reading itself over the whole 0-100 range.
        return x
    if x <= 250:
        return 100 + (x - 100) * 100 / 150
    if x <= 350:
        return 200 + (x - 250)
    if x <= 430:
        return 300 + (x - 350) * 100 / 80
    if x > 430:
        return 400 + (x - 430) * 100 / 80
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0

def get_SO2_subindex(x):
    """Return the AQI sub-index for an SO2 reading ``x``.

    The mapping is piecewise linear with breakpoints at 40, 80, 380, 800
    and 1600; readings above 1600 continue on the slope of the last
    segment.  A reading that satisfies none of the comparisons (NaN)
    maps to 0.
    """
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 380:
        return 100 + (x - 80) * 100 / 300
    if x <= 800:
        return 200 + (x - 380) * 100 / 420
    if x <= 1600:
        return 300 + (x - 800) * 100 / 800
    if x > 1600:
        return 400 + (x - 1600) * 100 / 800
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0
    
def get_NOx_subindex(x):
    """Return the AQI sub-index for a NOx reading ``x``.

    The mapping is piecewise linear with breakpoints at 40, 80, 180, 280
    and 400; readings above 400 continue on the slope of the last segment.
    A reading that satisfies none of the comparisons (NaN) maps to 0.
    """
    if x <= 40:
        return x * 50 / 40
    if x <= 80:
        return 50 + (x - 40) * 50 / 40
    if x <= 180:
        return 100 + (x - 80) * 100 / 100
    if x <= 280:
        return 200 + (x - 180) * 100 / 100
    if x <= 400:
        return 300 + (x - 280) * 100 / 120
    if x > 400:
        return 400 + (x - 400) * 100 / 120
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0

def get_NH3_subindex(x):
    """Return the AQI sub-index for an NH3 reading ``x``.

    The mapping is piecewise linear with breakpoints at 200, 400, 800,
    1200 and 1800; readings above 1800 continue on the slope of the last
    segment.  A reading that satisfies none of the comparisons (NaN)
    maps to 0.
    """
    if x <= 200:
        return x * 50 / 200
    if x <= 400:
        return 50 + (x - 200) * 50 / 200
    if x <= 800:
        return 100 + (x - 400) * 100 / 400
    if x <= 1200:
        return 200 + (x - 800) * 100 / 400
    if x <= 1800:
        return 300 + (x - 1200) * 100 / 600
    if x > 1800:
        return 400 + (x - 1800) * 100 / 600
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0
    
def get_CO_subindex(x):
    """Return the AQI sub-index for a CO reading ``x``.

    The mapping is piecewise linear with breakpoints at 1, 2, 10, 17 and
    34; readings above 34 continue on the slope of the last segment.
    A reading that satisfies none of the comparisons (NaN) maps to 0.
    """
    if x <= 1:
        return x * 50 / 1
    if x <= 2:
        return 50 + (x - 1) * 50 / 1
    if x <= 10:
        return 100 + (x - 2) * 100 / 8
    if x <= 17:
        return 200 + (x - 10) * 100 / 7
    if x <= 34:
        return 300 + (x - 17) * 100 / 17
    if x > 34:
        return 400 + (x - 34) * 100 / 17
    # Reached only when every comparison above is false, i.e. x is NaN.
    return 0
    
def get_O3_subindex(x):
    """Return the AQI sub-index for an O3 reading ``x``.

    The mapping is piecewise linear with breakpoints at 50, 100, 168, 208
    and 748; readings above 748 continue on the slope of the last segment.
    A reading that satisfies none of the comparisons (NaN) maps to 0.
    """
    if x <= 50:
        return x * 50 / 50
    elif x <= 100:
        return 50 + (x - 50) * 50 / 50
    elif x <= 168:
        return 100 + (x - 100) * 100 / 68
    elif x <= 208:
        return 200 + (x - 168) * 100 / 40
    elif x <= 748:
        return 300 + (x - 208) * 100 / 539
    elif x > 748:
        # Offset from this segment's own lower breakpoint (748).  The
        # previous code subtracted 400, which made the sub-index jump by
        # roughly 65 points as soon as x crossed 748.
        return 400 + (x - 748) * 100 / 539
    else:
        # Every comparison is false only for NaN.
        return 0

def get_AQI_bucket(x):
    """Return the AQI category label for the index value ``x``.

    Upper bounds are inclusive: up to 50 is "Good", up to 100
    "Satisfactory", up to 200 "Moderate", up to 300 "Poor", up to 400
    "Very Poor" and anything above 400 "Severe".  A value that satisfies
    none of the comparisons (NaN) yields NaN.
    """
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Satisfactory"
    elif x <= 200:
        return "Moderate"
    elif x <= 300:
        return "Poor"
    elif x <= 400:
        return "Very Poor"
    elif x > 400:
        return "Severe"
    else:
        # np.nan is the supported spelling; the np.NaN alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        return np.nan

# Build one sub-index column per pollutant from the cleaned measurements
aqi_bucket_data = pd.DataFrame()
aqi_bucket_data["PM2.5_SubIndex"] = air_data_cleaned["PM2.5"].apply(get_PM25_subindex)
aqi_bucket_data["PM10_SubIndex"] = air_data_cleaned["PM10"].apply(get_PM10_subindex)
aqi_bucket_data["SO2_SubIndex"] = air_data_cleaned["SO2"].apply(get_SO2_subindex)
aqi_bucket_data["NOx_SubIndex"] = air_data_cleaned["NOx"].apply(get_NOx_subindex)
aqi_bucket_data["NH3_SubIndex"] = air_data_cleaned["NH3"].apply(get_NH3_subindex)
aqi_bucket_data["CO_SubIndex"] = air_data_cleaned["CO"].apply(get_CO_subindex)
aqi_bucket_data["O3_SubIndex"] = air_data_cleaned["O3"].apply(get_O3_subindex)
# Checks = number of sub-indices in the row that are strictly positive
aqi_bucket_data["Checks"] = (
    aqi_bucket_data.filter(like="_SubIndex").gt(0).astype(int).sum(axis=1)
)
# The calculated AQI is the largest sub-index of the row, rounded
aqi_bucket_data["AQI_calculated"] = (
    aqi_bucket_data.filter(like="_SubIndex").max(axis=1).round()
)
# Translate the calculated AQI into its category label
aqi_bucket_data["AQI_bucket_calculated"] = aqi_bucket_data["AQI_calculated"].apply(get_AQI_bucket)
aqi_bucket_data.head(10)

Unnamed: 0,PM2.5_SubIndex,PM10_SubIndex,SO2_SubIndex,NOx_SubIndex,NH3_SubIndex,CO_SubIndex,O3_SubIndex,Checks,AQI_calculated,AQI_bucket_calculated
0,0.0,0.0,34.55,21.4375,0.0,46.0,149.058824,4,149.0,Moderate
1,0.0,0.0,30.6875,20.575,0.0,48.5,34.06,4,48.0,Good
2,0.0,0.0,36.3375,37.125,0.0,302.352941,30.7,4,302.0,Very Poor
3,0.0,0.0,23.2375,22.4625,0.0,85.0,36.08,4,85.0,Satisfactory
4,0.0,0.0,49.1625,47.2,0.0,330.0,39.31,4,330.0,Very Poor
5,0.0,0.0,57.2,101.5,0.0,467.117647,46.51,4,467.0,Severe
6,0.0,0.0,40.35,150.77,0.0,859.764706,33.47,4,860.0,Severe
7,0.0,0.0,48.175,116.75,0.0,675.705882,31.89,4,676.0,Severe
8,0.0,0.0,73.35,60.0,0.0,371.529412,25.75,4,372.0,Very Poor
9,0.0,0.0,10.3625,0.0,0.0,0.0,4.55,2,10.0,Good


In [12]:
# Replace AQI entries equal to 0 with the calculated AQI
air_data_cleaned['AQI'] = air_data_cleaned['AQI'].mask(
    air_data_cleaned['AQI'] == 0, aqi_bucket_data['AQI_calculated']
)
# Replace AQI_Bucket entries equal to 0 with the calculated bucket label
air_data_cleaned['AQI_Bucket'] = air_data_cleaned['AQI_Bucket'].mask(
    air_data_cleaned['AQI_Bucket'] == 0, aqi_bucket_data['AQI_bucket_calculated']
)

air_data_cleaned

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,0.00,0.00,0.92,18.22,17.15,0.00,0.92,27.64,133.36,0.00,0.02,0.00,149.0,Moderate
1,Ahmedabad,2015-01-02,0.00,0.00,0.97,15.69,16.46,0.00,0.97,24.55,34.06,3.68,5.50,3.77,48.0,Good
2,Ahmedabad,2015-01-03,0.00,0.00,17.40,19.30,29.70,0.00,17.40,29.07,30.70,6.80,16.40,2.25,302.0,Very Poor
3,Ahmedabad,2015-01-04,0.00,0.00,1.70,18.48,17.97,0.00,1.70,18.59,36.08,4.43,10.14,1.00,85.0,Satisfactory
4,Ahmedabad,2015-01-05,0.00,0.00,22.10,21.42,37.76,0.00,22.10,39.33,39.31,7.01,18.89,2.78,330.0,Very Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [13]:
# Fill each gap with the next valid value further down the same column
# (backward fill); whatever is still missing afterwards becomes 0.
# DataFrame.bfill is used because fillna(method='bfill') is deprecated
# since pandas 2.1.
# NOTE(review): the fill runs over the whole table in row order with no
# grouping by City, so a gap at the end of one city's rows presumably takes
# its value from the next city - confirm this is intended.
air_data.bfill(axis=0).fillna(0)

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,73.24,141.54,0.92,18.22,17.15,26.64,0.92,27.64,133.36,0.00,0.02,0.00,209.0,Poor
1,Ahmedabad,2015-01-02,73.24,141.54,0.97,15.69,16.46,26.64,0.97,24.55,34.06,3.68,5.50,3.77,209.0,Poor
2,Ahmedabad,2015-01-03,73.24,141.54,17.40,19.30,29.70,26.64,17.40,29.07,30.70,6.80,16.40,2.25,209.0,Poor
3,Ahmedabad,2015-01-04,73.24,141.54,1.70,18.48,17.97,26.64,1.70,18.59,36.08,4.43,10.14,1.00,209.0,Poor
4,Ahmedabad,2015-01-05,73.24,141.54,22.10,21.42,37.76,26.64,22.10,39.33,39.31,7.01,18.89,2.78,209.0,Poor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
29529,Visakhapatnam,2020-06-30,16.64,49.97,4.05,29.26,18.80,10.03,0.52,9.84,28.30,0.00,0.00,0.00,54.0,Satisfactory


In [14]:
# Drop every row that contains at least one missing value
air_data.dropna()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
2123,Amaravati,2017-11-25,81.40,124.50,1.44,20.50,12.08,10.72,0.12,15.24,127.09,0.20,6.50,0.06,184.0,Moderate
2124,Amaravati,2017-11-26,78.32,129.06,1.26,26.00,14.85,10.28,0.14,26.96,117.44,0.22,7.95,0.08,197.0,Moderate
2125,Amaravati,2017-11-27,88.76,135.32,6.60,30.85,21.77,12.91,0.11,33.59,111.81,0.29,7.63,0.12,198.0,Moderate
2126,Amaravati,2017-11-28,64.18,104.09,2.56,28.07,17.01,11.42,0.09,19.00,138.18,0.17,5.02,0.07,188.0,Moderate
2127,Amaravati,2017-11-29,72.47,114.84,5.23,23.20,16.59,12.25,0.16,10.55,109.74,0.21,4.71,0.08,173.0,Moderate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29525,Visakhapatnam,2020-06-26,7.63,32.27,5.91,23.27,17.19,11.15,0.46,6.87,19.90,1.45,5.37,1.45,47.0,Good
29526,Visakhapatnam,2020-06-27,15.02,50.94,7.68,25.06,19.54,12.47,0.47,8.55,23.30,2.24,12.07,0.73,41.0,Good
29527,Visakhapatnam,2020-06-28,24.38,74.09,3.42,26.06,16.53,11.99,0.52,12.72,30.14,0.74,2.21,0.38,70.0,Satisfactory
29528,Visakhapatnam,2020-06-29,22.91,65.73,3.45,29.53,18.33,10.71,0.48,8.42,30.96,0.01,0.01,0.00,68.0,Satisfactory
