### Import libraries

In [1]:
import pandas as pd

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.options.display.max_columns = None

### Read dataframe

In [2]:
# Load the merged dataset from disk into the working DataFrame
df = pd.read_csv(filepath_or_buffer="../data/datasets/dataset_merged.csv")

### Data inspection

In [3]:
# Remember the dataset size before cleaning so the loss can be reported later
initial_rows, initial_cols = df.shape

In [4]:
# Check each column's datatype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25864 entries, 0 to 25863
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Luna/Zi                                 25864 non-null  object 
 1   County                                  25864 non-null  object 
 2   42i - NO2_Valoare [µg/m³]               9682 non-null   float64
 3   43i - SO2_Valoare [µg/m³]               10839 non-null  float64
 4   48i - CO_Valoare [mg/m³]                8367 non-null   float64
 5   GRAV. 10 - PM 10_Valoare [µg/m³]        24055 non-null  float64
 6   GRAV. 2.5 - PM 2.5_Valoare [µg/m³]      18154 non-null  float64
 7   MTX - Presiunea aerului_Valoare [mbar]  25431 non-null  float64
 8   MTX - Temperatura aer_Valoare [°C]      25645 non-null  float64
 9   MTX - Umiditate relativa_Valoare [%]    25129 non-null  float64
 10  article_date                            25864 non-null  ob

In [5]:
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
42i - NO2_Valoare [µg/m³],9682.0,20.781629,13.629018,-6.79,10.5625,17.75,27.751812,171.54
43i - SO2_Valoare [µg/m³],10839.0,8.051944,4.384233,-0.275,4.96,6.746667,10.601667,105.01
48i - CO_Valoare [mg/m³],8367.0,0.678921,0.492437,-0.303333,0.39,0.57,0.8125,5.02
GRAV. 10 - PM 10_Valoare [µg/m³],24055.0,26.929215,14.887876,0.97,17.26,23.96,32.855,230.285714
GRAV. 2.5 - PM 2.5_Valoare [µg/m³],18154.0,17.087652,12.181133,0.0,9.61,13.96,20.89,198.31
MTX - Presiunea aerului_Valoare [mbar],25431.0,990.103609,29.271938,805.7,975.033333,1000.5,1008.34,1100.0
MTX - Temperatura aer_Valoare [°C],25645.0,12.409086,9.203435,-19.123333,5.05,12.58,20.3,38.9325
MTX - Umiditate relativa_Valoare [%],25129.0,74.144727,15.292521,6.0,63.5,75.166667,86.0,100.0


In [6]:
# Percentage of missing values for each column. Scale to percent first and
# round afterwards: rounding the fraction and then multiplying by 100
# reintroduces floating-point noise (e.g. 58.089999999999996).
missing_percentage = (df.isna().mean() * 100).round(2)

# Columns whose share of missing values exceeds this percentage are reported
missing_threshold_pct = 20

# Print columns with more than `missing_threshold_pct`% missing values
for col, missing in missing_percentage.items():
    if missing > missing_threshold_pct:
        print(f"Column {col} has {missing}% missing values.")

Column 42i - NO2_Valoare [µg/m³] has 62.57% missing values.
Column 43i - SO2_Valoare [µg/m³] has 58.089999999999996% missing values.
Column 48i - CO_Valoare [mg/m³] has 67.65% missing values.
Column GRAV. 2.5 - PM 2.5_Valoare [µg/m³] has 29.81% missing values.


- We can see that columns `42i - NO2_Valoare [µg/m³]`, `43i - SO2_Valoare [µg/m³]` and `48i - CO_Valoare [mg/m³]` each have more than 50% missing values, so we will drop them.
- `GRAV. 2.5 - PM 2.5_Valoare [µg/m³]` (29.81% missing) and `article_date` are also dropped in the cleaning cell below.
- For the remaining columns, we will simply drop the rows that contain missing values rather than impute them (ffill, bfill, mean, median, etc.), since imputation would pollute the data with values that were never measured.

### Clean dataset

In [7]:
# Columns removed before dropping incomplete rows: the three pollutant columns
# with more than half of their values missing, plus PM 2.5 and article_date.
columns_to_drop = [
    '42i - NO2_Valoare [µg/m³]',
    '43i - SO2_Valoare [µg/m³]',
    '48i - CO_Valoare [mg/m³]',
    'GRAV. 2.5 - PM 2.5_Valoare [µg/m³]',
    'article_date',
]

# Build the cleaned frame in a single chain instead of mutating with
# inplace=True: drop the columns, discard every row that still has a missing
# value, then renumber the index from 0.
df = df.drop(columns=columns_to_drop).dropna().reset_index(drop=True)

In [8]:
# Dataset size after cleaning
actual_rows, actual_cols = df.shape

In [9]:
# Report how many rows and columns the cleaning step removed, both as absolute
# counts and as a percentage of the original dataset size.
lost_rows = initial_rows - actual_rows
lost_cols = initial_cols - actual_cols

print(f'Lost rows: {lost_rows} ({round(lost_rows / initial_rows * 100, 3)}%)')
print(f'Lost columns: {lost_cols} ({round(lost_cols / initial_cols * 100, 3)}%)')

Lost rows: 2542 (9.828%)
Lows columns 5 (38.462%)


Even though about 9.8% of the rows (2,542 of 25,864) are lost, 23,322 rows remain, which we consider a sufficiently large number to perform a conclusive analysis.

### Reorder dataframe columns

In [10]:
# Target column layout for the cleaned dataset
new_column_order = [
    'Luna/Zi',
    'region',
    'County',
    'phenomena',
    'GRAV. 10 - PM 10_Valoare [µg/m³]',
    'MTX - Presiunea aerului_Valoare [mbar]',
    'MTX - Temperatura aer_Valoare [°C]',
    'MTX - Umiditate relativa_Valoare [%]',
]

# Select all rows with the columns arranged in the order listed above
df = df.loc[:, new_column_order]

### Export this dataset

In [11]:
# Write the cleaned dataset to disk without the index column
df.to_csv(path_or_buf="../data/datasets/dataset_cleaned.csv", index=False)