In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer


In [2]:
# Data files live in a `data` directory relative to the notebook's working directory.
base_path = os.path.join('', 'data')
train_path, test_path = (
    os.path.join(base_path, file_name) for file_name in ('Train.csv', 'Test.csv')
)

In [3]:
# Read the train and test CSV files into DataFrames (train first, then test).
train_df, test_df = map(pd.read_csv, (train_path, test_path))

In [4]:
# Count the rows whose pm2_5 value is missing.
train_df['pm2_5'].isna().sum()

0

## EDA

In [5]:
# Report the (rows, columns) shape of the train and test DataFrames.
print(f'train_df shape {train_df.shape} and test_df shape: {test_df.shape}')

train_df shape (8071, 80) and test_df shape: (2783, 79)


In [6]:

# Select the columns in which more than 45% of the training rows are missing.
# `isnull().mean()` yields the missing fraction of every column in one vectorized
# pass, replacing the per-column Python loop.
MISSING_FRACTION_LIMIT = 0.45
missing_fraction = train_df.isnull().mean()
drop_cols = missing_fraction[missing_fraction > MISSING_FRACTION_LIMIT].index.tolist()
print(f'the number of columns to drop is {len(drop_cols)}')

the number of columns to drop is 36


In [7]:
# Remove the columns listed in drop_cols from the training frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution is a no-op rather than a KeyError.
# NOTE(review): test_df is not modified here; apply the same drop before using it.
train_df = train_df.drop(columns=drop_cols, errors='ignore')

In [8]:
# Columns of the training frame that contain at least one missing value.
# The comparison is `> 0` so that a column with exactly one missing entry is
# also listed (and therefore imputed downstream).
null_counts = train_df.isnull().sum()
null_cols = null_counts[null_counts > 0].index.tolist()
print(null_cols)
print(f'total number of columns with missing values is {len(null_cols)}')

['carbonmonoxide_co_column_number_density', 'carbonmonoxide_h2o_column_number_density', 'carbonmonoxide_cloud_height', 'carbonmonoxide_sensor_altitude', 'carbonmonoxide_sensor_azimuth_angle', 'carbonmonoxide_sensor_zenith_angle', 'carbonmonoxide_solar_azimuth_angle', 'carbonmonoxide_solar_zenith_angle', 'uvaerosolindex_absorbing_aerosol_index', 'uvaerosolindex_sensor_altitude', 'uvaerosolindex_sensor_azimuth_angle', 'uvaerosolindex_sensor_zenith_angle', 'uvaerosolindex_solar_azimuth_angle', 'uvaerosolindex_solar_zenith_angle', 'ozone_o3_column_number_density', 'ozone_o3_column_number_density_amf', 'ozone_o3_slant_column_number_density', 'ozone_o3_effective_temperature', 'ozone_cloud_fraction', 'ozone_sensor_azimuth_angle', 'ozone_sensor_zenith_angle', 'ozone_solar_azimuth_angle', 'ozone_solar_zenith_angle', 'cloud_cloud_fraction', 'cloud_cloud_top_pressure', 'cloud_cloud_top_height', 'cloud_cloud_base_pressure', 'cloud_cloud_base_height', 'cloud_cloud_optical_depth', 'cloud_surface_alb

In [9]:
# find the dtype of every column that has missing values
d_types = train_df[null_cols].dtypes.tolist()
print(d_types)

# Report one line per dtype so that a mixed set of dtypes is summarized in full.
# When null_cols is empty the loop prints nothing instead of failing on d_types[0].
for dtype, n_cols in train_df[null_cols].dtypes.value_counts().items():
    print(f'there are {n_cols} columns with missing values of data type {dtype}')

[dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64'), dtype('float64')]
there are 34 missing values with data typefloat64


We can see that all of the columns with missing values are continuous in nature; therefore, we can use numeric imputation methods such as mean imputation (other methods to explore later).

In [10]:
from sklearn.impute import SimpleImputer

def impute_variables(df, numeric_variables, strategy='mean'):
    """Return a copy of ``df`` with missing values in ``numeric_variables`` imputed.

    The input frame is left untouched, so the raw data stays available and the
    calling cell can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to impute.
    numeric_variables : list of str
        Names of the numeric columns whose missing values should be filled.
    strategy : str, default 'mean'
        Strategy forwarded to ``SimpleImputer``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the listed columns have been imputed.

    Notes
    -----
    The imputer is fitted on the frame passed in. Fit on the training split
    only if statistics from held-out rows must not influence the fill values.
    """
    imputed = df.copy()
    numeric_variables = list(numeric_variables)
    # Nothing to impute: the imputer cannot be fitted on zero columns.
    if not numeric_variables:
        return imputed
    num_imputer = SimpleImputer(strategy=strategy)
    imputed[numeric_variables] = num_imputer.fit_transform(imputed[numeric_variables])
    return imputed
# Fill the missing values in the columns listed in null_cols.
train_df_imputed = impute_variables(train_df, null_cols)


In [11]:
# Summarize the imputed frame: column names, non-null counts and dtypes.
train_df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8071 entries, 0 to 8070
Data columns (total 44 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   id                                        8071 non-null   object 
 1   site_id                                   8071 non-null   object 
 2   site_latitude                             8071 non-null   float64
 3   site_longitude                            8071 non-null   float64
 4   city                                      8071 non-null   object 
 5   country                                   8071 non-null   object 
 6   date                                      8071 non-null   object 
 7   hour                                      8071 non-null   int64  
 8   month                                     8071 non-null   float64
 9   carbonmonoxide_co_column_number_density   8071 non-null   float64
 10  carbonmonoxide_h2o_column_number_den

In [13]:
# Splitting the data into train, validation and test sets
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% as the validation set. Both splits use the same fixed seed.
df_train_full, df_test = train_test_split(train_df_imputed, test_size=0.2, random_state=17)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=17)

# Renumber the rows of every partition from zero.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Separate the pm2_5 target from the features: pop() returns the column and
# removes it from the frame in one step.
y_train = df_train.pop('pm2_5')
y_val = df_val.pop('pm2_5')
y_test = df_test.pop('pm2_5')

In [14]:
# Show the (rows, columns) shape of the train, validation and test feature frames.
print(f'shapes of the train, validation and test set are: {df_train.shape, df_val.shape, df_test.shape}')

shapes of the train, validation and test set are: ((4842, 43), (1614, 43), (1615, 43))


In [15]:
# Apply a DictVectorizer to the data: each row becomes a dict of feature -> value.
# The vectorizer is fitted on the training rows only and then reused unchanged
# for the validation and test rows.
# NOTE(review): object-typed columns such as `id` and `date` are still in the
# frames at this point, so every distinct string becomes its own feature --
# confirm this is intended.
dv = DictVectorizer(sparse=True)
x_train_dv = dv.fit_transform(df_train.to_dict(orient='records'))
x_val_dv, x_test_dv = (
    dv.transform(part.to_dict(orient='records')) for part in (df_val, df_test)
)


In [17]:
# Creating a Baseline model 
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [18]:
# Baseline: a 10-tree random forest with a fixed seed, trained on all cores.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=10, random_state=17, n_jobs=-1).fit(x_train_dv, y_train)

# Validation RMSE: square root of the mean squared error on the validation split.
y_val_preds = model.predict(x_val_dv)
val_mse = mean_squared_error(y_val, y_val_preds)
rmse = np.sqrt(val_mse)
print(f"{rmse:.2f}")


14.25


In [None]:
# This will be our baseline model
# The goal is to make this model perform better