In [143]:
%matplotlib inline

In [186]:
import pandas as pd
import numpy as np

## Loading the dataset

In [187]:
air_data = pd.read_csv("datasets/AirQualityUCI.csv", sep=";")

In [188]:
air_data.drop(["Unnamed: 15", "Unnamed: 16"], axis=1, inplace=True)

In [189]:
air_data.dropna(axis=0, inplace=True)

In [190]:
def string_to_float(row):
    for index, value in row.iteritems():
        if (index != "index" and index != "Date" and index != "Time" and not type(value) is float):
            row[index] = float(value.replace(",", "."))
    
    return row
    
air_data = air_data.apply(string_to_float, axis=1)

In [191]:
air_data["no2_cat"] = np.ceil(air_data["NO2(GT)"] / 100)

In [192]:
air_data["no2_cat"].value_counts()

 2.0    4015
 1.0    3314
-2.0    1642
 3.0     377
 4.0       9
Name: no2_cat, dtype: int64

In [193]:
from sklearn.model_selection import StratifiedShuffleSplit

In [194]:
shuffle_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=107)

train_set = None
test_set = None

for train_indexes, test_indexes in shuffle_splitter.split(air_data, air_data["no2_cat"]):
    train_set = air_data.loc[train_indexes]
    test_set = air_data.loc[test_indexes]

In [195]:
train_set.drop(["no2_cat", "Date"], axis=1, inplace=True)

In [196]:
train_set.head()

Unnamed: 0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
4186,04.00.00,0.5,838.0,-200.0,2.8,639.0,72.0,963.0,48.0,1391.0,596.0,20.8,64.9,1.571
8467,13.00.00,2.8,971.0,-200.0,4.5,739.0,181.0,868.0,117.0,972.0,664.0,9.9,43.5,0.53
8587,13.00.00,1.4,1092.0,-200.0,4.9,759.0,268.0,773.0,150.0,1088.0,1039.0,5.3,69.2,0.6213
5992,10.00.00,1.8,883.0,-200.0,6.8,849.0,361.0,910.0,141.0,1053.0,843.0,12.6,36.6,0.5314
3459,21.00.00,1.7,1093.0,-200.0,9.1,944.0,71.0,711.0,85.0,1655.0,785.0,29.5,42.7,1.7316


## Pipelines

In [197]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [198]:
class DataFrameColumnSelector(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.columns].values

In [199]:
pipeline = Pipeline([
    ("column_selection", DataFrameColumnSelector(train_set.drop(["Time", "CO(GT)"], axis=1).columns.values)),
    ("impute", SimpleImputer(missing_values=-200, strategy="mean")),
    ("scale", StandardScaler())
])

In [200]:
train_set_prepared = pipeline.fit_transform(train_set)
train_set_labels = train_set["CO(GT)"]

## Linear regression model

In [201]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [202]:
regression_scores = cross_val_score(LinearRegression(), train_set_prepared, train_set_labels,
                                scoring="neg_mean_squared_error", cv=10)

regression_scores_rmse = np.sqrt(-regression_scores)

In [203]:
print(regression_scores_rmse)
print(regression_scores_rmse.mean())

[77.72951345 77.27828781 73.24256041 77.01563558 74.5084942  72.07126427
 72.17726834 74.15665574 74.33981504 77.29270073]
74.98121955811483
