In [88]:
%matplotlib inline

In [89]:
import pandas as pd
import numpy as np

## Loading the dataset

In [90]:
air_data = pd.read_csv("datasets/AirQualityUCI.csv", sep=";")

In [91]:
air_data.drop(["Unnamed: 15", "Unnamed: 16"], axis=1, inplace=True)

In [92]:
air_data.dropna(axis=0, inplace=True)

In [93]:
def string_to_float(row):
    for index, value in row.iteritems():
        if (index != "index" and index != "Date" and index != "Time" and not type(value) is float):
            row[index] = float(value.replace(",", "."))
    
    return row
    
air_data = air_data.apply(string_to_float, axis=1)

In [94]:
air_data["no2_cat"] = np.ceil(air_data["NO2(GT)"] / 100)

In [95]:
air_data["no2_cat"].value_counts()

 2.0    4015
 1.0    3314
-2.0    1642
 3.0     377
 4.0       9
Name: no2_cat, dtype: int64

In [96]:
from sklearn.model_selection import StratifiedShuffleSplit

In [97]:
shuffle_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=107)

train_set = None
test_set = None

for train_indexes, test_indexes in shuffle_splitter.split(air_data, air_data["no2_cat"]):
    train_set = air_data.loc[train_indexes]
    test_set = air_data.loc[test_indexes]

In [98]:
train_set.drop(["no2_cat", "Date"], axis=1, inplace=True)

In [99]:
train_set.head(n=10)

Unnamed: 0,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
4186,04.00.00,0.5,838.0,-200.0,2.8,639.0,72.0,963.0,48.0,1391.0,596.0,20.8,64.9,1.571
8467,13.00.00,2.8,971.0,-200.0,4.5,739.0,181.0,868.0,117.0,972.0,664.0,9.9,43.5,0.53
8587,13.00.00,1.4,1092.0,-200.0,4.9,759.0,268.0,773.0,150.0,1088.0,1039.0,5.3,69.2,0.6213
5992,10.00.00,1.8,883.0,-200.0,6.8,849.0,361.0,910.0,141.0,1053.0,843.0,12.6,36.6,0.5314
3459,21.00.00,1.7,1093.0,-200.0,9.1,944.0,71.0,711.0,85.0,1655.0,785.0,29.5,42.7,1.7316
5031,09.00.00,-200.0,1528.0,-200.0,25.8,1449.0,-200.0,473.0,-200.0,2039.0,1756.0,20.4,61.2,1.447
7601,11.00.00,4.2,1440.0,-200.0,20.5,1311.0,815.0,482.0,183.0,1476.0,2022.0,9.3,54.5,0.6415
9208,10.00.00,3.3,1303.0,-200.0,14.4,1128.0,460.0,489.0,152.0,1573.0,1541.0,17.7,55.6,1.112
2211,21.00.00,2.4,1056.0,-200.0,12.0,1051.0,163.0,826.0,116.0,1758.0,1078.0,27.5,32.6,1.176
2481,03.00.00,0.2,735.0,-200.0,1.4,533.0,-200.0,1338.0,-200.0,1350.0,493.0,21.0,47.5,1.1666


## Pipelines

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [101]:
class DataFrameColumnSelector(TransformerMixin, BaseEstimator):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns].values

In [134]:
class RowCleaner():
    def __init__(self, column_actions):
        self.column_actions = column_actions # (index, illegal_value)
    
    def transform(self, X, y):
        transformed_x = []
        transformed_y = []
        
        for row_and_feature in zip(X, y):
            add = True
            for column_action in self.column_actions:
                if (row_and_feature[0][column_action[0]] == column_action[1]):
                    add = False
                    
            if add:
                transformed_x.append(row_and_feature[0])
                transformed_y.append(row_and_feature[1])
        
        return transformed_x, transformed_y

In [127]:
pipeline = Pipeline([
    ("column_selection", DataFrameColumnSelector(train_set.drop(["Time", "CO(GT)"], axis=1).columns.values)),
    #("impute", SimpleImputer(missing_values=-200, strategy="mean")),
    ("scale", StandardScaler())
])

In [132]:
train_set_prepared = pipeline.fit_transform(train_set)
train_set_labels = train_set["CO(GT)"]

array([[-0.6401682 , -0.290162  ,  0.02248666, ...,  0.25496909,
         0.49462197,  0.21607945],
       [-0.23722391, -0.290162  ,  0.06348147, ...,  0.00314889,
         0.07759249,  0.18942349],
       [ 0.1293645 , -0.290162  ,  0.07312731, ..., -0.10312385,
         0.57841761,  0.19176133],
       ...,
       [ 0.15663141, -0.290162  ,  0.26845554, ...,  0.10711136,
        -0.2205641 ,  0.18761314],
       [-0.55533782, -0.290162  ,  0.07553877, ...,  0.12559357,
        -0.405694  ,  0.18408206],
       [ 0.62622814, -0.290162  ,  0.47584107, ...,  0.40051654,
        -0.01204936,  0.21108114]])

## Linear regression model

In [129]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [130]:
regression_scores = cross_val_score(LinearRegression(), train_set_prepared, train_set_labels,
                                scoring="neg_mean_squared_error", cv=10)

regression_scores_rmse = np.sqrt(-regression_scores)

In [131]:
print(regression_scores_rmse)
print(regression_scores_rmse.mean())

[60.95694183 60.87529715 56.52156733 59.40913352 54.81369197 54.3357327
 54.95957143 56.05698351 53.64710392 55.91521934]
56.7491242715574
