# Framing the dataset as a supervised learning problem and normalizing the input variables

**Import libraries**

In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

**Load the dataset and keep only its values**

In [2]:
# Read the CSV, taking column names from its first row and the row index
# from its first column.
dataset = pd.read_csv("pollution.csv", header=0, index_col=0)
# Keep only the underlying array of values; column labels and the index are
# not carried along.
values = dataset.values

In [4]:
# Display the dimensions of the values array.
values.shape

(43800, 8)

**Preprocessing: integer-encode the labels in column 4**

In [6]:
# Encoder that maps each distinct label to an integer code.
encoder = preprocessing.LabelEncoder()

In [7]:
# Overwrite column 4 in place with its integer-encoded labels so the whole
# array can later be cast to float.
values[:,4] = encoder.fit_transform(values[:,4])

**Convert all values to float and normalize the features**

In [8]:
# Cast every entry to 32-bit float.
values = values.astype('float32')

# Rescale each column to the [0, 1] range. The scaler is fit on the whole
# array, so the min/max used are those of the full dataset.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

**Use the series_to_supervised() function to frame the dataset as a supervised learning problem, and drop rows with NaN values**

In [13]:
def series_to_supervised(dataset, n_in=1, n_out=1, dropna=True):
    """Frame a time series as a supervised learning dataset.

    Each output row holds the ``n_in`` previous observations followed by
    the current observation and the next ``n_out - 1`` observations.

    Parameters
    ----------
    dataset : list or 2-D array-like
        Sequence of observations, one row per time step. Anything accepted
        by ``pd.DataFrame`` works; a flat list is treated as one variable.
    n_in : int, default 1
        Number of lag steps (t-n_in, ..., t-1) to use as input columns.
    n_out : int, default 1
        Number of forecast steps (t, ..., t+n_out-1) to use as output columns.
    dropna : bool, default True
        Drop rows containing NaN. Shifting leaves NaN in the first ``n_in``
        and last ``n_out - 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)`` with ``j`` counting variables from 1.
    """
    df = pd.DataFrame(dataset)
    # Take the variable count from the frame itself so flat lists, lists of
    # rows, and 1-D/2-D arrays are all handled the same way.
    n_vars = df.shape[1]

    cols, names = [], []

    # input sequence (t-n_in, ..., t-1): positive shifts move past values
    # down onto the current row, oldest lag first
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [("var%d(t-%d)" % (j + 1, i)) for j in range(n_vars)]

    # forecast sequence (t, t+1, ..., t+n_out-1): negative shifts pull future
    # values up onto the current row
    for i in range(0, n_out):
        cols.append(df.shift(-i))

        if i == 0:
            names += [("var%d(t)" % (j + 1)) for j in range(n_vars)]
        else:
            names += [("var%d(t+%d)" % (j + 1, i)) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # drop the edge rows that shifting filled with NaN
    if dropna:
        agg = agg.dropna()

    return agg

In [14]:
# Frame the scaled array using the function's defaults (n_in=1, n_out=1,
# dropna=True).
reframed = series_to_supervised(scaled)