# Framing the dataset to supervised learning problem and normalizing the input variables

**Import library's**

In [17]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

**Load dataset, and get only values**

In [18]:
dataset = pd.read_csv("pollution.csv", header=0, index_col=0)
values = dataset.values

In [19]:
values.shape

(43800, 8)

**Preprocessing: integer encode**

In [20]:
encoder = preprocessing.LabelEncoder()

In [21]:
values[:,4] = encoder.fit_transform(values[:,4])

**Transform all values in float and normalize features**

In [22]:
values = values.astype('float32')

scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

**Use series_to_supervised() function to framing the dataset in supervised learning problem, and drop rows with NaN values**

In [23]:
def series_to_supervised(dataset, n_in=1, n_out=1, dropna=True):
    
    n_vars = 1 if type(dataset) is list else dataset.shape[1]
    
    df = pd.DataFrame(dataset)
    
    cols, names = list(), list()
    
    # input sequence(t-n, ..., t-1)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        names += [("var%d(t-%d)"% (j+1, i)) for j in range(n_vars)]
        
    # forecast sequence (t, t+1, ..., t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        
        if i == 0:
            names += [("var%d(t)"%(j+1)) for j in range(n_vars)]
        else:
            names += [("var%d(t+%d)"%(j+1, i)) for j in range(n_vars)]
            
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    
    # drop rows with NaN values
    if dropna:
        agg.dropna(inplace=True)
    
    return agg

In [24]:
reframed = series_to_supervised(scaled)

**drop columns not predict**

In [25]:
reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)

In [26]:
reframed.head(5)

Unnamed: 0,var1(t-0),var2(t-0),var3(t-0),var4(t-0),var5(t-0),var6(t-0),var7(t-0),var8(t-0),var1(t)
0,0.129779,0.352941,0.245902,0.527273,0.666667,0.00229,0.0,0.0,0.129779
1,0.148893,0.367647,0.245902,0.527273,0.666667,0.003811,0.0,0.0,0.148893
2,0.15996,0.426471,0.229508,0.545454,0.666667,0.005332,0.0,0.0,0.15996
3,0.182093,0.485294,0.229508,0.563637,0.666667,0.008391,0.037037,0.0,0.182093
4,0.138833,0.485294,0.229508,0.563637,0.666667,0.009912,0.074074,0.0,0.138833


**Save the new dataset**

In [27]:
reframed.to_csv("pollution.csv")