# 1) Importing Dependencies

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import Normalizer, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# 2) Utility Funcitions

In [2]:
def load_df(file):
    """Loads the Dataset"""

    df = pd.read_csv(file, delimiter=';')
    return df

def fix_target_column(y):
    """Fixing the ',' in the last column to '.'"""

    y = y.apply(lambda x: x.replace({',':'.'}, regex=True))
    y['Slowness in traffic (%)'] = np.array(y['Slowness in traffic (%)'], dtype='float64')
    return y

def fix_hour_column(y):
    """Flooring values above 24"""

    y['Hour (Coded)'] = y['Hour (Coded)'].apply(lambda x : 24 if x > 24 else x)
    return y

def encode_hour_column(df):
    """Encode the hour column into day and night"""
    
    df = fix_hour_column(df)

    # Extracting column
    hours = np.array(df['Hour (Coded)']).tolist()

    # encoding day
    day = np.array([1 if i>=6 and i<=18 else 0 for i in hours])

    # encoding night
    night = np.array([0 if i==1 else 1 for i in day])

    # Removing 'Hour' Column
    df = df.iloc[:, 1:]

    # inserting both columns at front
    df.insert(loc = 0,
            column = 'Day',
            value = day)

    df.insert(loc = 0,
            column = 'Night',
            value = night)

    return df
    
def normalize(df):
    """Min-Max Scaling on the DataFrame"""

    # Saving the Column names
    cols = list(df.columns)

    # Extracting values
    x = df.values

    # Normalising entire DataFrame
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    df = pd.DataFrame(x_scaled)

    # Renaming Columns
    df.set_axis(cols, axis="columns", inplace=True)

    return df

# 3) Loading and Pre-Processing of Data

In [3]:
df = load_df('brazil_traffic.csv')
df

Unnamed: 0,Hour (Coded),Immobilized bus,Broken Truck,Vehicle excess,Accident victim,Running over,Fire vehicles,Occurrence involving freight,Incident involving dangerous freight,Lack of electricity,Fire,Point of flooding,Manifestations,Defect in the network of trolleybuses,Tree on the road,Semaphore off,Intermittent Semaphore,Slowness in traffic (%)
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,66
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,87
3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92
4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,23,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,178
131,24,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,181
132,25,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,177
133,26,0,4,0,0,0,0,0,0,0,0,0,1,0,0,1,0,174


In [4]:
X = df.iloc[:, :-1]
X = encode_hour_column(X)
X = normalize(X)

y = df.iloc[:, -1:]
y = fix_target_column(y)
y

Unnamed: 0,Slowness in traffic (%)
0,4.1
1,6.6
2,8.7
3,9.2
4,11.1
...,...
130,17.8
131,18.1
132,17.7
133,17.4


In [5]:
X.head(25)

Unnamed: 0,Night,Day,Immobilized bus,Broken Truck,Vehicle excess,Accident victim,Running over,Fire vehicles,Occurrence involving freight,Incident involving dangerous freight,Lack of electricity,Fire,Point of flooding,Manifestations,Defect in the network of trolleybuses,Tree on the road,Semaphore off,Intermittent Semaphore
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Introducing Polynomial features

In [6]:
def PolynomialFeatures_labeled(input_df, power):
    '''Basically this is a cover for the sklearn preprocessing function. 
    The problem with that function is if you give it a labeled dataframe, it ouputs an unlabeled dataframe with potentially
    a whole bunch of unlabeled columns. 
    Inputs:
    input_df = Your labeled pandas dataframe (list of x's not raised to any power) 
    power = what order polynomial you want variables up to. (use the same power as you want entered into pp.PolynomialFeatures(power) directly)
    Ouput:
    Output: This function relies on the powers_ matrix which is one of the preprocessing function's outputs to create logical labels and 
    outputs a labeled pandas dataframe   
    '''
    poly = PolynomialFeatures(power)
    output_nparray = poly.fit_transform(input_df)
    powers_nparray = poly.powers_

    input_feature_names = list(input_df.columns)
    target_feature_names = ["Constant Term"]
    for feature_distillation in powers_nparray[1:]:
        intermediary_label = ""
        final_label = ""
        for i in range(len(input_feature_names)):
            if feature_distillation[i] == 0:
                continue
            else:
                variable = input_feature_names[i]
                power = feature_distillation[i]
                intermediary_label = "%s^%d" % (variable,power)
                if final_label == "":         #If the final label isn't yet specified
                    final_label = intermediary_label
                else:
                    final_label = final_label + " x " + intermediary_label
        target_feature_names.append(final_label)
    output_df = pd.DataFrame(output_nparray, columns = target_feature_names)
    return output_df


In [17]:
k = 1

# X = PolynomialFeatures_labeled(X, 1)
# X_poly = PolynomialFeatures_labeled(X, k)
# X_poly

poly = PolynomialFeatures(k)
df = pd.DataFrame(poly.fit_transform(X))
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
1,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
2,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
3,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
4,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1.0,1.0,1.0,0.0,0.25,0.0,0.0,0.000000,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.00,0.0
131,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
132,1.0,1.0,1.0,0.0,0.25,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
133,1.0,1.0,1.0,0.0,0.00,0.8,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000,0.0,0.25,0.0


In [8]:
X

Unnamed: 0,Constant Term,Night^1,Day^1,Immobilized bus^1,Broken Truck^1,Vehicle excess^1,Accident victim^1,Running over^1,Fire vehicles^1,Occurrence involving freight^1,Incident involving dangerous freight^1,Lack of electricity^1,Fire^1,Point of flooding^1,Manifestations^1,Defect in the network of trolleybuses^1,Tree on the road^1,Semaphore off^1,Intermittent Semaphore^1
0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
1,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
2,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
3,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
4,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1.0,1.0,0.0,0.25,0.0,0.0,0.000000,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.00,0.0
131,1.0,1.0,0.0,0.00,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
132,1.0,1.0,0.0,0.25,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
133,1.0,1.0,0.0,0.00,0.8,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000,0.0,0.25,0.0


# 4) Test-Train Split

We will test on different splits - 70:30, 80:20, 90:10 

In [9]:
# Normal
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12)

In [10]:
# Polynomial fit
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y, test_size=0.2, random_state=12)

In [11]:
X

Unnamed: 0,Constant Term,Night^1,Day^1,Immobilized bus^1,Broken Truck^1,Vehicle excess^1,Accident victim^1,Running over^1,Fire vehicles^1,Occurrence involving freight^1,Incident involving dangerous freight^1,Lack of electricity^1,Fire^1,Point of flooding^1,Manifestations^1,Defect in the network of trolleybuses^1,Tree on the road^1,Semaphore off^1,Intermittent Semaphore^1
0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
1,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
2,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
3,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
4,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1.0,1.0,0.0,0.25,0.0,0.0,0.000000,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.00,0.0
131,1.0,1.0,0.0,0.00,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
132,1.0,1.0,0.0,0.25,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.00,0.0
133,1.0,1.0,0.0,0.00,0.8,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000,0.0,0.25,0.0


In [12]:
X_poly

Unnamed: 0,Constant Term,Constant Term^1,Night^1^1,Day^1^1,Immobilized bus^1^1,Broken Truck^1^1,Vehicle excess^1^1,Accident victim^1^1,Running over^1^1,Fire vehicles^1^1,...,Tree on the road^1^3,Tree on the road^1^2 x Semaphore off^1^1,Tree on the road^1^2 x Intermittent Semaphore^1^1,Tree on the road^1^1 x Semaphore off^1^2,Tree on the road^1^1 x Semaphore off^1^1 x Intermittent Semaphore^1^1,Tree on the road^1^1 x Intermittent Semaphore^1^2,Semaphore off^1^3,Semaphore off^1^2 x Intermittent Semaphore^1^1,Semaphore off^1^1 x Intermittent Semaphore^1^2,Intermittent Semaphore^1^3
0,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130,1.0,1.0,1.0,0.0,0.25,0.0,0.0,0.000000,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
131,1.0,1.0,1.0,0.0,0.00,0.0,0.0,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
132,1.0,1.0,1.0,0.0,0.25,0.0,0.0,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
133,1.0,1.0,1.0,0.0,0.00,0.8,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0


# 5) Fitting models

In [13]:
# Linear Model
print("Results of Linear Model")
reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_test, y_test))

print("\nResults of Polynomial Model")
reg_poly = LinearRegression().fit(X_train_poly, y_train_poly)
print(reg_poly.score(X_test, y_test))

Results of Linear Model
0.20649771071675005

Results of Polynomial Model


Feature names unseen at fit time:
- Accident victim^1
- Broken Truck^1
- Day^1
- Defect in the network of trolleybuses^1
- Fire vehicles^1
- ...
Feature names seen at fit time, yet now missing:
- Accident victim^1^1
- Accident victim^1^1 x Defect in the network of trolleybuses^1^1
- Accident victim^1^1 x Defect in the network of trolleybuses^1^1 x Intermittent Semaphore^1^1
- Accident victim^1^1 x Defect in the network of trolleybuses^1^1 x Semaphore off^1^1
- Accident victim^1^1 x Defect in the network of trolleybuses^1^1 x Tree on the road^1^1
- ...



ValueError: X has 19 features, but LinearRegression is expecting 1540 features as input.