# Event predictor

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [2]:
# The file has no header row, so pandas labels the columns with integers.
input_file = '../data/building_event_binary.txt'
dataset = pd.read_csv(filepath_or_buffer=input_file, header=None)
dataset.head()

Unnamed: 0,0,1,2,3,4,5
0,Sunday,07/24/05,00:00:00,0,0,noevent
1,Sunday,07/24/05,00:30:00,1,0,noevent
2,Sunday,07/24/05,01:00:00,0,0,noevent
3,Sunday,07/24/05,01:30:00,0,0,noevent
4,Sunday,07/24/05,02:00:00,0,0,noevent


## Label Encoding

In [3]:
from collections import defaultdict
from sklearn import preprocessing
# Registry of label encoders: looking up a missing key creates a new
# LabelEncoder for it. encode_labels / transform_labels index it by column name.
d = defaultdict(preprocessing.LabelEncoder)

In [4]:
def encode_labels(x):
    """
    Label-encode one column, fitting the encoder on that column's values.

    Intended to be used with ``DataFrame.apply``. A column whose dtype is
    ``object`` is run through ``fit_transform`` of the encoder stored in the
    module-level registry ``d`` under ``x.name`` (the registry creates the
    encoder on first access). Columns of any other dtype are passed through
    unchanged. The column name, its dtype and the decision taken are printed
    as a trace.

    Parameters
    ----------
    x : column object exposing ``.name`` and ``.dtype``
        (a pandas Series when called through ``DataFrame.apply``).

    Returns
    -------
    The result of ``fit_transform`` for object columns, otherwise ``x`` itself.
    """
    needs_encoding = (x.dtype == 'object')
    # Single-argument print(...) produces the same text on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print('{0} {1}'.format(x.name, x.dtype))
    if not needs_encoding:
        print('not encoded')
        return x
    print('encoded')
    return d[x.name].fit_transform(x)

def transform_labels(x):
    """
    Label-encode one column with the encoder already registered for it.

    Intended to be used with ``DataFrame.apply`` on new data. A column whose
    dtype is ``object`` is passed to ``transform`` of the encoder stored in
    the module-level registry ``d`` under ``x.name``; nothing is re-fitted.
    Columns of any other dtype are passed through unchanged. The column name,
    its dtype and the decision taken are printed as a trace.

    Parameters
    ----------
    x : column object exposing ``.name`` and ``.dtype``
        (a pandas Series when called through ``DataFrame.apply``).

    Returns
    -------
    The result of ``transform`` for object columns, otherwise ``x`` itself.

    Raises
    ------
    KeyError
        If ``x`` is an object column but ``d`` holds no encoder for ``x.name``.
    """
    needs_encoding = (x.dtype == 'object')
    # Single-argument print(...) produces the same text on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print('{0} {1}'.format(x.name, x.dtype))
    if not needs_encoding:
        print('not encoded')
        return x
    print('encoded')
    # `d` is a defaultdict: indexing an unknown column would silently insert a
    # brand-new, unfitted encoder. Fail with a clear message instead.
    if x.name not in d:
        raise KeyError(
            'no LabelEncoder registered for column {0!r}; '
            'run encode_labels on the training data first'.format(x.name))
    return d[x.name].transform(x)

In [5]:
# Column-wise pass: each column of `dataset` is handed to encode_labels.
fit = dataset.apply(func=encode_labels, axis=0)

0 object
encoded
0 object
encoded
1 object
encoded
2 object
encoded
3 int64
not encoded
4 int64
not encoded
5 object
encoded


In [6]:
# Preview the encoded frame: the former object columns now hold integer codes.
fit.head()

Unnamed: 0,0,1,2,3,4,5
0,3,0,0,0,0,1
1,3,0,1,1,0,1
2,3,0,2,0,0,1
3,3,0,3,0,0,1
4,3,0,4,0,0,1


## Train test split

In [7]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the label.
X = fit.values[:, :-1]
Y = fit.values[:, -1]
# Hold out 25% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=5,
)

## Training

In [8]:
# RBF-kernel SVM with probability estimates enabled and balanced class weights.
params = dict(kernel='rbf', probability=True, class_weight='balanced')
classifier = SVC(**params)
# Note: the model is fitted on the full X, Y, not on X_train / Y_train.
classifier.fit(X, Y)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Evaluation

In [9]:
from sklearn import model_selection

# 3-fold cross-validated accuracy of `classifier` over the full X, Y.
accuracy = model_selection.cross_val_score(
    classifier, X, Y, scoring='accuracy', cv=3)
# Single-argument print(...) runs on both Python 2 and Python 3;
# the former print statement was a SyntaxError on Python 3.
print('Accuracy of the classifier: ' + str(100*accuracy.mean()) + '%')


Accuracy of the classifier: 96.4286178261%


## Test: predict on a new sample

In [14]:
# One hand-written sample, in the same column order as the feature columns
# of the training file (everything except the label).
input_data = pd.DataFrame([['Tuesday', '07/24/05', '02:00:00', 21, 23]])
input_data

Unnamed: 0,0,1,2,3,4
0,Tuesday,07/24/05,02:00:00,21,23


In [15]:
# Encode the sample column by column with the encoders stored in `d`.
input_data = input_data.apply(func=transform_labels, axis=0)

0 object
encoded
0 object
encoded
1 object
encoded
2 object
encoded
3 int64
not encoded
4 int64
not encoded


In [17]:
# Show which columns have an encoder registered in `d`.
d

defaultdict(sklearn.preprocessing.label.LabelEncoder,
            {0: LabelEncoder(),
             1: LabelEncoder(),
             2: LabelEncoder(),
             5: LabelEncoder()})

In [16]:
# Predict the encoded class for the single encoded sample in `input_data`.
classifier.predict(input_data)

array([1])

In [20]:
# Decode the class predicted for `input_data` back to its original label with
# the encoder registered for column 5 (the label column), instead of a
# hard-coded class index. Single-argument print(...) runs on Python 2 and 3.
print(d[5].inverse_transform(classifier.predict(input_data)))

['noevent']
