In [84]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from datetime import timedelta

## Prepare data

Split the timestamp into two features, **weekday** and **seconds** (time of day, in seconds since midnight), and convert the **action** into a numeric feature (`on` → 1, anything else → 0).

In [85]:
def get_time_in_seconds(time):
    """Return the time of day as whole seconds elapsed since midnight.

    Only the ``hour`` and ``minute`` fields of ``time`` are counted; its
    seconds component is ignored, so the result is a multiple of 60.

    Parameters
    ----------
    time : object exposing ``hour`` and ``minute`` (e.g. ``datetime.time``)

    Returns
    -------
    int
        ``hour * 3600 + minute * 60``.
    """
    seconds_from_hours = time.hour * 3600
    seconds_from_minutes = time.minute * 60
    return int(seconds_from_hours + seconds_from_minutes)

def prepare_data(data):
    """Turn a raw socket log into the numeric columns used for modelling.

    The function works on the ``data`` argument only: it does not read any
    module-level variable and does not modify ``data``, so it can be called
    repeatedly on the same frame with the same result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``'action'`` column (the string ``'on'`` marks a
        switch-on event) and a ``'timestamp'`` column holding Unix epoch
        seconds.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``data``'s index, with columns

        * ``action``  - 1 where the raw action equals ``'on'``, otherwise 0
        * ``weekday`` - day of week of the timestamp (Monday = 0 ... Sunday = 6)
        * ``seconds`` - time of day in seconds since midnight, counting hours
          and minutes only (the seconds part of the timestamp is dropped)
    """
    moments = pd.to_datetime(data['timestamp'], unit='s', origin='unix')

    return pd.DataFrame(
        {
            'action': (data['action'] == 'on').astype(int),
            'weekday': moments.dt.weekday,
            # Minute resolution: hours and minutes only, seconds are ignored.
            'seconds': moments.dt.hour * 3600 + moments.dt.minute * 60,
        },
        index=data.index,
    )

## Read data into a DataFrame

In [86]:
# Load the raw socket log; the CSV's 'index' column becomes the DataFrame index.
df = pd.read_csv('data/socket_log.csv', index_col='index')
# Reduce the raw log to the numeric columns returned by prepare_data.
socket_log = prepare_data(df)
# Show the first rows as a quick sanity check.
socket_log.head()

Unnamed: 0_level_0,action,weekday,seconds
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,26520
1,0,0,29160
2,1,0,61260
3,0,0,82740
4,1,1,26520


## Create test and training set

In [87]:
# One shuffled 80/20 split, stratified on 'weekday' so that the training and
# test sets keep the same weekday proportions; the fixed seed makes the split
# reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row numbers, not index labels, so the rows have
# to be selected with .iloc. (.loc only gives the same rows while the frame's
# index happens to be exactly 0..n-1.)
for train_index, test_index in split.split(socket_log, socket_log['weekday']):
    strat_train_set = socket_log.iloc[train_index]
    strat_test_set = socket_log.iloc[test_index]

## Extract labels from features

In [98]:
# Features: every training column except the label.
X = strat_train_set.drop(columns='action')
# Labels: a copy of the 'action' column of the training set.
y = strat_train_set['action'].copy()

index
4      1
82     1
60     1
47     0
38     1
96     1
52     1
89     0
5      0
62     1
76     1
34     1
91     0
44     1
87     0
101    0
37     0
10     1
46     1
15     0
67     0
72     1
54     1
102    1
2      1
49     0
40     1
29     0
39     0
69     0
      ..
24     1
84     1
42     1
18     1
95     0
66     1
30     1
31     0
80     1
57     0
93     0
33     0
88     1
28     1
36     1
92     1
35     0
74     1
1      0
77     0
32     1
97     0
58     1
23     0
8      1
22     1
51     0
16     1
73     0
75     0
Name: action, Length: 83, dtype: int64

## Train a DecisionTreeClassifier

In [110]:
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tie-breaking between equally good splits, so a
# re-run of the notebook builds the same tree.
socket_clf = DecisionTreeClassifier(
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=42,
)
socket_clf.fit(X, y)

# Spot-check one prediction: weekday 3 (Thursday) at 29160 s (08:06).
# The probe carries the training column names so it matches what the model
# was fitted on, and it is printed because only the last expression of a
# cell is displayed automatically.
probe = pd.DataFrame([[3, 29160]], columns=X.columns)
print(socket_clf.predict(probe))

# Feature index tested at each tree node (negative values mark leaves).
socket_clf.tree_.feature

array([ 1,  1,  1, -2, -2, -2, -2])

In [111]:
from sklearn.tree import export_graphviz

# Write the fitted tree to 'socket_tree.dot' in Graphviz format so it can be
# rendered outside the notebook.
export_graphviz(
    socket_clf,
    out_file='socket_tree.dot',
    feature_names=['weekday', 'seconds'],
    class_names=True,
    filled=True,
    rounded=True,
)

In [103]:
from sklearn.model_selection import cross_val_score

# Accuracy of the classifier on each of 5 cross-validation folds of the
# training data.
cross_val_score(
    estimator=socket_clf,
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,
)

array([ 0.77777778,  0.58823529,  0.6875    ,  0.75      ,  0.75      ])

In [96]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Predictions for every training row, obtained through 3-fold cross-validation.
y_train_pred = cross_val_predict(estimator=socket_clf, X=X, y=y, cv=3)

# Rows: actual class, columns: predicted class (scikit-learn convention).
confusion_matrix(y_true=y, y_pred=y_train_pred)

array([[23, 18],
       [ 0, 42]])