In [1]:
# Pandas, NumPy, matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# MySQL
import mysql.connector as sql
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from sklearn.preprocessing import MinMaxScaler
output_notebook()

## Connect to DB

In [2]:
con = sql.connect(host='localhost', database='secom', user='root', password='1234')

## Load data from table_secom

In [3]:
df_table_secom = pd.read_sql('SELECT * FROM table_secom', con=con)
list_imp_vars = ['v65','v60','v427','v104','v66','v17','v154','v66','v442','v39','v268','v407']
df_table_secom.head()

Unnamed: 0,id,results,timestamp,v1,v2,v3,v4,v5,v6,v7,...,v577,v578,v583,v584,v585,v586,v587,v588,v589,v590
0,2,1,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
1,3,-1,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
2,5,-1,2008-07-19 17:53:00,2946.25,2432.84,2233.3667,1326.52,1.5334,100.0,100.3967,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.0342,0.0151,0.0052,44.0077
3,7,-1,2008-07-19 19:45:00,3058.88,2690.15,2248.9,1004.4692,0.7884,100.0,106.24,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.0204,0.0194,0.0063,95.031
4,8,-1,2008-07-19 20:24:00,2967.68,2600.47,2248.9,1004.4692,0.7884,100.0,106.24,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.0111,0.0124,0.0045,111.6525


In [4]:
df_weeks = df_table_secom.loc[:,['timestamp','results']]
df_weeks['month'] = df_weeks['timestamp'].dt.month
df_weeks['dayofweek'] = df_weeks['timestamp'].dt.dayofweek
df_week_behavior = df_weeks[['month', 'dayofweek', 'results']].groupby(['month', 'dayofweek']).agg(['count'])
df_week_behavior.columns = ["_".join(x) for x in df_week_behavior.columns.ravel()]
df_week_behavior['perc_total'] = df_week_behavior['results_count']/df_week_behavior['results_count'].sum() * 100
df_week_behavior['count_passed'] = df_weeks[df_weeks['results'] == -1][['month', 'dayofweek', 'results']].groupby(['month', 'dayofweek']).agg(['count'])
df_week_behavior['perc_sel_passed'] = df_week_behavior['count_passed']/df_week_behavior['results_count'] * 100
df_week_behavior['perc_total_passed'] = df_week_behavior['count_passed']/df_week_behavior['results_count'].sum() * 100
df_week_behavior['count_failed'] = df_weeks[df_weeks['results'] == 1][['month', 'dayofweek', 'results']].groupby(['month', 'dayofweek']).agg(['count'])
df_week_behavior['perc_sel_failed'] = df_week_behavior['count_failed']/df_week_behavior['results_count'] * 100
df_week_behavior['perc_total_failed'] = df_week_behavior['count_failed']/df_week_behavior['results_count'].sum() * 100

In [5]:
df_week_behavior.sort_values(['perc_sel_failed'], ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,results_count,perc_total,count_passed,perc_sel_passed,perc_total_passed,count_failed,perc_sel_failed,perc_total_failed
month,dayofweek,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7,4,1,0.082713,,,,1.0,100.0,0.082713
7,5,8,0.661704,5.0,62.5,0.413565,3.0,37.5,0.248139
7,0,10,0.82713,7.0,70.0,0.578991,3.0,30.0,0.248139
5,5,30,2.48139,24.0,80.0,1.985112,6.0,20.0,0.496278
7,1,20,1.65426,16.0,80.0,1.323408,4.0,20.0,0.330852


In [6]:
df_week_behavior.sort_values(['perc_total_failed'], ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,results_count,perc_total,count_passed,perc_sel_passed,perc_total_passed,count_failed,perc_sel_failed,perc_total_failed
month,dayofweek,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8,4,71,5.872622,65.0,91.549296,5.376344,6.0,8.450704,0.496278
5,5,30,2.48139,24.0,80.0,1.985112,6.0,20.0,0.496278
2,6,34,2.812242,29.0,85.294118,2.398677,5.0,14.705882,0.413565
6,1,45,3.722084,40.0,88.888889,3.308519,5.0,11.111111,0.413565
8,0,30,2.48139,25.0,83.333333,2.067825,5.0,16.666667,0.413565


In [7]:
df_months = df_table_secom.loc[:,['timestamp','results']]
df_months['month'] = df_weeks['timestamp'].dt.month
df_months_behavior = df_weeks[['month','results']].groupby(['month']).agg(['count'])
df_months_behavior.columns = ["_".join(x) for x in df_months_behavior.columns.ravel()]
df_months_behavior['count_passed'] = df_months[df_months['results'] == -1][['month', 'results']].groupby(['month']).agg(['count'])
df_months_behavior['count_failed'] = df_months[df_months['results'] == 1][['month', 'results']].groupby(['month']).agg(['count'])
df_months_behavior['perc_sel_passed'] = df_months_behavior['count_passed']/df_months_behavior['results_count'] * 100
df_months_behavior['perc_sel_failed'] = df_months_behavior['count_failed']/df_months_behavior['results_count'] * 100

In [8]:
df_months_behavior.sort_values(['perc_sel_failed'], ascending=False)

Unnamed: 0_level_0,results_count,count_passed,count_failed,perc_sel_passed,perc_sel_failed
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,98,83,15.0,84.693878,15.306122
5,47,40,7.0,85.106383,14.893617
6,57,51,6.0,89.473684,10.526316
4,37,34,3.0,91.891892,8.108108
11,25,23,2.0,92.0,8.0
8,308,284,24.0,92.207792,7.792208
2,65,60,5.0,92.307692,7.692308
10,94,89,5.0,94.680851,5.319149
9,374,358,16.0,95.721925,4.278075
1,35,34,1.0,97.142857,2.857143


In [9]:
df_month_var = df_table_secom.loc[:,list_imp_vars]
scaler = MinMaxScaler()
df_month_var_scaled = pd.DataFrame(scaler.fit_transform(df_month_var), columns=df_month_var.columns)
df_month_var_scaled['month'] = df_table_secom.loc[:,['timestamp']]['timestamp'].dt.month

In [10]:
df_month_var = pd.melt(df_month_var_scaled.reset_index(), id_vars=['month'], value_vars=list_imp_vars, var_name='variable', value_name='value')

In [11]:
df_month_description = df_month_var.groupby(['month','variable']).describe()
df_month_description.columns = df_month_description.columns.droplevel(0)
df_month_description = df_month_description.reset_index()
df_month_description = pd.merge(df_month_description, df_months_behavior,  how='left', left_on=['month'], right_index=True)
df_month_description = df_month_description.sort_values(['month'])
df_month_description.head()

Unnamed: 0,month,variable,count,mean,std,min,25%,50%,75%,max,results_count,count_passed,count_failed,perc_sel_passed,perc_sel_failed
0,1,v104,35.0,0.386119,0.080859,0.202429,0.350202,0.376518,0.437247,0.538462,35,34,1.0,97.142857,2.857143
1,1,v154,35.0,0.057337,0.165355,0.0034,0.016362,0.025924,0.042711,1.0,35,34,1.0,97.142857,2.857143
2,1,v17,35.0,0.625112,0.055082,0.509694,0.594333,0.620258,0.654078,0.733354,35,34,1.0,97.142857,2.857143
3,1,v268,35.0,0.197246,0.204538,0.054348,0.063768,0.081884,0.308696,0.722464,35,34,1.0,97.142857,2.857143
4,1,v39,35.0,0.567411,0.132327,0.405578,0.485052,0.542525,0.595209,0.96362,35,34,1.0,97.142857,2.857143


In [12]:
plt_month_mean = figure(width=1000) 
color = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
x_values = df_month_description['month'].unique()
for i, var in enumerate(list_imp_vars):
    y_values = list(df_month_description[df_month_description['variable'] == var]['mean'])
    plt_month_mean.line(x_values, y_values, legend=var, line_width=2, color=color[i])
show(plt_month_mean)