In [1]:
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def sturges(data): 
    return int((1 + 3.3 * np.log10(len(data))))

df_smart_tv = pd.read_csv('dataset_smart-tv.csv', usecols = ['date_hour','bytes_up', 'bytes_down'])
df_chromecast = pd.read_csv('dataset_chromecast.csv', usecols = ['date_hour','bytes_up', 'bytes_down'])

df_smart_tv = pd.concat([df_smart_tv["date_hour"], np.log10(df_smart_tv[['bytes_up', 'bytes_down']].replace(0, np.nan))], axis=1)
df_chromecast = pd.concat([df_chromecast["date_hour"], np.log10(df_chromecast[['bytes_up', 'bytes_down']].replace(0, np.nan))], axis=1)

df_smart_tv = df_smart_tv.replace(np.nan, 0)
df_chromecast = df_chromecast.replace(np.nan, 0)

values = df_smart_tv["date_hour"]
converted_values = []
for x in values:
    try:
        converted_values += [datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour]
    except:
        converted_values += [x]

df_smart_tv["date_hour"] = converted_values

values = df_chromecast["date_hour"]
converted_values = []
for x in values:
    try:
        converted_values += [datetime.strptime(x, "%Y-%m-%d %H:%M:%S").hour]
    except:
        converted_values += [x]

df_chromecast["date_hour"] = converted_values

df_smart_tv, df_chromecast

(         date_hour  bytes_up  bytes_down
 0               15  5.123633    6.449962
 1               15  5.063598    6.354955
 2               15  5.057019    6.363475
 3               15  4.987535    6.302449
 4               15  4.597361    5.906413
 ...            ...       ...         ...
 4417898         23  0.903090    0.903090
 4417899         23  0.903090    0.903090
 4417900         23  0.903090    0.903090
 4417901         23  0.903090    0.903090
 4417902         23  0.903090    1.505150
 
 [4417903 rows x 3 columns],
          date_hour  bytes_up  bytes_down
 0                0  3.475238    4.691839
 1                0  2.836283    2.516215
 2                0  3.652624    4.578800
 3                0  2.889936    2.360215
 4                0  3.488736    4.713127
 ...            ...       ...         ...
 1620524         16  3.306211    4.637810
 1620525         16  3.850524    3.201943
 1620526         16  2.688420    4.695762
 1620527         16  0.000000    2.133539
 16

In [30]:
from scipy.stats import chi2_contingency

data_set_1 = df_smart_tv.groupby("date_hour")["bytes_up"].median().argmax()
data_set_2 = df_smart_tv.groupby("date_hour")["bytes_up"].mean().argmax()
data_set_3 = df_smart_tv.groupby("date_hour")["bytes_down"].median().argmax()
data_set_4 = df_smart_tv.groupby("date_hour")["bytes_down"].mean().argmax()
data_set_5 = df_chromecast.groupby("date_hour")["bytes_up"].median().argmax()
data_set_6 = df_chromecast.groupby("date_hour")["bytes_up"].mean().argmax()
data_set_7 = df_chromecast.groupby("date_hour")["bytes_down"].median().argmax()
data_set_8 = df_chromecast.groupby("date_hour")["bytes_down"].mean().argmax()

ds_1 = df_smart_tv.query('date_hour == @data_set_1')['bytes_up']
ds_2 = df_smart_tv.query('date_hour == @data_set_2')['bytes_up']
ds_3 = df_smart_tv.query('date_hour == @data_set_3')['bytes_down']
ds_4 = df_smart_tv.query('date_hour == @data_set_4')['bytes_down']
ds_5 = df_chromecast.query('date_hour == @data_set_5')['bytes_up']
ds_6 = df_chromecast.query('date_hour == @data_set_6')['bytes_up']
ds_7 = df_chromecast.query('date_hour == @data_set_7')['bytes_down']
ds_8 = df_chromecast.query('date_hour == @data_set_8')['bytes_down']


obs_1 = pd.cut(ds_1, bins=sturges(ds_1), include_lowest=True)
obs_3 = pd.cut(ds_3, bins=sturges(ds_1), include_lowest=True)

obs_1 = obs_1.value_counts().sort_index()/obs_1.value_counts().sort_index().sum()
obs_3 = obs_3.value_counts().sort_index()/obs_3.value_counts().sort_index().sum()

g1, p, dof, expctd = chi2_contingency([obs_1, obs_3], lambda_="log-likelihood")

obs_2 = pd.cut(ds_2, bins=sturges(ds_2), include_lowest=True)
obs_4 = pd.cut(ds_4, bins=sturges(ds_2), include_lowest=True)

obs_2 = obs_2.value_counts().sort_index()/obs_2.value_counts().sort_index().sum()
obs_4 = obs_4.value_counts().sort_index()/obs_4.value_counts().sort_index().sum()

g2, p, dof, expctd = chi2_contingency([obs_2, obs_4], lambda_="log-likelihood")

obs_5 = pd.cut(ds_5, bins=sturges(ds_5), include_lowest=True)
obs_7 = pd.cut(ds_7, bins=sturges(ds_5), include_lowest=True)

obs_5 = obs_5.value_counts().sort_index()/obs_5.value_counts().sort_index().sum()
obs_7 = obs_7.value_counts().sort_index()/obs_7.value_counts().sort_index().sum()

g3, p, dof, expctd = chi2_contingency([obs_5, obs_7], lambda_="log-likelihood")

obs_6 = pd.cut(ds_6, bins=sturges(ds_6), include_lowest=True)
obs_8 = pd.cut(ds_6, bins=sturges(ds_6), include_lowest=True)

obs_6 = obs_6.value_counts().sort_index()/obs_6.value_counts().sort_index().sum()
obs_8 = obs_8.value_counts().sort_index()/obs_8.value_counts().sort_index().sum()

g4, p, dof, expctd = chi2_contingency([obs_6, obs_8], lambda_="log-likelihood")

g1, g2, g3, g4

(0.9217860533606511,
 0.9217860533606511,
 0.788016532974326,
 -8.641330879619121e-16)