In [5]:
import os
import gc
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
%matplotlib inline

In [24]:
end_date = '2019-05-21'
num_days = 11
PATH = '/home/phongdk/data_user_income_targeting/data/'
filename = "daily_histogram.gz"

In [8]:
def load_data(path, filename, nrows=None):
    # be carefull with data, since it could have duplicate user_id in different day
    filepath = os.path.join(path, filename)
    print("Load data from file : {}".format(filepath))

    try:
        df = pd.read_csv(filepath, dtype={'user_id': str}, nrows=nrows)
    except:
        print("---------- Cannot load data from file : {} ---------------".format(filepath))
        df = []
    return df

In [10]:
def to_hour_distribution_df(dfs):
    dfs_by_uid = dfs.groupby(['user_id', 'hour']).sum()#.compute()
    dfs_unstack = dfs_by_uid.unstack(fill_value=0)
    dfs_results = dfs_unstack.reset_index()
    hour_columns = ['h_{}'.format(i) for i in range(24)] #list(np.arange(0,24))
    dfs_results.columns = ['user_id'] + hour_columns
    dfs_results[hour_columns] = dfs_results[hour_columns].div(dfs_results[hour_columns].sum(axis=1),
                                                              axis=0).astype(np.float16)
    dfs_results.set_index("user_id", inplace=True)
    return dfs_results

In [25]:
%%time
dfs = []
for day in range(num_days):
    date = (datetime.strptime(end_date, "%Y-%m-%d") - timedelta(days=day)).strftime("%Y-%m-%d")
    dfs.append(load_data(os.path.join(PATH, date), filename))
df = pd.concat(dfs)
df = to_hour_distribution_df(df)    

Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-21/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-20/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-19/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-18/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-17/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-16/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-15/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-14/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-13/daily_histogram.gz
Load data from file : /home/phongdk/data_user_income_targeting/data/2019-05-12/daily_histogram.gz
Load data from file 

In [45]:
WORK_STATION_THRESHOLD = 0.96
working_hours = ['h_{}'.format(i) for i in range(8, 20)]  # from 8am to 7pm
print(working_hours)
df['working_proportion'] = df[working_hours].sum(axis=1)
df['work_station'] = (df['working_proportion'] > WORK_STATION_THRESHOLD).astype(np.int8)

['h_8', 'h_9', 'h_10', 'h_11', 'h_12', 'h_13', 'h_14', 'h_15', 'h_16', 'h_17', 'h_18', 'h_19']


In [46]:
print(df['work_station'].sum() * 100 / len(df))

28.874645587856147


In [49]:
np.percentile(df['working_proportion'], 70)

0.96142578125