Neste notebook:
* Avaliação de usuários por bucket  
    * em média, e mediana, em quantos buckets os utilizadores estão.  
    * quantos usuários novos estão em cada bucket e quantos não ocorrem

Objetivo:
* explorar características dos datasets que permitam
    * avaliar se o dataset é útil para a análise  
    * escolher o número de buckets  

Vamos definir o número de itens em cada bucket como 50k

In [None]:
import os
import sys
sys.path.append(os.path.abspath('') + '/..')

In [None]:
from data import ImplicitData
from recommenders_implicit import ISGD  # ISGD framework, BISGD,
from eval_implicit import EvaluateAndStore # EvaluateAndStore para guardar estados do modelo e holdouts, a avaliação prequencial de ratings implicitos é opcional, , EvalHoldout
from dataset_evaluation_utils import * #get_interactions_info, plot_interactions_per_month, plot_user_presence_distribution, plot_n_users_per_bucket, plot_bucket_size

from datetime import datetime, timedelta

import joblib
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

___
# Amazon Digital Music
Small subset, ratings only.  
https://nijianmo.github.io/amazon/index.html


## import dataset

In [None]:
# The header-less "ratings only" CSVs of the Amazon review data (2018) store each
# row as (item, user, rating, timestamp): the item id (ASIN) is the FIRST column
# and the reviewer id the second, so the names must be given in that order.
dig_music = pd.read_csv("../datasets/Digital_Music.csv", names=['item_id', 'user_id', 'rating', 'timestamp'])
dataset_name = 'Amazon_Digital_Music'
# name of the column that identifies a user, passed on to the analysis helpers
user_col = 'user_id'

dig_music.shape

In [None]:
dig_music.head()

## keeping only users that have rated at least 5 times

In [None]:
dig_music['user_id'].value_counts() >= 5

In [None]:
# Index of the user ids that appear in at least 5 rows of `dig_music`.
k_core_users = (
    dig_music['user_id']
    .value_counts()
    .loc[lambda counts: counts >= 5]
    .index
)
k_core_users

In [None]:
# Select the rows of the retained users through a temporary `user_id` index,
# then turn `user_id` back into a regular column.
data = (
    dig_music
    .set_index('user_id')
    .loc[k_core_users]
    .reset_index()
)
data.shape

## Keep only high ratings (the quantile 0.8 -> 5.0 ratings)

In [None]:
# Histogram of the rating values of the retained users.
data['rating'].plot.hist(title='Rates frequency');

In [None]:
# 0.8 quantile of the rating column; used as the cut-off for positive feedback.
pos_ratings = np.quantile(data['rating'].to_numpy(), 0.8)
pos_ratings

In [None]:
# Keep every interaction rated at or above the positive threshold and drop the
# rating column (the data becomes implicit feedback).
# `>=` instead of `==`: if the quantile is not the maximum rating, `==` would
# discard the ratings above it. When the quantile is the maximum (5.0) both
# comparisons select the same rows.
data = data[ data['rating'] >= pos_ratings ].drop(columns='rating')
data.head()

In [None]:
# number of remaining cases
data.shape[0]

## Convert timestamp

In [None]:
%%time
# Turn the unix timestamps into datetimes and order the interactions by them.
# NOTE: datetime.fromtimestamp() without a tz argument yields the machine's local time.
data['date'] = data['timestamp'].apply(datetime.fromtimestamp)
data = data.sort_values('date')

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
%%time
# keep year-month: truncate every date to the first day of its month.
# The vectorised `.dt` accessor gives the same values as formatting each row to
# 'YYYY-M' and parsing it back, without the per-row Python round trip.
data['date'] = data['date'].dt.to_period('M').dt.to_timestamp()

In [None]:
data.head()

---------------------------------------------------------------------------------------------
### Data.head() is showing different results from Klismam's...

In [None]:
data[data.timestamp==877305600]

-- example ---------------------------

In [None]:
import pytz
from datetime import datetime

In [None]:
pytz.all_timezones_set.intersection(['GMT'])

In [None]:
ts = 877305600

In [None]:
datetime.fromtimestamp(ts)

In [None]:
# Naive datetime expressed in UTC, independent of the machine's local timezone.
# NOTE: datetime.utcfromtimestamp() is deprecated since Python 3.12 in favour of
# datetime.fromtimestamp(ts, tz=...) with an explicit UTC timezone.
utc_dt = datetime.utcfromtimestamp(ts)
utc_dt

In [None]:
# Attach the UTC timezone to the naive value, then express it in the 'GMT' zone.
aware_utc_dt = utc_dt.replace(tzinfo=pytz.utc)

tz = pytz.timezone('GMT')
dt = aware_utc_dt.astimezone(tz)
dt

In [None]:
# Same conversion in one step: build the timezone-aware datetime directly in `tz`.
dt = datetime.fromtimestamp(ts, tz)
dt

-- end example ---------------------------

In [None]:
# Convert the unix timestamps (seconds) to naive UTC datetimes in one vectorised
# call. This replaces a per-row datetime.utcfromtimestamp(), which is deprecated
# since Python 3.12, and yields the same naive-UTC values.
data['date_utc'] = pd.to_datetime(data['timestamp'], unit='s')
data = data.sort_values(by='date_utc') # sort by date

In [None]:
data.head()

In [None]:
data.tail()

".fromtimestamp() might fail for past dates if a local timezone had different utc offset. You need a historic timezone database such as provided by pytz module (or your OS). Or just work in UTC and use .utcfromtimestamp()" - (https://stackoverflow.com/questions/3682748/converting-unix-timestamp-string-to-readable-date)

In [None]:
# Discard the exploratory date columns and rebuild `date` from the raw timestamps in UTC.
# errors='ignore' lets the cell be re-run after the columns are already gone.
data = data.drop(columns=['date', 'date_utc'], errors='ignore')
# Vectorised conversion of unix seconds to naive UTC datetimes; same values as a
# per-row datetime.utcfromtimestamp(), which is deprecated since Python 3.12.
data['date'] = pd.to_datetime(data['timestamp'], unit='s')
data = data.sort_values(by='date') # sort by date
data.head()

In [None]:
%%time
# keep year-month: truncate every date to the first day of its month.
# The vectorised `.dt` accessor gives the same values as formatting each row to
# 'YYYY-M' and parsing it back, without the per-row Python round trip.
data['date'] = data['date'].dt.to_period('M').dt.to_timestamp()

In [None]:
data.head()

Now it's SOLVED.  
Because the day is dropped from `date` (only year-month is kept), the UTC offset seemed not to be an issue; with the explicit UTC conversion this is now certain.
___

## Evaluating Time defined Buckets 

In [None]:
%%time
# Rows per `date` value: non-null count of the first remaining column,
# exposed as a two-column frame (`date`, `count`).
interactions_per_month = (
    data.groupby(by=['date'])
        .count()
        .iloc[:, 0]
        .rename('count')
        .reset_index()
)

### plot interactions_per_month

In [None]:
# Aggregate the monthly counts per calendar year and plot them as bars.
# A descriptive name is used for the intermediate frame instead of `_`, which
# IPython reserves for the last displayed output.
interactions_per_year = (
    interactions_per_month
    .assign(date=interactions_per_month['date'].dt.year)  # replace each month by its year
    .groupby('date')
    .sum()
)
interactions_per_year.plot(kind='bar',
                           title='interactions_per_month').set_xlabel('year');

### sample_year_month

In [None]:
# Sampling window as (date string, strptime format) pairs:
# element 0 is the inclusive start, element 1 the exclusive end.
sample_year_month = [('2014', '%Y'), ('2014-10', '%Y-%m')]

In [None]:
# Boolean mask of the rows with sample_year_month[0] <= date < sample_year_month[1]
y_filter = (datetime.strptime(*sample_year_month[0]) <= data['date']) & (data['date'] < datetime.strptime(*sample_year_month[1]))

# Alternative window: 2014 <= date < 2014-04
# y_filter = (data['date'] < datetime.strptime('2014-04', '%Y-%m')) & (datetime.strptime('2014', '%Y') <= data['date']) 

# Alternative window: 2014 <= date < 2015
# y_filter = (data['date'] < datetime.strptime('2015', '%Y')) & (datetime.strptime('2014', '%Y') <= data['date']) 

### plot sample_year_month 

In [None]:
# Number of interactions inside the sampling window, then the window's plot.
print(len(data[y_filter]))
plot_interactions_per_month(
    data=data[y_filter].reset_index(),
    dataset_name=dataset_name,
)

In [None]:
# Restrict the working dataset to the rows selected by `y_filter`.
data = data[y_filter]

In [None]:
# number of remaining cases
data.shape[0]

## Sampling the dataset - only most frequent users (overall)

In [None]:
data.groupby(by=['user_id'])['date'].nunique()

In [None]:
%%time
# Number of distinct `date` values in which each user has at least one interaction.
user_freq_over_months = data.groupby('user_id')['date'].nunique()

### plot user_freq_over_months

In [None]:
# Histogram of the per-user counts; the number of bins equals the largest count.
user_freq_over_months.plot.hist(
    bins=user_freq_over_months.max(),
    title='user_freq_over_months',
).set_xlabel('freq_over_months');

In [None]:
# Copy of `data` indexed by `user_id`, so that `.loc[<list of users>]` selects their rows.
# NOTE(review): `_` is also IPython's "last output" variable; a descriptive name would be
# safer, but later cells read `_`, so the name is kept here.
_ = data.set_index('user_id').copy()

In [None]:
# Box plot of the number of interactions per user.
# NOTE(review): the title mentions "users that occur every month", but no such
# filter is applied in this cell — confirm the intended label.
_.reset_index()['user_id'].value_counts().plot(kind='box',
                                               title='frequency of users that occur every month');

In [None]:
data['user_id'].value_counts()

In [None]:
# list of users, ordered from most to least frequent.
# The ids are read straight from the index of value_counts(): the column that
# reset_index() creates is called 'index' only in pandas < 2.0, so selecting
# ['index'] raises KeyError on newer versions.
u = data['user_id'].value_counts().index.to_list()

In [None]:
print(data['user_id'].nunique())
print(len(u) == data['user_id'].nunique())

In [None]:
data['user_id'].shape[0]

In [None]:
# Share of all rows contributed by each user, in value_counts() order
# (most frequent first); used below as the sampling weights.
p = (data['user_id'].value_counts()/data['user_id'].shape[0])
p

In [None]:
# sample size of 75% of the users
prop = 0.75
# number of users to draw, rounded to an integer
s = round( len(u)*prop )
s

### sampled_users

In [None]:
# Seed NumPy's global RNG so the draw is reproducible.
np.random.seed(42)
# Draw `s` distinct users (no replacement), weighting each user in `u` by the matching entry of `p`.
sampled_users = list( np.random.choice(u, size=s, replace=False, p=p) )

In [None]:
_.loc[sampled_users].shape

In [None]:
_.loc[sampled_users]['item_id'].nunique()

In [None]:
# Box plot of the number of interactions per sampled user.
(
    _.loc[sampled_users]
     .reset_index()['user_id']
     .value_counts()
     .plot(kind='box', title='frequency of *sampled* users')
);

In [None]:
%%time
# Number of distinct `date` values per sampled user, followed by its histogram
# (the number of bins equals the largest count).
user_freq_over_months_sample = (
    _.loc[sampled_users]
     .reset_index()
     .groupby('user_id')['date']
     .nunique()
)
user_freq_over_months_sample.plot.hist(
    bins=user_freq_over_months_sample.max(),
    title='user_freq_over_months_sample',
).set_xlabel('freq_over_months of *sampled* users');


In [None]:
# original number of users, original number of items
data['user_id'].nunique(), data['item_id'].nunique() 

In [None]:
# Distinct users and distinct items among the rows of the sampled users.
_.loc[sampled_users].reset_index()['user_id'].nunique(), _.loc[sampled_users].reset_index()['item_id'].nunique() # sample number of users, and of items

In [None]:
# Plot the interactions of the sampled users with the project helper.
plot_interactions_per_month(data=_.loc[sampled_users].reset_index(), dataset_name=dataset_name)

In [None]:
# When True, `data` is replaced by the interactions of the sampled users only;
# when False, the whole time window is kept.
USE_THRESHOLD = True
# Notes from earlier runs (value observed in the first month per sampling proportion):
# all users 0.002736
# USING prop = 0.75 increased performance by a bit in the first month, 0.004292 (0.002878) ?
# USING prop = 0.5 reduced performance in the first month  	0.001647
# USING prop = 0.25 reduced performance in the first month 0.002574

if USE_THRESHOLD:
    data = _.loc[sampled_users].reset_index()
    # deterministic order: chronological, ties broken by user and then item
    data = data.sort_values(by=['timestamp', 'user_id', 'item_id'])
# number of interactions that will be written out (same print in both cases)
print(data.shape[0])


# new sample: the file name carries the sampling window
sample_str = str(sample_year_month[0][0])+'_until_'+str(sample_year_month[1][0])
output_dir = 'output/amazon_digmusic_dump'
# DataFrame.to_csv does not create missing directories, so make sure it exists first
os.makedirs(output_dir, exist_ok=True)
data.to_csv(output_dir+'/sampled_amazon_digmusic'+sample_str+'.csv', index=False)

How many interactions occur in each month / year?

In [None]:
# Rows per `date` value of the final dataset: non-null count of the first
# remaining column, exposed as a two-column frame (`date`, `count`).
interactions_per_month = (
    data.groupby(by=['date'])
        .count()
        .iloc[:, 0]
        .rename('count')
        .reset_index()
)

In [None]:
%%time
plot_interactions_per_month(data=data, dataset_name=dataset_name)

What is the user presence distribution based on month, quarter, and semester?

In [None]:
%%time
# 3min 16s (wall time noted for an earlier run of this cell)
# Both results come from the project helper in dataset_evaluation_utils.
user_presence_df, user_month_interactions = get_interactions_info(data=data, user_col=user_col)

# new sample: persist both results so later steps can reload them
user_presence_df.to_csv('output/amazon_digmusic_dump/sample_user_presence_df.csv', index=False)
user_month_interactions.to_csv('output/amazon_digmusic_dump/sample_user_month_interactions.csv', index=False)

In [None]:
user_presence_df.head()

In [None]:
# Spot-check the presence record of a single id.
# NOTE(review): 'B00HRN0BZ2' has the shape of an Amazon ASIN (item id) rather than a
# reviewer id — confirm that the column names given when loading the CSV are in the right order.
user_presence_df[user_presence_df.UserID=='B00HRN0BZ2']

In [None]:
# plotting user frequency distribution
plot_user_presence_distribution(user_presence_df, dataset_name=dataset_name)

How many examples are there per quarter and semester?

In [None]:
# plot_interactions_per_qns(user_interactions=user_trimestre_interactions, date_range=trimestres, dataset_name=dataset_name, type_of_range='quarter')

In [None]:
# plot_interactions_per_qns(user_interactions=user_semestre_interactions, date_range=semestres, dataset_name=dataset_name, type_of_range='semester')

How many FREQUENT users (occur in 80% or more intervals) exist?

In [None]:
# defining threshold handed to get_frequent_users
# NOTE(review): the accompanying text says "80% or more intervals" while the value is 1 —
# confirm how get_frequent_users interprets this number.
frequency_threshold = 1

In [None]:
# Only the monthly result is requested; the quarter/semester variants are disabled.
frequent_users_month = get_frequent_users(user_presence_df, frequency_threshold=frequency_threshold)

In [None]:
# old huge sample (kept for reference, writes to a different dump folder)
# joblib.dump(frequent_users_month, 'output/amazonbooks_dump/frequent_users_month.joblib')

# new sample: persist the frequent-user result for the sampled dataset
joblib.dump(frequent_users_month, 'output/amazon_digmusic_dump/sample_frequent_users_month.joblib')