In [1]:
import pandas as pd
import numpy as np
import datetime as dt

## Carregando a Base

In [2]:
events = pd.read_csv('Data\Raw\events.csv')

events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


## Dummies eventos

In [3]:
events = pd.get_dummies(events)
events.rename(columns = {'visitorid':'userid', 
                         'event_addtocart':'cart', 
                         'event_transaction': 'sale', 
                         'event_view': 'view'}, 
              inplace = True)

events = events.sort_values(by='timestamp')
events = events.reset_index()
events.drop(columns=['transactionid', 'index'], axis='columns', inplace=True)

events.head()

Unnamed: 0,timestamp,userid,itemid,cart,sale,view
0,1430622004384,693516,297662,1,0,0
1,1430622011289,829044,60987,0,0,1
2,1430622013048,652699,252860,0,0,1
3,1430622024154,1125936,33661,0,0,1
4,1430622026228,693516,297662,0,0,1


In [4]:
print(events.shape)

(2756101, 6)


In [5]:
events.to_csv('Data/events.csv', index=False)

## Divisão da Base em Treinamento e Teste

In [6]:
ts = list(pd.to_datetime(events['timestamp'], unit='ms'))

print('Data Primeiro Evento: \n' + str(ts[0]))
print('\nData Último Evento: \n' + str(ts[-1]))

Data Primeiro Evento: 
2015-05-03 03:00:04.384000

Data Último Evento: 
2015-09-18 02:59:47.788000


Para a divisão das bases foi determinado que os eventos registrados no último mês, entre os dias 18/08/2015 e 18/09/2015, será destinado a testes, sendo os demais para treinamento.

In [7]:
limite = dt.datetime.strptime('2015-08-18 00:00:00.000000', '%Y-%m-%d %H:%M:%S.%f').timestamp() * 1000

df_train = events[events['timestamp'] < limite]
df_test = events[events['timestamp'] >= limite]

print(df_train.shape)
print(df_test.shape)

(2203328, 6)
(552773, 6)


## Cálculo Rating Treinamento

In [8]:
train_gb = df_train.groupby(['userid','itemid'], as_index=False).sum()
train_rating_binario = []
train_rating_pesos = []
train_rating_categorias = []

for i, row in train_gb.iterrows():
    b = 0
    p = 0
    c = 0
    
    if row['sale'] > 0:
        b = 1
        p = 1
        c = 3        
    elif row['cart'] > 0:
        p = 0.9
        c = 2
    elif row['view'] > 0:
        b = 0
        p = 0.75
        c = 1
        
    train_rating_binario.append(b)
    train_rating_pesos.append(p)
    train_rating_categorias.append(c)

In [9]:
train_binario = train_gb.copy()
train_binario['rating'] = train_rating_binario
train_binario.drop(['timestamp'], axis = 'columns', inplace = True)
train_binario.to_csv('Data/train_binario.csv', index=False)

In [10]:
train_pesos = train_gb.copy()
train_pesos['rating'] = train_rating_pesos
train_pesos.drop(['timestamp'], axis = 'columns', inplace = True)
train_pesos.to_csv('Data/train_pesos.csv', index=False)

In [11]:
train_categorias = train_gb.copy()
train_categorias['rating'] = train_rating_categorias
train_categorias.drop(['timestamp'], axis = 'columns', inplace = True)
train_categorias.to_csv('Data/train_categorias.csv', index=False)

## Cálculo Rating Teste

In [12]:
test_gb = df_test.groupby(['userid','itemid'], as_index=False).sum()
test_rating_binario = []
test_rating_pesos = []
test_rating_categorias = []

for i, row in test_gb.iterrows():
    b = 0
    p = 0
    c = 0
    
    if row['sale'] > 0:
        b = 1
        p = 1
        c = 3        
    elif row['cart'] > 0:
        p = 0.9
        c = 2
    elif row['view'] > 0:
        b = 0
        p = 0.75
        c = 1
        
    test_rating_binario.append(b)
    test_rating_pesos.append(p)
    test_rating_categorias.append(c)

In [13]:
test_binario = test_gb.copy()
test_binario['rating'] = test_rating_binario
test_binario.drop(['timestamp'], axis = 'columns', inplace = True)
test_binario.to_csv('Data/test_binario.csv', index=False)

In [14]:
test_pesos = test_gb.copy()
test_pesos['rating'] = test_rating_pesos
test_pesos.drop(['timestamp'], axis = 'columns', inplace = True)
test_pesos.to_csv('Data/test_pesos.csv', index=False)

In [15]:
test_categorias = test_gb.copy()
test_categorias['rating'] = test_rating_categorias
test_categorias.drop(['timestamp'], axis = 'columns', inplace = True)
test_categorias.to_csv('Data/test_categorias.csv', index=False)