In [56]:
import logging
import os
import pickle

from celery import Celery
from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine

from recommender.read_db import ReadDatabase
from recommender.models import CollaborativeFiltering
from recommender.preprocess import SetUpDataframes
from recommender.datasets import Datasets

In [57]:
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

load_dotenv('.env')

# environment variables
DB_URI = os.environ.get("DB_URI")

In [58]:
# set up database reader
engine = create_engine(DB_URI)
reader_cm = ReadDatabase(engine, 'CM_BD')

In [59]:
def refresh_data(filename):
    question_df = reader_cm.get_data(
        'id, post_id',
        'posts_question',
        None)

    taxonomy_df = reader_cm.get_data(
        'post_id, area_id',
        'posts_taxonomy',
        None)

    content_df = reader_cm.get_data(
        'id, min_range, max_range',
        'posts_post',
        None)

    interaction_df = reader_cm.get_data(
        'user_id, post_id',
        'posts_interaction',
        "type IN ('sended', 'sent')")
    interaction_df = interaction_df[~interaction_df['post_id'].isna()]
    interaction_df['post_id'] = interaction_df['post_id'].astype('int32')

    response_df = reader_cm.get_data(
        'user_id, response, question_id',
        'posts_response',
        "created_at >= '2019-09-20'",
        None)
    response_df = response_df[
        (response_df['response'].apply(lambda x: x.isdigit())) & (response_df['response'] != '0')]
    response_df = response_df.drop_duplicates().reset_index(drop=True)

    fresh_data = {
        'question_df': question_df.to_json(),
        'taxonomy_df': taxonomy_df.to_json(),
        'content_df': content_df.to_json(),
        'interaction_df': interaction_df.to_json(),
        'response_df': response_df.to_json()
    }

    with open(f'{filename}.pkl', 'wb') as f:
        pickle.dump(fresh_data, f)

In [60]:
def recommend(user_id, months, data_required):
    model = CollaborativeFiltering()

    model.load_model('afinidata_recommender_model_specs')

    return model.afinidata_recommend(user_id=user_id, months=months, data_required=data_required)

In [61]:
def register_recommendation_response(user_id, response):
    recommendation_df = pd.read_json(recommend(user_id, 10, fresh_data))
    post_id = recommendation_df.iloc[0]['post_id']
    question_id = recommendation_df.iloc[0]['question_id']
    print(recommendation_df.iloc[0])
    
    new_response = pd.DataFrame([[user_id, response, question_id]], columns=['user_id', 'response', 'question_id'])
    response_df = pd.read_json(fresh_data['response_df']).append(new_response, ignore_index=True)
    fresh_data['response_df'] = response_df.to_json()

    new_interaction = pd.DataFrame([[user_id, post_id]], columns=['user_id', 'post_id'])
    interaction_df = pd.read_json(fresh_data['interaction_df']).append(new_interaction, ignore_index=True)
    fresh_data['interaction_df'] = interaction_df.to_json()

In [62]:
refresh_data('afinidata_fresh_data')

----------------------------------------------------------------------
reading columns id, post_id from table posts_question from database CM_BD
----------------------------------------------------------------------
reading columns post_id, area_id from table posts_taxonomy from database CM_BD
----------------------------------------------------------------------
reading columns id, min_range, max_range from table posts_post from database CM_BD
----------------------------------------------------------------------
reading columns user_id, post_id from table posts_interaction from database CM_BD
----------------------------------------------------------------------
reading columns user_id, response, question_id from table posts_response from database CM_BD


In [63]:
with open(f'afinidata_fresh_data.pkl', 'rb') as f:
    fresh_data = pickle.load(f)

In [64]:
# extract data from posts_response into a pandas dataframe and
# slightly process only relevant data for training
# in this case, so far we are only considering data for which
# there is an alpha value in the 'response' column
response_df = pd.read_json(fresh_data['response_df'])

print('*' * 80)
print(f'total number of responses in response_df: {len(response_df)}')

# create matrix for training with items over rows and users over columns
# as a numpy matrix
response_matrix = SetUpDataframes.response_matrix(response_df)

********************************************************************************
total number of responses in response_df: 6536


In [65]:
response_matrix.loc[:,50].values

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [66]:
epochs=12000
lr=0.00001
alpha=0.
depth=2

In [None]:
# train test split
datasets = Datasets(response_matrix)
train_set, test_set = datasets.train_test_split(0.1)

# model initialization
model = CollaborativeFiltering()
model.actors = {
    'users': response_matrix.columns.values,
    'items': response_matrix.index.values
}
model.n_items = len(datasets.posts)
model.n_users = len(datasets.users)

model.train(
    train_matrix=train_set,
    test_matrix=test_set,
    epochs=epochs,
    alpha=alpha,
    n_features=depth,
    lr=lr,
    resume=False
)

print('*' * 80)
model.save_model(f'afinidata_recommender_model_specs')
print(f'model has been saved to afinidata_recommender_model_specs.pkl in the local directory')

********************************************************************************
training recommendation model for 12000 epochs with learning rate 1e-05 and 
hyperparameters regularization: 0.0 / latent features: 2
********************************************************************************
Epoch 00001 / train loss 4.733221 / test loss 4.759761
Epoch 00101 / train loss 0.628564 / test loss 0.583090
Epoch 00201 / train loss 0.618718 / test loss 0.574798
Epoch 00301 / train loss 0.609623 / test loss 0.567358
Epoch 00401 / train loss 0.601189 / test loss 0.560505
Epoch 00501 / train loss 0.593351 / test loss 0.554186
Epoch 00601 / train loss 0.586056 / test loss 0.548351
Epoch 00701 / train loss 0.579253 / test loss 0.542956
Epoch 00801 / train loss 0.572896 / test loss 0.537961
Epoch 00901 / train loss 0.566947 / test loss 0.533330
Epoch 01001 / train loss 0.561367 / test loss 0.529031
Epoch 01101 / train loss 0.556125 / test loss 0.525034
Epoch 01201 / train loss 0.551191 / test los

In [None]:
pd.read_json(recommend(50, 10, fresh_data))

In [46]:
register_recommendation_response(50, 10)

         predictions  response     score  normalized  probabilities
area_id                                                            
cogni       2.909244  7.100000  7.100000    1.141661       0.082128
motor       3.194701  6.500000  6.500000   -0.698629       0.517270
socio       3.431206  6.583333  6.583333   -0.443033       0.400602
predictions    3.61716
question_id        323
post_id            336
area_id          socio
response             1
Name: 182, dtype: object
