In [85]:
import logging
import os
import pickle

from dotenv import load_dotenv
import pandas as pd
from sqlalchemy import create_engine

from recommender.read_db import ReadDatabase
from recommender.models import CollaborativeFiltering
from recommender.preprocess import SetUpDataframes
from recommender.datasets import Datasets

In [86]:
# Root logger emits warnings and above only.
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

# Load settings from the local .env file before reading the environment.
load_dotenv('.env')

# Database connection string; None when DB_URI is not defined.
DB_URI = os.getenv("DB_URI")

In [87]:
# set up database reader
# The engine is created from DB_URI (read from the environment above).
# 'CM_BD' is the second argument to ReadDatabase — presumably the name of the
# database to read from (the reader's log lines mention "database CM_BD");
# confirm against recommender.read_db.
engine = create_engine(DB_URI)
reader_cm = ReadDatabase(engine, 'CM_BD')

In [88]:
def refresh_data(filename):
    """Pull fresh training data from the database and cache it on disk.

    Reads the question, taxonomy, post, interaction and response tables
    through the module-level ``reader_cm``, cleans the interaction and
    response frames, serialises every frame to JSON and pickles the resulting
    dict to ``'{filename}.pkl'``.

    Args:
        filename: Path of the output file, without the ``.pkl`` extension.

    Returns:
        None. The data is only written to disk.
    """
    question_df = reader_cm.get_data(
        'id, post_id',
        'posts_question',
        None)

    taxonomy_df = reader_cm.get_data(
        'post_id, area_id',
        'posts_taxonomy',
        None)

    content_df = reader_cm.get_data(
        'id, min_range, max_range',
        'posts_post',
        "status IN ('published')")

    interaction_df = reader_cm.get_data(
        'user_id, post_id',
        'posts_interaction',
        "type IN ('sended', 'sent', 'dispatched')")
    # Drop interactions without a post and cast the id in one chained step.
    # Assigning the cast column into a boolean-filtered slice (as was done
    # before) is the pattern that triggers pandas' SettingWithCopyWarning.
    interaction_df = (
        interaction_df
        .dropna(subset=['post_id'])
        .astype({'post_id': 'int32'})
    )

    # NOTE(review): this call passes one more positional argument (a trailing
    # None) than the other get_data calls — confirm against the signature of
    # ReadDatabase.get_data.
    response_df = reader_cm.get_data(
        'user_id, response, question_id',
        'posts_response',
        "created_at >= '2019-09-20'",
        None)
    # Keep only all-digit responses other than '0'. The isinstance guard makes
    # NULL / non-string responses count as "not numeric" instead of raising
    # AttributeError on .isdigit().
    is_numeric = response_df['response'].apply(
        lambda value: isinstance(value, str) and value.isdigit()
    ).astype(bool)
    response_df = response_df[is_numeric & (response_df['response'] != '0')]
    response_df = response_df.drop_duplicates().reset_index(drop=True)

    # Every frame is stored as a JSON string so the pickle holds plain strings
    # only; consumers rebuild the frames with pd.read_json.
    fresh_data = {
        'question_df': question_df.to_json(),
        'taxonomy_df': taxonomy_df.to_json(),
        'content_df': content_df.to_json(),
        'interaction_df': interaction_df.to_json(),
        'response_df': response_df.to_json()
    }

    with open(f'{filename}.pkl', 'wb') as f:
        pickle.dump(fresh_data, f)

In [89]:
def recommend(user_id, months, data_required):
    """Return the saved model's recommendation for one user.

    A new ``CollaborativeFiltering`` instance is created on every call and
    populated via ``load_model('afinidata_recommender_model_specs')`` before
    ``afinidata_recommend`` is invoked.

    Args:
        user_id: Forwarded unchanged as ``user_id``.
        months: Forwarded unchanged as ``months``.
        data_required: Forwarded unchanged as ``data_required``.

    Returns:
        Whatever ``afinidata_recommend`` returns; callers in this notebook
        pass it straight to ``pd.read_json``.
    """
    recommender = CollaborativeFiltering()
    recommender.load_model('afinidata_recommender_model_specs')

    recommendation = recommender.afinidata_recommend(
        user_id=user_id,
        months=months,
        data_required=data_required,
    )
    return recommendation

In [90]:
def register_recommendation_response(user_id, response):
    """Record a user's answer to their first recommended question.

    Asks ``recommend`` for the user's recommendations (with ``months=0``),
    takes the first row, prints it, and appends one response row and one
    interaction row to the JSON-encoded frames held in the module-level
    ``fresh_data`` dict. Only the in-memory dict is updated; nothing is
    written to disk here.

    Args:
        user_id: User whose recommendation is being answered.
        response: Value stored in the ``response`` column of the new row.

    Returns:
        None.
    """
    recommendation_df = pd.read_json(recommend(user_id, 0, fresh_data))
    # Fetch the first recommendation once and reuse it.
    top_row = recommendation_df.iloc[0]
    post_id = top_row['post_id']
    question_id = top_row['question_id']
    print(top_row)

    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True produces the same frame on old and new versions.
    new_response = pd.DataFrame(
        [[user_id, response, question_id]],
        columns=['user_id', 'response', 'question_id'])
    response_df = pd.concat(
        [pd.read_json(fresh_data['response_df']), new_response],
        ignore_index=True)
    fresh_data['response_df'] = response_df.to_json()

    new_interaction = pd.DataFrame(
        [[user_id, post_id]],
        columns=['user_id', 'post_id'])
    interaction_df = pd.concat(
        [pd.read_json(fresh_data['interaction_df']), new_interaction],
        ignore_index=True)
    fresh_data['interaction_df'] = interaction_df.to_json()

In [91]:
# Query the database and (re)write afinidata_fresh_data.pkl in the working
# directory; the next cell loads that file.
refresh_data('afinidata_fresh_data')

----------------------------------------------------------------------
reading columns id, post_id from table posts_question from database CM_BD
----------------------------------------------------------------------
reading columns post_id, area_id from table posts_taxonomy from database CM_BD
----------------------------------------------------------------------
reading columns id, min_range, max_range from table posts_post from database CM_BD
----------------------------------------------------------------------
reading columns user_id, post_id from table posts_interaction from database CM_BD
----------------------------------------------------------------------
reading columns user_id, response, question_id from table posts_response from database CM_BD


In [92]:
# Reload the snapshot written by refresh_data: a dict of JSON-encoded frames.
# NOTE: pickle.load can execute arbitrary code — only load snapshots that were
# produced locally by this notebook.
with open('afinidata_fresh_data.pkl', mode='rb') as snapshot:
    fresh_data = pickle.load(snapshot)

In [93]:
# Rebuild the responses frame from the cached JSON snapshot. refresh_data has
# already restricted it to all-digit responses other than '0', so no further
# filtering happens here.
response_df = pd.read_json(fresh_data['response_df'])

print(80 * '*')
print(f'total number of responses in response_df: {response_df.shape[0]}')

# Pivot the responses into the training matrix: one row per question_id and
# one column per user_id (see the display in the next cell).
response_matrix = SetUpDataframes.response_matrix(response_df)

********************************************************************************
total number of responses in response_df: 6547


In [94]:
# Display the response matrix (question_id rows x user_id columns); the
# rendered view is truncated by pandas and is mostly NaN.
response_matrix

user_id,5,43,50,56,62,141,152,162,177,255,...,40611,40616,40628,40640,40658,40698,40704,40705,40717,40776
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,,,,,,,,,,,...,,,,,,,,,,
446,,,,,,,,,,,...,,,,,,,,,,
447,,,,,,,,,,,...,,,,,,,,,,
448,,,,,,,,,,,...,,,,,,,,,,


In [95]:
# Raw response column for user 40776, the same user queried in the
# recommendation cells further down.
response_matrix.loc[:,40776].values

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, na

In [52]:
# Training hyperparameters consumed by the model.train call in the next cell.
epochs = 12_000  # number of training epochs
lr = 1e-5        # learning rate
alpha = 0.0      # passed to train as `alpha` (logged there as "regularization")
depth = 2        # passed to train as `n_features` (logged as "latent features")

In [53]:
# Split the response matrix into train and test sets (0.1 is the argument
# handed to train_test_split).
datasets = Datasets(response_matrix)
train_set, test_set = datasets.train_test_split(0.1)

# Fresh model, told which users (columns) and items (rows) the matrix covers.
model = CollaborativeFiltering()
model.actors = dict(
    users=response_matrix.columns.values,
    items=response_matrix.index.values,
)
model.n_items, model.n_users = len(datasets.posts), len(datasets.users)

# Train from scratch (resume=False) with the hyperparameters set above.
model.train(
    train_matrix=train_set, test_matrix=test_set,
    epochs=epochs, alpha=alpha, n_features=depth, lr=lr,
    resume=False,
)

# Persist the trained model under the name that recommend() loads.
print(80 * '*')
model.save_model('afinidata_recommender_model_specs')
print('model has been saved to afinidata_recommender_model_specs.pkl in the local directory')

********************************************************************************
training recommendation model for 12000 epochs with learning rate 1e-05 and 
hyperparameters regularization: 0.0 / latent features: 2
********************************************************************************
Epoch 00001 / train loss 4.707532 / test loss 5.018488
Epoch 00101 / train loss 0.624641 / test loss 0.618706
Epoch 00201 / train loss 0.614707 / test loss 0.610152
Epoch 00301 / train loss 0.605526 / test loss 0.603035
Epoch 00401 / train loss 0.597007 / test loss 0.596514
Epoch 00501 / train loss 0.589088 / test loss 0.590530
Epoch 00601 / train loss 0.581713 / test loss 0.585033
Epoch 00701 / train loss 0.574833 / test loss 0.579977
Epoch 00801 / train loss 0.568402 / test loss 0.575322
Epoch 00901 / train loss 0.562380 / test loss 0.571030
Epoch 01001 / train loss 0.556732 / test loss 0.567069
Epoch 01101 / train loss 0.551423 / test loss 0.563409
Epoch 01201 / train loss 0.546425 / test los

In [270]:
# Show the recommendations for user 40776 with months=0, parsed from the
# value returned by recommend().
pd.read_json(recommend(40776, 0, fresh_data))

         predictions  response     score  normalized  probabilities
area_id                                                            
cogni       2.989733  1.593750  1.593750   -1.117530       0.721700
motor       3.088059  3.294118  3.294118    0.805917       0.105442
socio       3.076035  2.857143  2.857143    0.311613       0.172857


Unnamed: 0,predictions,question_id,post_id,area_id,response
135,3.629393,266,279,cogni,4.0
134,3.571731,265,278,cogni,4.0
157,3.488403,291,304,cogni,3.0
153,3.468605,287,300,cogni,3.0
152,3.433095,286,299,cogni,
156,3.418308,290,303,cogni,4.0
218,3.278352,366,380,cogni,
14,3.273778,30,15,cogni,4.0
184,3.252049,325,338,cogni,4.0
170,3.236485,309,322,cogni,1.0


In [269]:
# Record a response of 1 from user 40776 to their first recommendation; this
# updates the in-memory fresh_data dict only.
register_recommendation_response(40776, 1)

         predictions  response     score  normalized  probabilities
area_id                                                            
cogni       2.989733  1.612903  1.612903   -1.116614            1.0
predictions    2.22795
question_id        373
post_id            387
area_id          cogni
response           NaN
Name: 224, dtype: object
