Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Schedule calculations and show artist count on user page. #244

Merged
merged 20 commits into from
Oct 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
d999917
Refactor scheduler code to remove unneeded code
paramsingh Jul 3, 2017
c3404f3
Add function for getting recently logged in users
paramsingh Aug 2, 2017
cade177
Add stats calculation variable to config.py.sample
paramsingh Aug 3, 2017
6a4ff2d
Write code for calculation of user stats
paramsingh Aug 3, 2017
fe850d3
Make last_updated columns in stats tables NOT NULL
paramsingh Aug 3, 2017
3b4606c
Comment to think about
paramsingh Aug 3, 2017
8b6f3ec
Add code for inserting calculated stats into db
paramsingh Aug 3, 2017
0a9a3f4
First cut of adding jobs to scheduler
paramsingh Aug 3, 2017
1252824
Add a script that can be run to manually calculate stats
paramsingh Aug 8, 2017
f91488f
Show artist count on user page
paramsingh Aug 8, 2017
addea60
Remove extra line that came from rebase
paramsingh Aug 17, 2017
b9e6120
Add different stat function for getting artist_count
paramsingh Aug 17, 2017
7362d80
Set NULL values of last_updated in stat tables to 0
paramsingh Aug 22, 2017
5b6ff06
Show stats in a table on user page
paramsingh Aug 22, 2017
f6eacfd
Fix error if stats are not calculated for user
paramsingh Aug 22, 2017
3b9b043
Change indentation to spaces in profile.html
paramsingh Sep 1, 2017
4eb5b5c
Address TODO comments and add docstrings to new functions
paramsingh Sep 1, 2017
54e80b6
Don't align equals signs
paramsingh Oct 5, 2017
18de196
Change docstring to better english
paramsingh Oct 5, 2017
650d079
Put the bigquery initialization code into a module
paramsingh Oct 5, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions admin/sql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ CREATE TABLE statistics.user (
artists JSONB,
releases JSONB,
recordings JSONB,
last_updated TIMESTAMP WITH TIME ZONE
last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);

CREATE TABLE statistics.artist (
Expand All @@ -60,7 +60,7 @@ CREATE TABLE statistics.artist (
recordings JSONB,
users JSONB,
listen_count JSONB,
last_updated TIMESTAMP WITH TIME ZONE
last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
ALTER TABLE statistics.artist ADD CONSTRAINT artist_stats_msid_uniq UNIQUE (msid);

Expand All @@ -71,7 +71,7 @@ CREATE TABLE statistics.release (
recordings JSONB,
users JSONB,
listen_count JSONB,
last_updated TIMESTAMP WITH TIME ZONE
last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
);
ALTER TABLE statistics.release ADD CONSTRAINT release_stats_msid_uniq UNIQUE (msid);

Expand All @@ -81,7 +81,7 @@ CREATE TABLE statistics.recording (
name VARCHAR,
users_all_time JSONB,
listen_count JSONB,
last_updated TIMESTAMP WITH TIME ZONE
last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()

);
ALTER TABLE statistics.recording ADD CONSTRAINT recording_stats_msid_uniq UNIQUE (msid);
Expand Down
21 changes: 21 additions & 0 deletions admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Make last_updated mandatory on all statistics tables and default it to
-- the time of insertion. Everything runs in one transaction.
BEGIN;

-- Backfill rows that have no timestamp with the Unix epoch so that the
-- NOT NULL constraints below can be applied.
UPDATE statistics.user
   SET last_updated = to_timestamp(0)
 WHERE last_updated IS NULL;

UPDATE statistics.artist
   SET last_updated = to_timestamp(0)
 WHERE last_updated IS NULL;

UPDATE statistics.release
   SET last_updated = to_timestamp(0)
 WHERE last_updated IS NULL;

UPDATE statistics.recording
   SET last_updated = to_timestamp(0)
 WHERE last_updated IS NULL;

-- Add the constraint and the default, one statement per table.
ALTER TABLE statistics.user
    ALTER COLUMN last_updated SET NOT NULL,
    ALTER COLUMN last_updated SET DEFAULT NOW();

ALTER TABLE statistics.artist
    ALTER COLUMN last_updated SET NOT NULL,
    ALTER COLUMN last_updated SET DEFAULT NOW();

ALTER TABLE statistics.release
    ALTER COLUMN last_updated SET NOT NULL,
    ALTER COLUMN last_updated SET DEFAULT NOW();

ALTER TABLE statistics.recording
    ALTER COLUMN last_updated SET NOT NULL,
    ALTER COLUMN last_updated SET DEFAULT NOW();

COMMIT;
18 changes: 6 additions & 12 deletions listenbrainz/bigquery-writer/bigquery-writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@

from googleapiclient import discovery
from googleapiclient.errors import HttpError
from listenbrainz.bigquery import create_bigquery_object
from listenbrainz.bigquery import NoCredentialsVariableException, NoCredentialsFileException
from oauth2client.client import GoogleCredentials

REPORT_FREQUENCY = 5000
APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
ERROR_RETRY_DELAY = 3 # number of seconds to wait until retrying an operation
DUMP_JSON_WITH_ERRORS = True

Expand Down Expand Up @@ -153,18 +154,11 @@ def start(self):
sleep(66666)
return

if not APP_CREDENTIALS_FILE:
self.log.error("BiqQueryWriter not started, the GOOGLE_APPLICATION_CREDENTIALS env var is not defined.")
try:
self.bigquery = create_bigquery_object()
except (NoCredentialsFileException, NoCredentialsVariableException):
self.log.error("Credential File not present or invalid! Sleeping...")
sleep(1000)
return

if not os.path.exists(APP_CREDENTIALS_FILE):
self.log.error("BiqQueryWriter not started, %s is missing." % APP_CREDENTIALS_FILE)
sleep(1000)
return

credentials = GoogleCredentials.get_application_default()
self.bigquery = discovery.build('bigquery', 'v2', credentials=credentials)

while True:
try:
Expand Down
33 changes: 33 additions & 0 deletions listenbrainz/bigquery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
from googleapiclient import discovery
import googleapiclient
from oauth2client.client import GoogleCredentials

APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

def create_bigquery_object():
    """Initiate the connection to Google BigQuery.

    The credentials file path is taken from APP_CREDENTIALS_FILE, which is
    read from the GOOGLE_APPLICATION_CREDENTIALS environment variable when
    this module is imported.

    Returns:
        The object returned by ``discovery.build('bigquery', 'v2', ...)``,
        built with the application default credentials.

    Raises:
        NoCredentialsVariableException: if GOOGLE_APPLICATION_CREDENTIALS
            was not set (or was empty).
        NoCredentialsFileException: if the path it points to does not exist.
    """
    # This module defines no module-level logger, so create one here;
    # without it the error paths below would fail with a NameError instead
    # of raising the exceptions that callers catch.
    import logging
    logger = logging.getLogger(__name__)

    if not APP_CREDENTIALS_FILE:
        logger.error("The GOOGLE_APPLICATION_CREDENTIALS variable is undefined, cannot connect to BigQuery")
        raise NoCredentialsVariableException

    if not os.path.exists(APP_CREDENTIALS_FILE):
        logger.error("The BigQuery credentials file does not exist, cannot connect to BigQuery")
        raise NoCredentialsFileException

    credentials = GoogleCredentials.get_application_default()
    return discovery.build('bigquery', 'v2', credentials=credentials)


# Exceptions
class BigQueryException(Exception):
    """Base class for the errors raised while connecting to BigQuery."""


class NoCredentialsVariableException(BigQueryException):
    """The GOOGLE_APPLICATION_CREDENTIALS environment variable is not set."""


class NoCredentialsFileException(BigQueryException):
    """The BigQuery credentials file does not exist."""
3 changes: 3 additions & 0 deletions listenbrainz/config.py.sample
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ BIGQUERY_TABLE_ID = "listen"

# Stats
STATS_ENTITY_LIMIT = 100 # the maximum number of entities to calculate with BigQuery
STATS_CALCULATION_LOGIN_TIME = 30 # stats are only calculated for users who have logged in to ListenBrainz in the past 30 days
STATS_CALCULATION_INTERVAL = 7 # stats are calculated every 7 days


# Max time in seconds after which the playing_now stream will expire.
PLAYING_NOW_MAX_DURATION = 10 * 60
Expand Down
73 changes: 73 additions & 0 deletions listenbrainz/db/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""This module contains functions to insert and retrieve statistics
calculated from Google BigQuery into the database.
"""

import sqlalchemy
import ujson
from listenbrainz import db


def insert_user_stats(user_id, artists, recordings, releases, artist_count):
    """Store the stats calculated from Google BigQuery for one user.

    A new row is inserted into statistics.user; if the user already has a
    row, its stats are overwritten and last_updated is set to NOW().

    Args: user_id (int): the row id of the user,
          artists (dict): the top artists listened to by the user
          recordings (dict): the top recordings listened to by the user
          releases (dict): the top releases listened to by the user
          artist_count (int): the total number of artists listened to by the user
    """

    upsert = sqlalchemy.text("""
        INSERT INTO statistics.user (user_id, artists, recordings, releases)
             VALUES (:user_id, :artists, :recordings, :releases)
        ON CONFLICT (user_id)
      DO UPDATE SET artists = :artists,
                    recordings = :recordings,
                    releases = :releases,
                    last_updated = NOW()
    """)

    with db.engine.connect() as connection:
        params = {
            'user_id': user_id,
            # the artists column holds both the total count and the top artists
            'artists': ujson.dumps({
                'count': artist_count,
                'all_time': artists,
            }),
            'recordings': ujson.dumps(recordings),
            'releases': ujson.dumps(releases),
        }
        connection.execute(upsert, params)


def get_user_stats(user_id):
    """Fetch the stored stats of the user with the given ID.

    Args: user_id (int): the row ID of the user in the DB

    Returns: None if statistics.user has no row for the user, otherwise
             a dict of the following format
             {
                "user_id" (int): the id of the user
                "artists" (dict): artist stats for the user
                "releases" (dict) : release stats for the user
                "recordings" (dict): recording stats for the user
                "last_updated" (datetime): timestamp when the stats were last updated
             }
    """

    query = sqlalchemy.text("""
        SELECT user_id, artists, releases, recordings, last_updated
          FROM statistics.user
         WHERE user_id = :user_id
    """)

    with db.engine.connect() as connection:
        row = connection.execute(query, {'user_id': user_id}).fetchone()
        if not row:
            return None
        return dict(row)
44 changes: 44 additions & 0 deletions listenbrainz/db/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
import json
import os
import listenbrainz.db.user as db_user
import listenbrainz.db.stats as db_stats
from listenbrainz.db.testing import DatabaseTestCase


class StatsDatabaseTestCase(DatabaseTestCase):
    """Database tests for inserting and reading back user stats."""

    # directory that holds the JSON fixtures used below
    TEST_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'testdata')

    def setUp(self):
        """Create the user that the stats are stored for."""
        DatabaseTestCase.setUp(self)
        self.user = db_user.get_or_create('stats_user')

    def path_to_data_file(self, filename):
        """Return the path of `filename` inside TEST_DATA_PATH."""
        data_dir = StatsDatabaseTestCase.TEST_DATA_PATH
        return os.path.join(data_dir, filename)

    def test_insert_user_stats(self):
        """Inserted stats must come back unchanged from get_user_stats."""

        def load_fixture(filename):
            with open(self.path_to_data_file(filename)) as f:
                return json.load(f)

        artists = load_fixture('user_top_artists.json')
        releases = load_fixture('user_top_releases.json')
        recordings = load_fixture('user_top_recordings.json')

        user_id = self.user['id']
        db_stats.insert_user_stats(
            user_id=user_id,
            artists=artists,
            recordings=recordings,
            releases=releases,
            artist_count=2,
        )

        stored = db_stats.get_user_stats(user_id=user_id)
        artist_stats = stored['artists']
        self.assertDictEqual(artist_stats['all_time'], artists)
        self.assertEqual(artist_stats['count'], 2)
        self.assertDictEqual(stored['releases'], releases)
        self.assertDictEqual(stored['recordings'], recordings)
        # last_updated must have been filled in with a real timestamp
        updated_ts = int(stored['last_updated'].strftime('%s'))
        self.assertGreater(updated_ts, 0)

38 changes: 34 additions & 4 deletions listenbrainz/db/tests/test_user.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*-
from listenbrainz.db.testing import DatabaseTestCase
import listenbrainz.db.user as db_user
from listenbrainz import db
import time
import sqlalchemy

from listenbrainz import db
from listenbrainz.db.testing import DatabaseTestCase
import listenbrainz.db.user as db_user

class UserTestCase(DatabaseTestCase):

Expand Down Expand Up @@ -59,3 +58,34 @@ def test_update_latest_import(self):
db_user.update_latest_import(user['musicbrainz_id'], val)
user = db_user.get_by_mb_id(user['musicbrainz_id'])
self.assertEqual(val, int(user['latest_import'].strftime('%s')))

def test_get_recently_logged_in_users(self):
    """Tests getting recently logged in users"""

    def run_update(statement, musicbrainz_id):
        # every UPDATE runs on its own connection
        with db.engine.connect() as connection:
            connection.execute(sqlalchemy.text(statement), {
                'musicbrainz_id': musicbrainz_id
            })

    # recentuser1 gets a very old last_login (the Unix epoch) ...
    db_user.get_or_create('recentuser1')
    run_update("""
        UPDATE "user"
           SET last_login = to_timestamp(0)
         WHERE musicbrainz_id = :musicbrainz_id
    """, 'recentuser1')

    # ... while recentuser2 has logged in just now
    db_user.get_or_create('recentuser2')
    run_update("""
        UPDATE "user"
           SET last_login = NOW()
         WHERE musicbrainz_id = :musicbrainz_id
    """, 'recentuser2')

    # only the user with the fresh login may be returned
    recent_users = db_user.get_recently_logged_in_users()
    self.assertEqual(len(recent_users), 1)
    only_user = recent_users[0]
    self.assertEqual(only_user['musicbrainz_id'], 'recentuser2')

21 changes: 18 additions & 3 deletions listenbrainz/db/user.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@

from listenbrainz import db
import uuid
import sqlalchemy
from listenbrainz.db.exceptions import DatabaseException
import logging
import time
from listenbrainz import db
from listenbrainz.db.exceptions import DatabaseException
from listenbrainz import config

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -213,3 +213,18 @@ def update_latest_import(musicbrainz_id, ts):
except sqlalchemy.exc.ProgrammingError as e:
logger.error(e)
raise DatabaseException


def get_recently_logged_in_users():
    """Return the users who have logged in during the last
    config.STATS_CALCULATION_LOGIN_TIME days.

    Returns:
        A list with one dict per matching row of the "user" table, built
        from the columns listed in USER_GET_COLUMNS.
    """
    # The bind parameter is multiplied with a one-day interval instead of
    # being placed inside a quoted literal (INTERVAL ':x days'): a
    # placeholder inside quotes only works when the driver pastes the
    # value into the SQL text and the value happens to render without
    # quotes of its own.
    query = sqlalchemy.text("""
        SELECT {columns}
          FROM "user"
         WHERE last_login >= NOW() - :x * INTERVAL '1 day'
    """.format(columns=','.join(USER_GET_COLUMNS)))

    with db.engine.connect() as connection:
        result = connection.execute(query, {
            'x': config.STATS_CALCULATION_LOGIN_TIME,
        })
        # materialise the rows while the connection is still open
        return [dict(row) for row in result]
18 changes: 2 additions & 16 deletions listenbrainz/stats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from googleapiclient import discovery
import googleapiclient
from oauth2client.client import GoogleCredentials
import os
import logging
from listenbrainz.stats.exceptions import NoCredentialsVariableException, NoCredentialsFileException
from listenbrainz.bigquery import create_bigquery_object
import listenbrainz.config as config
import time

Expand All @@ -12,25 +9,14 @@
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')

bigquery = None


def init_bigquery_connection():
""" Initiates the connection to Google BigQuery """

if not APP_CREDENTIALS_FILE:
logger.error("The GOOGLE_APPLICATIONS_CREDENTIALS variable is undefined, cannot connect to BigQuery")
raise NoCredentialsVariableException

if not os.path.exists(APP_CREDENTIALS_FILE):
logger.error("The BigQuery credentials file does not exist, cannot connect to BigQuery")
raise NoCredentialsFileException

global bigquery
credentials = GoogleCredentials.get_application_default()
bigquery = discovery.build('bigquery', 'v2', credentials=credentials)
bigquery = create_bigquery_object()


def get_parameters_dict(parameters):
Expand Down