From d9999172a83bad33741270eda8639147cccf9506 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Mon, 3 Jul 2017 18:01:03 +0530 Subject: [PATCH 01/20] Refactor scheduler code to remove unneeded code Also add some docstrings. --- listenbrainz/webserver/scheduler.py | 69 +++++------------------------ 1 file changed, 12 insertions(+), 57 deletions(-) diff --git a/listenbrainz/webserver/scheduler.py b/listenbrainz/webserver/scheduler.py index 349e6b43ed..0f78333657 100644 --- a/listenbrainz/webserver/scheduler.py +++ b/listenbrainz/webserver/scheduler.py @@ -1,74 +1,29 @@ -from sqlalchemy import create_engine, text -from sqlalchemy.pool import NullPool -import sqlalchemy.exc -from apscheduler.schedulers.background import BackgroundScheduler +""" This module contains code that triggers jobs we want to run + on regular intervals. +""" import logging +from apscheduler.schedulers.background import BackgroundScheduler -# Create a cron job to clean postgres database class ScheduledJobs(): """ Schedule the scripts that need to be run at scheduled intervals """ def __init__(self, conf): + self.log = logging.getLogger(__name__) + logging.basicConfig() + self.log.setLevel(logging.INFO) + self.conf = conf self.scheduler = BackgroundScheduler() self.add_jobs() self.run() def run(self): + """ Start the scheduler but stop on KeyboardInterrupts and such""" try: self.scheduler.start() except (KeyboardInterrupt, SystemExit): - self.shutdown() + self.scheduler.shutdown() def add_jobs(self): - args = {} - if 'MAX_POSTGRES_LISTEN_HISTORY' in self.conf: - args['max_days'] = int(self.conf['MAX_POSTGRES_LISTEN_HISTORY']) - - self.scheduler.add_job(self._clean_postgres, 'interval', hours=24, \ - kwargs=args) - - def _clean_postgres(self, max_days=90): - """ Clean all the listens that are older than a set no of days - Default: 90 days - """ - - # If max days is set to a negative number, don't throw anything out - if max_days < 0: - return - - seconds = max_days*24*3600 - engine = create_engine(self.conf['SQLALCHEMY_DATABASE_URI'], poolclass=NullPool) - connection = engine.connect() - # query = """ - # WITH max_table as ( - # SELECT user_id, max(extract(epoch from ts)) - %s as mx - # FROM listens - # GROUP BY user_id - # ) - # DELETE FROM listens - # WHERE extract(epoch from ts) < (SELECT mx - # FROM max_table - # WHERE max_table.user_id = listens.user_id) - # RETURNING * - # """ - - query = """ - DELETE FROM listen - WHERE id in ( - SELECT id FROM listen - JOIN ( - SELECT user_id, extract(epoch from max(ts)) as max - FROM listens - GROUP BY user_id - ) max_table on listens.user_id = max_table.user_id AND extract(epoch from listens.ts) <= max_table.max - %s - ) RETURNING * - """ - - deleted = connection.execute(query % (seconds)) - log = logging.getLogger(__name__) - log.info('(Scheduled Job) CleanPostgres: ' + str(len(deleted.fetchall())) + " records deleted successfully") - connection.close() - - def shutdown(self): - self.scheduler.shutdown() + """ Add the jobs that need to be run to the scheduler """ + pass From c3404f3b65e738d23070acd968c8b6f8b622c54d Mon Sep 17 00:00:00 2001 From: Param Singh Date: Wed, 2 Aug 2017 22:25:04 +0530 Subject: [PATCH 02/20] Add function for getting recently logged in users --- listenbrainz/db/tests/test_user.py | 38 ++++++++++++++++++++++++++---- listenbrainz/db/user.py | 22 ++++++++++++++--- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py index 0664d71a2c..df295e1678 100644 --- a/listenbrainz/db/tests/test_user.py +++ b/listenbrainz/db/tests/test_user.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- -from listenbrainz.db.testing import DatabaseTestCase -import listenbrainz.db.user as db_user -from listenbrainz import db import time import sqlalchemy - +from listenbrainz import db +from listenbrainz.db.testing import DatabaseTestCase +import listenbrainz.db.user as db_user class UserTestCase(DatabaseTestCase): @@ -59,3 +58,34 @@ def test_update_latest_import(self): db_user.update_latest_import(user['musicbrainz_id'], val) user = db_user.get_by_mb_id(user['musicbrainz_id']) self.assertEqual(val, int(user['latest_import'].strftime('%s'))) + + def test_get_recently_logged_in_users(self): + """Tests getting recently logged in users""" + + # create two users, set one's last_login + # to a very old value and one's last_login + # to now and then call get_recently_logged_in_users + user1 = db_user.get_or_create('recentuser1') + with db.engine.connect() as connection: + connection.execute(sqlalchemy.text(""" + UPDATE "user" + SET last_login = to_timestamp(0) + WHERE musicbrainz_id = :musicbrainz_id + """), { + 'musicbrainz_id': 'recentuser1' + }) + + user2 = db_user.get_or_create('recentuser2') + with db.engine.connect() as connection: + connection.execute(sqlalchemy.text(""" + UPDATE "user" + SET last_login = NOW() + WHERE musicbrainz_id = :musicbrainz_id + """), { + 'musicbrainz_id': 'recentuser2' + }) + + recent_users = db_user.get_recently_logged_in_users() + self.assertEqual(len(recent_users), 1) + self.assertEqual(recent_users[0]['musicbrainz_id'], 'recentuser2') + diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index 4648e8c3fa..57cfd5838b 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -1,10 +1,10 @@ - -from listenbrainz import db import uuid import sqlalchemy -from listenbrainz.db.exceptions import DatabaseException import logging import time +from listenbrainz import db +from listenbrainz.db.exceptions import DatabaseException +from listenbrainz import config logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -213,3 +213,19 @@ def update_latest_import(musicbrainz_id, ts): except sqlalchemy.exc.ProgrammingError as e: logger.error(e) raise DatabaseException + + +def get_recently_logged_in_users(): + """Returns a list of users who have logged-in in the last X days, X is + defined in the config. + """ + with db.engine.connect() as connection: + result = connection.execute(sqlalchemy.text(""" + SELECT {columns} + FROM "user" + WHERE last_login >= NOW() - INTERVAL ':x days' + """.format(columns=','.join(USER_GET_COLUMNS))), { + #TODO(param): think about the name of this variable in config + 'x': config.STATS_CALCULATE_LOGIN + }) + return [dict(row) for row in result] From cade177f172aa8c524ed9ab72255ee5463972195 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 3 Aug 2017 22:56:42 +0530 Subject: [PATCH 03/20] Add stats calculation variable to config.py.sample --- listenbrainz/config.py.sample | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/listenbrainz/config.py.sample b/listenbrainz/config.py.sample index c3391ca29a..dc7705a101 100644 --- a/listenbrainz/config.py.sample +++ b/listenbrainz/config.py.sample @@ -55,6 +55,10 @@ BIGQUERY_TABLE_ID = "listen" # Stats STATS_ENTITY_LIMIT = 100 # the number of entities to calculate at max with BQ +# Stats +STATS_CALCULATE_LOGIN = 7 + + # Max time in seconds after which the playing_now stream will expire. PLAYING_NOW_MAX_DURATION = 10 * 60 From 6a4ff2d03a9173c231c630e67dd3ef58ecc1245f Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 3 Aug 2017 23:36:30 +0530 Subject: [PATCH 04/20] Write code for calculation of user stats --- listenbrainz/stats/calculate.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 listenbrainz/stats/calculate.py diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py new file mode 100644 index 0000000000..c48ed0e9c2 --- /dev/null +++ b/listenbrainz/stats/calculate.py @@ -0,0 +1,21 @@ +import listenbrainz.stats.user as stats_user +import listenbrainz.db.user as db_user +import listenbrainz.db.stats as db_stats + + +def calculate_user_stats(): + for user in get_recently_logged_in_users(): + recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id']) + artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) + releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) + + db_stats.insert_user_stats( + user_id=user['id'], + artists=artists, + recordings=recordings, + releases=releases, + ) + +def calculate(): + pass + From fe850d30f76c194d991db7d6c2f17fd13eb912c6 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 3 Aug 2017 23:42:35 +0530 Subject: [PATCH 05/20] Make last_updated columns in stats tables NOT NULL --- admin/sql/create_tables.sql | 8 ++++---- .../2017-08-03-make-stats-updated-not-null.sql | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) create mode 100644 admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql diff --git a/admin/sql/create_tables.sql b/admin/sql/create_tables.sql index 70efd3e207..4e318efaaa 100644 --- a/admin/sql/create_tables.sql +++ b/admin/sql/create_tables.sql @@ -49,7 +49,7 @@ CREATE TABLE statistics.user ( artists JSONB, releases JSONB, recordings JSONB, - last_updated TIMESTAMP WITH TIME ZONE + last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); CREATE TABLE statistics.artist ( @@ -60,7 +60,7 @@ CREATE TABLE statistics.artist ( recordings JSONB, users JSONB, listen_count JSONB, - last_updated TIMESTAMP WITH TIME ZONE + last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); ALTER TABLE statistics.artist ADD CONSTRAINT artist_stats_msid_uniq UNIQUE (msid); @@ -71,7 +71,7 @@ CREATE TABLE statistics.release ( recordings JSONB, users JSONB, listen_count JSONB, - last_updated TIMESTAMP WITH TIME ZONE + last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); ALTER TABLE statistics.release ADD CONSTRAINT release_stats_msid_uniq UNIQUE (msid); @@ -81,7 +81,7 @@ CREATE TABLE statistics.recording ( name VARCHAR, users_all_time JSONB, listen_count JSONB, - last_updated TIMESTAMP WITH TIME ZONE + last_updated TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() ); ALTER TABLE statistics.recording ADD CONSTRAINT recording_stats_msid_uniq UNIQUE (msid); diff --git a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql new file mode 100644 index 0000000000..3a345bdba4 --- /dev/null +++ b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql @@ -0,0 +1,16 @@ +BEGIN; + +ALTER TABLE statistics.user ALTER COLUMN last_updated SET NOT NULL; +ALTER TABLE statistics.user ALTER COLUMN last_updated SET DEFAULT NOW(); + +ALTER TABLE statistics.artist ALTER COLUMN last_updated SET NOT NULL; +ALTER TABLE statistics.artist ALTER COLUMN last_updated SET DEFAULT NOW(); + + +ALTER TABLE statistics.release ALTER COLUMN last_updated SET NOT NULL; +ALTER TABLE statistics.release ALTER COLUMN last_updated SET DEFAULT NOW(); + +ALTER TABLE statistics.recording ALTER COLUMN last_updated SET NOT NULL; +ALTER TABLE statistics.recording ALTER COLUMN last_updated SET DEFAULT NOW(); + +COMMIT; From 3b4606cd835d66ceb9ffbb2ba9db49340a890314 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 4 Aug 2017 00:41:08 +0530 Subject: [PATCH 06/20] Comment to think about --- admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql | 3 +++ 1 file changed, 3 insertions(+) diff --git a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql index 3a345bdba4..2d82281a16 100644 --- a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql +++ b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql @@ -1,5 +1,8 @@ BEGIN; +-- XXX(param): What should values that are null already be set to here now? +-- timestamp 0 or NOW() ? + ALTER TABLE statistics.user ALTER COLUMN last_updated SET NOT NULL; ALTER TABLE statistics.user ALTER COLUMN last_updated SET DEFAULT NOW(); From 8b6f3ec92cd860dab1dc9d810f733ee768a32438 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 4 Aug 2017 00:41:41 +0530 Subject: [PATCH 07/20] Add code for inserting calculated stats into db --- listenbrainz/db/stats.py | 47 +++++++++++++++++++ listenbrainz/db/tests/test_stats.py | 47 +++++++++++++++++++ listenbrainz/testdata/user_top_artists.json | 14 ++++++ .../testdata/user_top_recordings.json | 14 ++++++ listenbrainz/testdata/user_top_releases.json | 14 ++++++ 5 files changed, 136 insertions(+) create mode 100644 listenbrainz/db/stats.py create mode 100644 listenbrainz/db/tests/test_stats.py create mode 100644 listenbrainz/testdata/user_top_artists.json create mode 100644 listenbrainz/testdata/user_top_recordings.json create mode 100644 listenbrainz/testdata/user_top_releases.json diff --git a/listenbrainz/db/stats.py b/listenbrainz/db/stats.py new file mode 100644 index 0000000000..5b9df1f2b1 --- /dev/null +++ b/listenbrainz/db/stats.py @@ -0,0 +1,47 @@ +"""This module contains functions to insert and retrieve statistics + calculated from Google BigQuery into the database. +""" + +import sqlalchemy +import ujson +from listenbrainz import db + + +# XXX(param): think about the names of these variables +# should they be artist_stats and so on instead? +# Note: names are used in tests and stats/calculate.py also + +def insert_user_stats(user_id, artists, recordings, releases): + # XXX(param): should this name be upsert_user_stats? + + with db.engine.connect() as connection: + connection.execute(sqlalchemy.text(""" + INSERT INTO statistics.user (user_id, artists, recordings, releases) + VALUES (:user_id, :artists, :recordings, :releases) + ON CONFLICT (user_id) + DO UPDATE SET artists = :artists, + recordings = :recordings, + releases = :releases, + last_updated = NOW() + """), { + 'user_id': user_id, + 'artists': ujson.dumps(artists), + 'recordings': ujson.dumps(recordings), + 'releases': ujson.dumps(releases) + } + ) + + +def get_user_stats(user_id): + + with db.engine.connect() as connection: + result = connection.execute(sqlalchemy.text(""" + SELECT user_id, artists, releases, recordings + FROM statistics.user + WHERE user_id = :user_id + """), { + 'user_id': user_id + } + ) + row = result.fetchone() + return dict(row) if row else None diff --git a/listenbrainz/db/tests/test_stats.py b/listenbrainz/db/tests/test_stats.py new file mode 100644 index 0000000000..a97c54f52f --- /dev/null +++ b/listenbrainz/db/tests/test_stats.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +import time +import uuid +import json +import os +import listenbrainz.db.user as db_user +import listenbrainz.db.stats as db_stats +from listenbrainz.db.testing import DatabaseTestCase + + +class StatsDatabaseTestCase(DatabaseTestCase): + + TEST_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'testdata') + + def setUp(self): + DatabaseTestCase.setUp(self) + self.user = db_user.get_or_create('stats_user') + + def path_to_data_file(self, filename): + # XXX(param): we have this function in IntegrationTestCase also, + # maybe find some way to share it + # ListenBrainzTestCase? + return os.path.join(StatsDatabaseTestCase.TEST_DATA_PATH, filename) + + def test_insert_user_stats(self): + + with open(self.path_to_data_file('user_top_artists.json')) as f: + artists = json.load(f) + with open(self.path_to_data_file('user_top_releases.json')) as f: + releases = json.load(f) + with open(self.path_to_data_file('user_top_recordings.json')) as f: + recordings = json.load(f) + + + db_stats.insert_user_stats( + user_id=self.user['id'], + artists=artists, + recordings=recordings, + releases=releases + ) + + # TODO(param): test the last_updated values too + result = db_stats.get_user_stats(user_id=self.user['id']) + self.assertDictEqual(result['artists'], artists) + self.assertDictEqual(result['releases'], releases) + self.assertDictEqual(result['recordings'], recordings) + diff --git a/listenbrainz/testdata/user_top_artists.json b/listenbrainz/testdata/user_top_artists.json new file mode 100644 index 0000000000..6dfdc2e4f5 --- /dev/null +++ b/listenbrainz/testdata/user_top_artists.json @@ -0,0 +1,14 @@ +{ + "all_time": [ + { + "listen_count": 230, + "artist_msid": "ed6c388e-ea65-431f-8be5-4959239d8c65", + "name": "Kanye West" + }, + { + "listen_count": 229, + "artist_msid": "80ca06c7-07fb-4b3a-a315-ad584cc8eeb0", + "name": "Frank Ocean" + } + ] +} diff --git a/listenbrainz/testdata/user_top_recordings.json b/listenbrainz/testdata/user_top_recordings.json new file mode 100644 index 0000000000..2101d15058 --- /dev/null +++ b/listenbrainz/testdata/user_top_recordings.json @@ -0,0 +1,14 @@ +{ + "all_time": [ + { + "listen_count": 230, + "artist_msid": "59f4d2c6-ca76-4dc7-80de-c3fef6b51211", + "name": "Fade" + }, + { + "listen_count": 229, + "artist_msid": "c2375e2e-e98a-4c6b-a139-3aae0a831ebf", + "name": "Nikes" + } + ] +} diff --git a/listenbrainz/testdata/user_top_releases.json b/listenbrainz/testdata/user_top_releases.json new file mode 100644 index 0000000000..c0306d6282 --- /dev/null +++ b/listenbrainz/testdata/user_top_releases.json @@ -0,0 +1,14 @@ +{ + "all_time": [ + { + "listen_count": 230, + "artist_msid": "19eadc02-1f64-48fe-bb4c-33732b96f7b1", + "name": "The Life Of Pablo" + }, + { + "listen_count": 229, + "artist_msid": "155fa8f8-ef04-471d-ab14-dd21838338d7", + "name": "Blonde" + } + ] +} From 0a9a3f438b60ece5b89d6bf0a6ac30c8ec06d9b1 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 4 Aug 2017 01:18:33 +0530 Subject: [PATCH 08/20] First cut of adding jobs to scheduler --- listenbrainz/config.py.sample | 3 ++- listenbrainz/stats/calculate.py | 4 ++-- listenbrainz/webserver/scheduler.py | 7 ++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/listenbrainz/config.py.sample b/listenbrainz/config.py.sample index dc7705a101..aa5f18e030 100644 --- a/listenbrainz/config.py.sample +++ b/listenbrainz/config.py.sample @@ -56,7 +56,8 @@ BIGQUERY_TABLE_ID = "listen" STATS_ENTITY_LIMIT = 100 # the number of entities to calculate at max with BQ # Stats -STATS_CALCULATE_LOGIN = 7 +STATS_CALCULATE_LOGIN = 30 +STATS_CALCULATION_INTERVAL = 7 # Max time in seconds after which the playing_now stream will expire. diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py index c48ed0e9c2..9dd4059895 100644 --- a/listenbrainz/stats/calculate.py +++ b/listenbrainz/stats/calculate.py @@ -16,6 +16,6 @@ def calculate_user_stats(): releases=releases, ) -def calculate(): - pass +def calculate_stats(): + calculate_user_stats() diff --git a/listenbrainz/webserver/scheduler.py b/listenbrainz/webserver/scheduler.py index 0f78333657..c2f43af4f1 100644 --- a/listenbrainz/webserver/scheduler.py +++ b/listenbrainz/webserver/scheduler.py @@ -3,9 +3,10 @@ """ import logging from apscheduler.schedulers.background import BackgroundScheduler +from listenbrainz.stats.calculate import calculate_stats class ScheduledJobs(): - """ Schedule the scripts that need to be run at scheduled intervals """ + """Schedule the scripts that need to be run at regular intervals """ def __init__(self, conf): self.log = logging.getLogger(__name__) @@ -25,5 +26,5 @@ def run(self): self.scheduler.shutdown() def add_jobs(self): - """ Add the jobs that need to be run to the scheduler """ - pass + """Add the jobs that need to be run to the scheduler""" + self.scheduler.add_job(calculate_stats, 'interval', days=conf['STATS_CALCULATION_INTERVAL']) From 12528243d0d4392504d0143bb6c96fb19d6de820 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Tue, 8 Aug 2017 13:46:57 +0530 Subject: [PATCH 09/20] Add a script that can be run to manually calculate stats Also fix a couple bugs in the code --- listenbrainz/stats/calculate.py | 14 +++++++++++++- listenbrainz/webserver/scheduler.py | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py index 9dd4059895..b4d9e6c428 100644 --- a/listenbrainz/stats/calculate.py +++ b/listenbrainz/stats/calculate.py @@ -1,10 +1,13 @@ import listenbrainz.stats.user as stats_user import listenbrainz.db.user as db_user import listenbrainz.db.stats as db_stats +from listenbrainz import db +from listenbrainz import config +from listenbrainz import stats def calculate_user_stats(): - for user in get_recently_logged_in_users(): + for user in db_user.get_recently_logged_in_users(): recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id']) artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) @@ -19,3 +22,12 @@ def calculate_user_stats(): def calculate_stats(): calculate_user_stats() +if __name__ == '__main__': + print('Connecting to Google BigQuery...') + stats.init_bigquery_connection() + print('Connecting to database...') + db.init_db_connection(config.SQLALCHEMY_DATABASE_URI) + print('Connected!') + print('Calculating statistics using Google BigQuery...') + calculate_stats() + print('Calculations done!') diff --git a/listenbrainz/webserver/scheduler.py b/listenbrainz/webserver/scheduler.py index c2f43af4f1..dc592229b9 100644 --- a/listenbrainz/webserver/scheduler.py +++ b/listenbrainz/webserver/scheduler.py @@ -27,4 +27,4 @@ def run(self): def add_jobs(self): """Add the jobs that need to be run to the scheduler""" - self.scheduler.add_job(calculate_stats, 'interval', days=conf['STATS_CALCULATION_INTERVAL']) + self.scheduler.add_job(calculate_stats, 'interval', days=self.conf['STATS_CALCULATION_INTERVAL']) From f91488f7661e6ee266a3873acb9af02938913c9d Mon Sep 17 00:00:00 2001 From: Param Singh Date: Tue, 8 Aug 2017 22:46:46 +0530 Subject: [PATCH 10/20] Show artist count on user page Ignore the styling for now, this is more of a proof-of-concept. --- listenbrainz/webserver/templates/user/profile.html | 2 ++ listenbrainz/webserver/views/user.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/listenbrainz/webserver/templates/user/profile.html b/listenbrainz/webserver/templates/user/profile.html index 9e5fd0ee62..8c9ccb0bdd 100644 --- a/listenbrainz/webserver/templates/user/profile.html +++ b/listenbrainz/webserver/templates/user/profile.html @@ -17,6 +17,8 @@

Listen Count: {{ listen_count }}

We were not able to get listen count for {{ user.musicbrainz_id }} due to an error. Please reload to try again. {% endif %} +

Artist Count: {{ artist_count }}

+

Recent listens

{% if not listens %} diff --git a/listenbrainz/webserver/views/user.py b/listenbrainz/webserver/views/user.py index a7f623494d..0a859f727a 100644 --- a/listenbrainz/webserver/views/user.py +++ b/listenbrainz/webserver/views/user.py @@ -1,4 +1,3 @@ - from flask import Blueprint, render_template, request, url_for, Response, redirect, flash, current_app, jsonify from flask_login import current_user, login_required from werkzeug.exceptions import NotFound, BadRequest, RequestEntityTooLarge, InternalServerError @@ -9,6 +8,7 @@ from listenbrainz import webserver from listenbrainz.webserver import flash import listenbrainz.db.user as db_user +import listenbrainz.db.stats as db_stats import listenbrainz.config as config from listenbrainz.db.exceptions import DatabaseException from flask import make_response @@ -152,6 +152,8 @@ def profile(user_name): } listens.insert(0, listen) + user_stats = db_stats.get_user_stats(user.id) + return render_template( "user/profile.html", user=user, @@ -161,6 +163,7 @@ def profile(user_name): spotify_uri=_get_spotify_uri_for_listens(listens), have_listen_count=have_listen_count, listen_count=format(int(listen_count), ",d"), + artist_count=format(len(user_stats.get("artists", [])), ",d") ) From addea60fb5dd43fc831d5ec5ee45d15ae6bc866d Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 17 Aug 2017 16:56:26 +0530 Subject: [PATCH 11/20] Remove extra line that came from rebase --- listenbrainz/config.py.sample | 2 -- 1 file changed, 2 deletions(-) diff --git a/listenbrainz/config.py.sample b/listenbrainz/config.py.sample index aa5f18e030..0675af5abc 100644 --- a/listenbrainz/config.py.sample +++ b/listenbrainz/config.py.sample @@ -54,8 +54,6 @@ BIGQUERY_TABLE_ID = "listen" # Stats STATS_ENTITY_LIMIT = 100 # the number of entities to calculate at max with BQ - -# Stats STATS_CALCULATE_LOGIN = 30 STATS_CALCULATION_INTERVAL = 7 From b9e6120692cc688b3d8e1867918911350893a66e Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 18 Aug 2017 01:06:43 +0530 Subject: [PATCH 12/20] Add different stat function for getting artist_count --- listenbrainz/db/stats.py | 14 +++++++--- listenbrainz/db/tests/test_stats.py | 6 +++-- listenbrainz/stats/calculate.py | 2 ++ listenbrainz/stats/user.py | 40 +++++++++++++++++++++++++++- listenbrainz/webserver/views/user.py | 2 ++ 5 files changed, 58 insertions(+), 6 deletions(-) diff --git a/listenbrainz/db/stats.py b/listenbrainz/db/stats.py index 5b9df1f2b1..8f591e8056 100644 --- a/listenbrainz/db/stats.py +++ b/listenbrainz/db/stats.py @@ -7,13 +7,21 @@ from listenbrainz import db -# XXX(param): think about the names of these variables +# XXX(param): think about the names of these stats variables # should they be artist_stats and so on instead? # Note: names are used in tests and stats/calculate.py also -def insert_user_stats(user_id, artists, recordings, releases): +def insert_user_stats(user_id, artists, recordings, releases, artist_count): # XXX(param): should this name be upsert_user_stats? + # put all artist stats into one dict which will then be inserted + # into the artist column of the stats.user table + # XXX(param): This makes the schema of the stats very variable though. + artist_stats = { + 'count': artist_count, + 'all_time': artists + } + with db.engine.connect() as connection: connection.execute(sqlalchemy.text(""" INSERT INTO statistics.user (user_id, artists, recordings, releases) @@ -25,7 +33,7 @@ def insert_user_stats(user_id, artists, recordings, releases): last_updated = NOW() """), { 'user_id': user_id, - 'artists': ujson.dumps(artists), + 'artists': ujson.dumps(artist_stats), 'recordings': ujson.dumps(recordings), 'releases': ujson.dumps(releases) } diff --git a/listenbrainz/db/tests/test_stats.py b/listenbrainz/db/tests/test_stats.py index a97c54f52f..e9532f5501 100644 --- a/listenbrainz/db/tests/test_stats.py +++ b/listenbrainz/db/tests/test_stats.py @@ -36,12 +36,14 @@ def test_insert_user_stats(self): user_id=self.user['id'], artists=artists, recordings=recordings, - releases=releases + releases=releases, + artist_count=2, ) # TODO(param): test the last_updated values too result = db_stats.get_user_stats(user_id=self.user['id']) - self.assertDictEqual(result['artists'], artists) + self.assertDictEqual(result['artists']['all_time'], artists) + self.assertEqual(result['artists']['count'], 2) self.assertDictEqual(result['releases'], releases) self.assertDictEqual(result['recordings'], recordings) diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py index b4d9e6c428..094e7a8003 100644 --- a/listenbrainz/stats/calculate.py +++ b/listenbrainz/stats/calculate.py @@ -11,12 +11,14 @@ def calculate_user_stats(): recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id']) artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) + artist_count = stats_user.get_artist_count(musicbrainz_id=user['musicbrainz_id']) db_stats.insert_user_stats( user_id=user['id'], artists=artists, recordings=recordings, releases=releases, + artist_count=artist_count ) def calculate_stats(): diff --git a/listenbrainz/stats/user.py b/listenbrainz/stats/user.py index 3ef04acc7b..a2e9619c7a 100644 --- a/listenbrainz/stats/user.py +++ b/listenbrainz/stats/user.py @@ -133,11 +133,49 @@ def get_top_artists(musicbrainz_id, time_interval=None): return stats.run_query(query, parameters) + +def get_artist_count(musicbrainz_id, time_interval=None): + """ Get artist count for user with given MusicBrainz ID over a particular period of time + + Args: musicbrainz_id (str): the MusicBrainz ID of the user + time_interval (str): the time interval over which artist count should be returned + (defaults to all time) + + Returns: artist_count (int): total number of artists listened to by the user in that + period of time + """ + + filter_clause = "" + if time_interval: + filter_clause = "AND listened_at >= TIMESTAMP_SUB(CURRENT_TIME(), INTERVAL {})".format(time_interval) + + query = """SELECT COUNT(DISTINCT(artist_msid)) as artist_count + FROM {dataset_id}.{table_id} + WHERE user_name = @musicbrainz_id + {time_filter_clause} + """.format( + dataset_id=config.BIGQUERY_DATASET_ID, + table_id=config.BIGQUERY_TABLE_ID, + time_filter_clause=filter_clause, + ) + + parameters = [ + { + 'name': 'musicbrainz_id', + 'type': 'STRING', + 'value': musicbrainz_id, + } + ] + + return stats.run_query(query, parameters)['listen_count'] + + + def get_top_releases(musicbrainz_id, time_interval=None): """ Get top releases for user with given MusicBrainz ID over a particular period of time Args: musicbrainz_id (str): the MusicBrainz ID of the user - time_interval (str): the time interval over which top artists should be returned + time_interval (str): the time interval over which top releases should be returned (defaults to all time) Returns: A sorted list of dicts with the following structure diff --git a/listenbrainz/webserver/views/user.py b/listenbrainz/webserver/views/user.py index 0a859f727a..a7eea8b000 100644 --- a/listenbrainz/webserver/views/user.py +++ b/listenbrainz/webserver/views/user.py @@ -153,6 +153,8 @@ def profile(user_name): listens.insert(0, listen) user_stats = db_stats.get_user_stats(user.id) + if not user_stats: + user_stats = {} return render_template( "user/profile.html", From 7362d80ad18175cfe420ad59415e9363d777c61d Mon Sep 17 00:00:00 2001 From: Param Singh Date: Tue, 22 Aug 2017 14:34:58 +0530 Subject: [PATCH 13/20] Set NULL values of last_updated in stat tables to 0 --- .../sql/updates/2017-08-03-make-stats-updated-not-null.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql index 2d82281a16..9a1b30c8b8 100644 --- a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql +++ b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql @@ -1,7 +1,9 @@ BEGIN; --- XXX(param): What should values that are null already be set to here now? --- timestamp 0 or NOW() ? +UPDATE statistics.user SET last_updated = to_timestamp(0) WHERE last_updated IS NULL; +UPDATE statistics.artist SET last_updated = to_timestamp(0) WHERE last_updated IS NULL; +UPDATE statistics.release SET last_updated = to_timestamp(0) WHERE last_updated IS NULL; +UPDATE statistics.recording SET last_updated = to_timestamp(0) WHERE last_updated IS NULL; ALTER TABLE statistics.user ALTER COLUMN last_updated SET NOT NULL; ALTER TABLE statistics.user ALTER COLUMN last_updated SET DEFAULT NOW(); From 5b6ff06f6c3dcf339a00390a32a0d7767b4a1012 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Wed, 23 Aug 2017 00:38:34 +0530 Subject: [PATCH 14/20] Show stats in a table on user page --- listenbrainz/stats/user.py | 2 +- .../webserver/templates/user/profile.html | 28 +++++++++++++++---- listenbrainz/webserver/views/user.py | 2 +- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/listenbrainz/stats/user.py b/listenbrainz/stats/user.py index a2e9619c7a..f575560373 100644 --- a/listenbrainz/stats/user.py +++ b/listenbrainz/stats/user.py @@ -167,7 +167,7 @@ def get_artist_count(musicbrainz_id, time_interval=None): } ] - return stats.run_query(query, parameters)['listen_count'] + return stats.run_query(query, parameters)[0]['artist_count'] diff --git a/listenbrainz/webserver/templates/user/profile.html b/listenbrainz/webserver/templates/user/profile.html index 8c9ccb0bdd..78f110945d 100644 --- a/listenbrainz/webserver/templates/user/profile.html +++ b/listenbrainz/webserver/templates/user/profile.html @@ -11,13 +11,29 @@

{{ user.musicbrainz_id }}

See profile on MusicBrainz - {% if have_listen_count %} -

Listen Count: {{ listen_count }}

- {% else %} - We were not able to get listen count for {{ user.musicbrainz_id }} due to an error. Please reload to try again. - {% endif %} -

Artist Count: {{ artist_count }}

+

Statistics

+ +
+
+ + + {% if listen_count %} + + + + + {% endif %} + + {% if artist_count %} + + + + + {% endif %} +
Listen count{{ listen_count }}
Artist count{{ artist_count }}
+
+

Recent listens

diff --git a/listenbrainz/webserver/views/user.py b/listenbrainz/webserver/views/user.py index a7eea8b000..000748d3f4 100644 --- a/listenbrainz/webserver/views/user.py +++ b/listenbrainz/webserver/views/user.py @@ -165,7 +165,7 @@ def profile(user_name): spotify_uri=_get_spotify_uri_for_listens(listens), have_listen_count=have_listen_count, listen_count=format(int(listen_count), ",d"), - artist_count=format(len(user_stats.get("artists", [])), ",d") + artist_count=format(int(user_stats['artists']['count']), ",d") ) From f6eacfda2402c489b4527f4388abe7efbf3bb260 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Wed, 23 Aug 2017 01:06:30 +0530 Subject: [PATCH 15/20] Fix error if stats are not calculated for user --- listenbrainz/webserver/views/user.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/listenbrainz/webserver/views/user.py b/listenbrainz/webserver/views/user.py index 000748d3f4..e0689b2d6e 100644 --- a/listenbrainz/webserver/views/user.py +++ b/listenbrainz/webserver/views/user.py @@ -153,8 +153,10 @@ def profile(user_name): listens.insert(0, listen) user_stats = db_stats.get_user_stats(user.id) - if not user_stats: - user_stats = {} + try: + artist_count = int(user_stats['artists']['count']) + except (KeyError, TypeError): + artist_count = 0 return render_template( "user/profile.html", @@ -165,7 +167,7 @@ def profile(user_name): spotify_uri=_get_spotify_uri_for_listens(listens), have_listen_count=have_listen_count, listen_count=format(int(listen_count), ",d"), - artist_count=format(int(user_stats['artists']['count']), ",d") + artist_count=format(artist_count, ",d") ) From 3b9b04385d1bfdd0712ac5ff7973695a0b694287 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 1 Sep 2017 13:03:25 +0530 Subject: [PATCH 16/20] Change indentation to spaces in profile.html --- .../webserver/templates/user/profile.html | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/listenbrainz/webserver/templates/user/profile.html b/listenbrainz/webserver/templates/user/profile.html index 78f110945d..c46f0f92ea 100644 --- a/listenbrainz/webserver/templates/user/profile.html +++ b/listenbrainz/webserver/templates/user/profile.html @@ -12,28 +12,28 @@

{{ user.musicbrainz_id }}

-

Statistics

+

Statistics

-
-
- - - {% if listen_count %} - - - - - {% endif %} +
+
+
Listen count{{ listen_count }}
+ + {% if listen_count %} + + + + + {% endif %} - {% if artist_count %} - - - - - {% endif %} -
Listen count{{ listen_count }}
Artist count{{ artist_count }}
-
-
+ {% if artist_count %} + + Artist count + {{ artist_count }} + + {% endif %} + + +

Recent listens

From 4eb5b5c6dbfda3578ca201a0589c2cbb96928ea5 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Fri, 1 Sep 2017 15:22:58 +0530 Subject: [PATCH 17/20] Address TODO comments and add docstrings to new functions Also improve tests and variable names a bit --- listenbrainz/config.py.sample | 4 ++-- listenbrainz/db/stats.py | 32 ++++++++++++++++++++++------- listenbrainz/db/tests/test_stats.py | 7 +------ listenbrainz/db/user.py | 3 +-- 4 files changed, 29 insertions(+), 17 deletions(-) diff --git a/listenbrainz/config.py.sample b/listenbrainz/config.py.sample index 0675af5abc..62ac99a271 100644 --- a/listenbrainz/config.py.sample +++ b/listenbrainz/config.py.sample @@ -54,8 +54,8 @@ BIGQUERY_TABLE_ID = "listen" # Stats STATS_ENTITY_LIMIT = 100 # the number of entities to calculate at max with BQ -STATS_CALCULATE_LOGIN = 30 -STATS_CALCULATION_INTERVAL = 7 +STATS_CALCULATION_LOGIN_TIME = 30 # users must have logged in to LB in the past 30 days for stats to be calculated +STATS_CALCULATION_INTERVAL = 7 # stats are calculated every 7 days # Max time in seconds after which the playing_now stream will expire. diff --git a/listenbrainz/db/stats.py b/listenbrainz/db/stats.py index 8f591e8056..649217f8b1 100644 --- a/listenbrainz/db/stats.py +++ b/listenbrainz/db/stats.py @@ -7,16 +7,21 @@ from listenbrainz import db -# XXX(param): think about the names of these stats variables -# should they be artist_stats and so on instead? -# Note: names are used in tests and stats/calculate.py also - def insert_user_stats(user_id, artists, recordings, releases, artist_count): - # XXX(param): should this name be upsert_user_stats? + """Inserts user stats calculated from Google BigQuery into the database. + + If stats are already present for some user, they are updated to the new + values passed. + + Args: user_id (int): the row id of the user, + artists (dict): the top artists listened to by the user + recordings (dict): the top recordings listened to by the user + releases (dict): the top releases listened to by the user + artist_count (int): the total number of artists listened to by the user + """ # put all artist stats into one dict which will then be inserted # into the artist column of the stats.user table - # XXX(param): This makes the schema of the stats very variable though. artist_stats = { 'count': artist_count, 'all_time': artists @@ -41,10 +46,23 @@ def insert_user_stats(user_id, artists, recordings, releases, artist_count): def get_user_stats(user_id): + """Get user stats for user with given ID. + + Args: user_id (int): the row ID of the user in the DB + + Returns: A dict of the following format + { + "user_id" (int): the id of the user + "artists" (dict): artist stats for the user + "releases" (dict) : release stats for the user + "recordings" (dict): recording stats for the user + "last_updated" (datetime): timestamp when the stats were last updated + } + """ with db.engine.connect() as connection: result = connection.execute(sqlalchemy.text(""" - SELECT user_id, artists, releases, recordings + SELECT user_id, artists, releases, recordings, last_updated FROM statistics.user WHERE user_id = :user_id """), { diff --git a/listenbrainz/db/tests/test_stats.py b/listenbrainz/db/tests/test_stats.py index e9532f5501..32c2382bc7 100644 --- a/listenbrainz/db/tests/test_stats.py +++ b/listenbrainz/db/tests/test_stats.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -import time -import uuid import json import os import listenbrainz.db.user as db_user @@ -17,9 +15,6 @@ def setUp(self): self.user = db_user.get_or_create('stats_user') def path_to_data_file(self, filename): - # XXX(param): we have this function in IntegrationTestCase also, - # maybe find some way to share it - # ListenBrainzTestCase? return os.path.join(StatsDatabaseTestCase.TEST_DATA_PATH, filename) def test_insert_user_stats(self): @@ -40,10 +35,10 @@ def test_insert_user_stats(self): artist_count=2, ) - # TODO(param): test the last_updated values too result = db_stats.get_user_stats(user_id=self.user['id']) self.assertDictEqual(result['artists']['all_time'], artists) self.assertEqual(result['artists']['count'], 2) self.assertDictEqual(result['releases'], releases) self.assertDictEqual(result['recordings'], recordings) + self.assertGreater(int(result['last_updated'].strftime('%s')), 0) diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index 57cfd5838b..d0c63a56e0 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -225,7 +225,6 @@ def get_recently_logged_in_users(): FROM "user" WHERE last_login >= NOW() - INTERVAL ':x days' """.format(columns=','.join(USER_GET_COLUMNS))), { - #TODO(param): think about the name of this variable in config - 'x': config.STATS_CALCULATE_LOGIN + 'x': config.STATS_CALCULATION_LOGIN_TIME }) return [dict(row) for row in result] From 54e80b68449ac54fdaaf29376ee3f0cfefe4ff87 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 5 Oct 2017 21:48:13 +0530 Subject: [PATCH 18/20] Don't align equals signs --- listenbrainz/stats/calculate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py index 094e7a8003..13502c5baa 100644 --- a/listenbrainz/stats/calculate.py +++ b/listenbrainz/stats/calculate.py @@ -9,8 +9,8 @@ def calculate_user_stats(): for user in db_user.get_recently_logged_in_users(): recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id']) - artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) - releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) + artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) + releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) artist_count = stats_user.get_artist_count(musicbrainz_id=user['musicbrainz_id']) db_stats.insert_user_stats( From 18de1960ce9dd5f29c49ce4352d829c0f3b19f55 Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 5 Oct 2017 22:21:48 +0530 Subject: [PATCH 19/20] Change docstring to better english --- listenbrainz/db/user.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index d0c63a56e0..a147af63a9 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -216,8 +216,8 @@ def update_latest_import(musicbrainz_id, ts): def get_recently_logged_in_users(): - """Returns a list of users who have logged-in in the last X days, X is - defined in the config. + """Returns a list of users who have logged-in in the + last config.STATS_CALCULATION_LOGIN_TIME days """ with db.engine.connect() as connection: result = connection.execute(sqlalchemy.text(""" From 650d079cbc6c3441b93ddf3ed2cd4742d13cbd2e Mon Sep 17 00:00:00 2001 From: Param Singh Date: Thu, 5 Oct 2017 22:55:35 +0530 Subject: [PATCH 20/20] Put the bigquery initialization code into a module This way, both bigquery-writer and the stats stuff can use the same initialization code. --- .../bigquery-writer/bigquery-writer.py | 18 ++++------ listenbrainz/bigquery.py | 33 +++++++++++++++++++ listenbrainz/stats/__init__.py | 18 ++-------- listenbrainz/webserver/scheduler.py | 4 ++- 4 files changed, 44 insertions(+), 29 deletions(-) create mode 100644 listenbrainz/bigquery.py diff --git a/listenbrainz/bigquery-writer/bigquery-writer.py b/listenbrainz/bigquery-writer/bigquery-writer.py index 6689f916c8..2133dfc940 100755 --- a/listenbrainz/bigquery-writer/bigquery-writer.py +++ b/listenbrainz/bigquery-writer/bigquery-writer.py @@ -12,10 +12,11 @@ from googleapiclient import discovery from googleapiclient.errors import HttpError +from listenbrainz.bigquery import create_bigquery_object +from listenbrainz.bigquery import NoCredentialsVariableException, NoCredentialsFileException from oauth2client.client import GoogleCredentials REPORT_FREQUENCY = 5000 -APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') ERROR_RETRY_DELAY = 3 # number of seconds to wait until retrying an operation DUMP_JSON_WITH_ERRORS = True @@ -153,18 +154,11 @@ def start(self): sleep(66666) return - if not APP_CREDENTIALS_FILE: - self.log.error("BiqQueryWriter not started, the GOOGLE_APPLICATION_CREDENTIALS env var is not defined.") + try: + self.bigquery = create_bigquery_object() + except (NoCredentialsFileException, NoCredentialsVariableException): + self.log.error("Credential File not present or invalid! Sleeping...") sleep(1000) - return - - if not os.path.exists(APP_CREDENTIALS_FILE): - self.log.error("BiqQueryWriter not started, %s is missing." % APP_CREDENTIALS_FILE) - sleep(1000) - return - - credentials = GoogleCredentials.get_application_default() - self.bigquery = discovery.build('bigquery', 'v2', credentials=credentials) while True: try: diff --git a/listenbrainz/bigquery.py b/listenbrainz/bigquery.py new file mode 100644 index 0000000000..a6add0e5f1 --- /dev/null +++ b/listenbrainz/bigquery.py @@ -0,0 +1,33 @@ +import os +from googleapiclient import discovery +import googleapiclient +from oauth2client.client import GoogleCredentials + +APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') + +def create_bigquery_object(): + """ Initiates the connection to Google BigQuery. Returns a BigQuery object. """ + + if not APP_CREDENTIALS_FILE: + logger.error("The GOOGLE_APPLICATIONS_CREDENTIALS variable is undefined, cannot connect to BigQuery") + raise NoCredentialsVariableException + + if not os.path.exists(APP_CREDENTIALS_FILE): + logger.error("The BigQuery credentials file does not exist, cannot connect to BigQuery") + raise NoCredentialsFileException + + credentials = GoogleCredentials.get_application_default() + return discovery.build('bigquery', 'v2', credentials=credentials) + + +# Exceptions +class BigQueryException(Exception): + pass + + +class NoCredentialsVariableException(BigQueryException): + pass + + +class NoCredentialsFileException(BigQueryException): + pass diff --git a/listenbrainz/stats/__init__.py b/listenbrainz/stats/__init__.py index 98c3ffd9de..3cce86f510 100644 --- a/listenbrainz/stats/__init__.py +++ b/listenbrainz/stats/__init__.py @@ -1,9 +1,6 @@ -from googleapiclient import discovery -import googleapiclient -from oauth2client.client import GoogleCredentials import os import logging -from listenbrainz.stats.exceptions import NoCredentialsVariableException, NoCredentialsFileException +from listenbrainz.bigquery import create_bigquery_object import listenbrainz.config as config import time @@ -12,25 +9,14 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -APP_CREDENTIALS_FILE = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') - bigquery = None def init_bigquery_connection(): """ Initiates the connection to Google BigQuery """ - if not APP_CREDENTIALS_FILE: - logger.error("The GOOGLE_APPLICATIONS_CREDENTIALS variable is undefined, cannot connect to BigQuery") - raise NoCredentialsVariableException - - if not os.path.exists(APP_CREDENTIALS_FILE): - logger.error("The BigQuery credentials file does not exist, cannot connect to BigQuery") - raise NoCredentialsFileException - global bigquery - credentials = GoogleCredentials.get_application_default() - bigquery = discovery.build('bigquery', 'v2', credentials=credentials) + bigquery = create_bigquery_object() def get_parameters_dict(parameters): diff --git a/listenbrainz/webserver/scheduler.py b/listenbrainz/webserver/scheduler.py index dc592229b9..6a58c82382 100644 --- a/listenbrainz/webserver/scheduler.py +++ b/listenbrainz/webserver/scheduler.py @@ -1,9 +1,10 @@ """ This module contains code that triggers jobs we want to run on regular intervals. """ -import logging from apscheduler.schedulers.background import BackgroundScheduler +from listenbrainz import stats from listenbrainz.stats.calculate import calculate_stats +import logging class ScheduledJobs(): """Schedule the scripts that need to be run at regular intervals """ @@ -15,6 +16,7 @@ def __init__(self, conf): self.conf = conf self.scheduler = BackgroundScheduler() + stats.init_bigquery_connection() self.add_jobs() self.run()