New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Schedule calculations and show artist count on user page. #244
Changes from 12 commits
d999917
c3404f3
cade177
6a4ff2d
fe850d3
3b4606c
8b6f3ec
0a9a3f4
1252824
f91488f
addea60
b9e6120
7362d80
5b6ff06
f6eacfd
3b9b043
4eb5b5c
54e80b6
18de196
650d079
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
BEGIN; | ||
|
||
-- XXX(param): What should values that are null already be set to here now? | ||
-- timestamp 0 or NOW() ? | ||
|
||
ALTER TABLE statistics.user ALTER COLUMN last_updated SET NOT NULL; | ||
ALTER TABLE statistics.user ALTER COLUMN last_updated SET DEFAULT NOW(); | ||
|
||
ALTER TABLE statistics.artist ALTER COLUMN last_updated SET NOT NULL; | ||
ALTER TABLE statistics.artist ALTER COLUMN last_updated SET DEFAULT NOW(); | ||
|
||
|
||
ALTER TABLE statistics.release ALTER COLUMN last_updated SET NOT NULL; | ||
ALTER TABLE statistics.release ALTER COLUMN last_updated SET DEFAULT NOW(); | ||
|
||
ALTER TABLE statistics.recording ALTER COLUMN last_updated SET NOT NULL; | ||
ALTER TABLE statistics.recording ALTER COLUMN last_updated SET DEFAULT NOW(); | ||
|
||
COMMIT; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
"""This module contains functions to insert and retrieve statistics | ||
calculated from Google BigQuery into the database. | ||
""" | ||
|
||
import sqlalchemy | ||
import ujson | ||
from listenbrainz import db | ||
|
||
|
||
# XXX(param): think about the names of these stats variables | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. which variables are you referring to? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which is more appropriate There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. me thinks artist_stats but the column in the db is artists There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ugh. DB table names should always be singular. Can we still change this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I did this before I became aware of the singular names thing. Can be easily changed, I'll make an issue for it. https://tickets.metabrainz.org/browse/LB-201 |
||
# should they be artist_stats and so on instead? | ||
# Note: names are used in tests and stats/calculate.py also | ||
|
||
def insert_user_stats(user_id, artists, recordings, releases, artist_count): | ||
# XXX(param): should this name be upsert_user_stats? | ||
|
||
# put all artist stats into one dict which will then be inserted | ||
# into the artist column of the stats.user table | ||
# XXX(param): This makes the schema of the stats very variable though. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please elaborate. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have two artist related queries we run for users right now, the results of both of these queries go into the same jsonb field in the db. The jsonb looks like:
If someone were to add a new artist stat for users, it would go in the same field, so I guess we should document the format of these fields somewhere. |
||
artist_stats = { | ||
'count': artist_count, | ||
'all_time': artists | ||
} | ||
|
||
with db.engine.connect() as connection: | ||
connection.execute(sqlalchemy.text(""" | ||
INSERT INTO statistics.user (user_id, artists, recordings, releases) | ||
VALUES (:user_id, :artists, :recordings, :releases) | ||
ON CONFLICT (user_id) | ||
DO UPDATE SET artists = :artists, | ||
recordings = :recordings, | ||
releases = :releases, | ||
last_updated = NOW() | ||
"""), { | ||
'user_id': user_id, | ||
'artists': ujson.dumps(artist_stats), | ||
'recordings': ujson.dumps(recordings), | ||
'releases': ujson.dumps(releases) | ||
} | ||
) | ||
|
||
|
||
def get_user_stats(user_id): | ||
|
||
with db.engine.connect() as connection: | ||
result = connection.execute(sqlalchemy.text(""" | ||
SELECT user_id, artists, releases, recordings | ||
FROM statistics.user | ||
WHERE user_id = :user_id | ||
"""), { | ||
'user_id': user_id | ||
} | ||
) | ||
row = result.fetchone() | ||
return dict(row) if row else None |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# -*- coding: utf-8 -*- | ||
import time | ||
import uuid | ||
import json | ||
import os | ||
import listenbrainz.db.user as db_user | ||
import listenbrainz.db.stats as db_stats | ||
from listenbrainz.db.testing import DatabaseTestCase | ||
|
||
|
||
class StatsDatabaseTestCase(DatabaseTestCase): | ||
|
||
TEST_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'testdata') | ||
|
||
def setUp(self): | ||
DatabaseTestCase.setUp(self) | ||
self.user = db_user.get_or_create('stats_user') | ||
|
||
def path_to_data_file(self, filename): | ||
# XXX(param): we have this function in IntegrationTestCase also, | ||
# maybe find some way to share it | ||
# ListenBrainzTestCase? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thats probably ok. |
||
return os.path.join(StatsDatabaseTestCase.TEST_DATA_PATH, filename) | ||
|
||
def test_insert_user_stats(self): | ||
|
||
with open(self.path_to_data_file('user_top_artists.json')) as f: | ||
artists = json.load(f) | ||
with open(self.path_to_data_file('user_top_releases.json')) as f: | ||
releases = json.load(f) | ||
with open(self.path_to_data_file('user_top_recordings.json')) as f: | ||
recordings = json.load(f) | ||
|
||
|
||
db_stats.insert_user_stats( | ||
user_id=self.user['id'], | ||
artists=artists, | ||
recordings=recordings, | ||
releases=releases, | ||
artist_count=2, | ||
) | ||
|
||
# TODO(param): test the last_updated values too | ||
result = db_stats.get_user_stats(user_id=self.user['id']) | ||
self.assertDictEqual(result['artists']['all_time'], artists) | ||
self.assertEqual(result['artists']['count'], 2) | ||
self.assertDictEqual(result['releases'], releases) | ||
self.assertDictEqual(result['recordings'], recordings) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,10 @@ | ||
|
||
from listenbrainz import db | ||
import uuid | ||
import sqlalchemy | ||
from listenbrainz.db.exceptions import DatabaseException | ||
import logging | ||
import time | ||
from listenbrainz import db | ||
from listenbrainz.db.exceptions import DatabaseException | ||
from listenbrainz import config | ||
|
||
logger = logging.getLogger(__name__) | ||
logger.setLevel(logging.INFO) | ||
|
@@ -213,3 +213,19 @@ def update_latest_import(musicbrainz_id, ts): | |
except sqlalchemy.exc.ProgrammingError as e: | ||
logger.error(e) | ||
raise DatabaseException | ||
|
||
|
||
def get_recently_logged_in_users(): | ||
"""Returns a list of users who have logged-in in the last X days, X is | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can say |
||
defined in the config. | ||
""" | ||
with db.engine.connect() as connection: | ||
result = connection.execute(sqlalchemy.text(""" | ||
SELECT {columns} | ||
FROM "user" | ||
WHERE last_login >= NOW() - INTERVAL ':x days' | ||
""".format(columns=','.join(USER_GET_COLUMNS))), { | ||
#TODO(param): think about the name of this variable in config | ||
'x': config.STATS_CALCULATE_LOGIN | ||
}) | ||
return [dict(row) for row in result] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import listenbrainz.stats.user as stats_user | ||
import listenbrainz.db.user as db_user | ||
import listenbrainz.db.stats as db_stats | ||
from listenbrainz import db | ||
from listenbrainz import config | ||
from listenbrainz import stats | ||
|
||
|
||
def calculate_user_stats(): | ||
for user in db_user.get_recently_logged_in_users(): | ||
recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id']) | ||
artists = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't align equals signs |
||
releases = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id']) | ||
artist_count = stats_user.get_artist_count(musicbrainz_id=user['musicbrainz_id']) | ||
|
||
db_stats.insert_user_stats( | ||
user_id=user['id'], | ||
artists=artists, | ||
recordings=recordings, | ||
releases=releases, | ||
artist_count=artist_count | ||
) | ||
|
||
def calculate_stats(): | ||
calculate_user_stats() | ||
|
||
if __name__ == '__main__': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. https://tickets.metabrainz.org/browse/LB-214 Overall, this is too simplistic for production. |
||
print('Connecting to Google BigQuery...') | ||
stats.init_bigquery_connection() | ||
print('Connecting to database...') | ||
db.init_db_connection(config.SQLALCHEMY_DATABASE_URI) | ||
print('Connected!') | ||
print('Calculating statistics using Google BigQuery...') | ||
calculate_stats() | ||
print('Calculations done!') |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,11 +133,49 @@ def get_top_artists(musicbrainz_id, time_interval=None): | |
|
||
return stats.run_query(query, parameters) | ||
|
||
|
||
def get_artist_count(musicbrainz_id, time_interval=None): | ||
""" Get artist count for user with given MusicBrainz ID over a particular period of time | ||
|
||
Args: musicbrainz_id (str): the MusicBrainz ID of the user | ||
time_interval (str): the time interval over which artist count should be returned | ||
(defaults to all time) | ||
|
||
Returns: artist_count (int): total number of artists listened to by the user in that | ||
period of time | ||
""" | ||
|
||
filter_clause = "" | ||
if time_interval: | ||
filter_clause = "AND listened_at >= TIMESTAMP_SUB(CURRENT_TIME(), INTERVAL {})".format(time_interval) | ||
|
||
query = """SELECT COUNT(DISTINCT(artist_msid)) as artist_count | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just thinking out loud here... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The stats module is where I intended to keep all BigQuery SQL statements. I think that is what you mean by the "bigquery" module? Should I maybe rename it? (or keep a |
||
FROM {dataset_id}.{table_id} | ||
WHERE user_name = @musicbrainz_id | ||
{time_filter_clause} | ||
""".format( | ||
dataset_id=config.BIGQUERY_DATASET_ID, | ||
table_id=config.BIGQUERY_TABLE_ID, | ||
time_filter_clause=filter_clause, | ||
) | ||
|
||
parameters = [ | ||
{ | ||
'name': 'musicbrainz_id', | ||
'type': 'STRING', | ||
'value': musicbrainz_id, | ||
} | ||
] | ||
|
||
return stats.run_query(query, parameters)['listen_count'] | ||
|
||
|
||
|
||
def get_top_releases(musicbrainz_id, time_interval=None): | ||
""" Get top releases for user with given MusicBrainz ID over a particular period of time | ||
|
||
Args: musicbrainz_id (str): the MusicBrainz ID of the user | ||
time_interval (str): the time interval over which top artists should be returned | ||
time_interval (str): the time interval over which top releases should be returned | ||
(defaults to all time) | ||
|
||
Returns: A sorted list of dicts with the following structure | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"all_time": [ | ||
{ | ||
"listen_count": 230, | ||
"artist_msid": "ed6c388e-ea65-431f-8be5-4959239d8c65", | ||
"name": "Kanye West" | ||
}, | ||
{ | ||
"listen_count": 229, | ||
"artist_msid": "80ca06c7-07fb-4b3a-a315-ad584cc8eeb0", | ||
"name": "Frank Ocean" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"all_time": [ | ||
{ | ||
"listen_count": 230, | ||
"artist_msid": "59f4d2c6-ca76-4dc7-80de-c3fef6b51211", | ||
"name": "Fade" | ||
}, | ||
{ | ||
"listen_count": 229, | ||
"artist_msid": "c2375e2e-e98a-4c6b-a139-3aae0a831ebf", | ||
"name": "Nikes" | ||
} | ||
] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"all_time": [ | ||
{ | ||
"listen_count": 230, | ||
"artist_msid": "19eadc02-1f64-48fe-bb4c-33732b96f7b1", | ||
"name": "The Life Of Pablo" | ||
}, | ||
{ | ||
"listen_count": 229, | ||
"artist_msid": "155fa8f8-ef04-471d-ab14-dd21838338d7", | ||
"name": "Blonde" | ||
} | ||
] | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
timestamp 0