metabrainz · mayhem · Oct 10, 2017 · Jul 3, 2017 · Aug 2, 2017 · Aug 3, 2017
diff --git a/admin/sql/create_tables.sql b/admin/sql/create_tables.sql
@@ -49,7 +49,7 @@ CREATE TABLE statistics.user (
     artists                 JSONB,
     releases                JSONB,
     recordings              JSONB,
-    last_updated            TIMESTAMP WITH TIME ZONE
+    last_updated            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
 CREATE TABLE statistics.artist (
@@ -60,7 +60,7 @@ CREATE TABLE statistics.artist (
     recordings              JSONB,
     users                   JSONB,
     listen_count            JSONB,
-    last_updated            TIMESTAMP WITH TIME ZONE
+    last_updated            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 ALTER TABLE statistics.artist ADD CONSTRAINT artist_stats_msid_uniq UNIQUE (msid);
 
@@ -71,7 +71,7 @@ CREATE TABLE statistics.release (
     recordings              JSONB,
     users                   JSONB,
     listen_count            JSONB,
-    last_updated            TIMESTAMP WITH TIME ZONE
+    last_updated            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 ALTER TABLE statistics.release ADD CONSTRAINT release_stats_msid_uniq UNIQUE (msid);
 
@@ -81,7 +81,7 @@ CREATE TABLE statistics.recording (
     name                    VARCHAR,
     users_all_time          JSONB,
     listen_count            JSONB,
-    last_updated            TIMESTAMP WITH TIME ZONE
+    last_updated            TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 
 );
 ALTER TABLE statistics.recording ADD CONSTRAINT recording_stats_msid_uniq UNIQUE (msid);

diff --git a/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql b/admin/sql/updates/2017-08-03-make-stats-updated-not-null.sql
@@ -0,0 +1,19 @@
+BEGIN;
+
+-- XXX(param): What should values that are null already be set to here now?
+-- timestamp 0 or NOW() ?
+
+ALTER TABLE statistics.user ALTER COLUMN last_updated SET NOT NULL;
+ALTER TABLE statistics.user ALTER COLUMN last_updated SET DEFAULT NOW();
+
+ALTER TABLE statistics.artist ALTER COLUMN last_updated SET NOT NULL;
+ALTER TABLE statistics.artist ALTER COLUMN last_updated SET DEFAULT NOW();
+
+
+ALTER TABLE statistics.release ALTER COLUMN last_updated SET NOT NULL;
+ALTER TABLE statistics.release ALTER COLUMN last_updated SET DEFAULT NOW();
+
+ALTER TABLE statistics.recording ALTER COLUMN last_updated SET NOT NULL;
+ALTER TABLE statistics.recording ALTER COLUMN last_updated SET DEFAULT NOW();
+
+COMMIT;
diff --git a/listenbrainz/config.py.sample b/listenbrainz/config.py.sample
@@ -54,6 +54,9 @@ BIGQUERY_TABLE_ID = "listen"
 
 # Stats
 STATS_ENTITY_LIMIT = 100 # the number of entities to calculate at max with BQ
+STATS_CALCULATE_LOGIN = 30
+STATS_CALCULATION_INTERVAL = 7
+
 
 # Max time in seconds after which the playing_now stream will expire.
 PLAYING_NOW_MAX_DURATION = 10 * 60

diff --git a/listenbrainz/db/stats.py b/listenbrainz/db/stats.py
@@ -0,0 +1,55 @@
+"""This module contains functions to insert and retrieve statistics
+   calculated from Google BigQuery into the database.
+"""
+
+import sqlalchemy
+import ujson
+from listenbrainz import db
+
+
+# XXX(param): think about the names of these stats variables
+# should they be artist_stats and so on instead?
+# Note: names are used in tests and stats/calculate.py also
+
+def insert_user_stats(user_id, artists, recordings, releases, artist_count):
+    # XXX(param): should this name be upsert_user_stats?
+
+    # put all artist stats into one dict which will then be inserted
+    # into the artist column of the stats.user table
+    # XXX(param): This makes the schema of the stats very variable though.
+    artist_stats = {
+        'count': artist_count,
+        'all_time': artists
+    }
+
+    with db.engine.connect() as connection:
+        connection.execute(sqlalchemy.text("""
+            INSERT INTO statistics.user (user_id, artists, recordings, releases)
+                 VALUES (:user_id, :artists, :recordings, :releases)
+            ON CONFLICT (user_id)
+          DO UPDATE SET artists = :artists,
+                        recordings = :recordings,
+                        releases = :releases,
+                        last_updated = NOW()
+            """), {
+                'user_id': user_id,
+                'artists': ujson.dumps(artist_stats),
+                'recordings': ujson.dumps(recordings),
+                'releases': ujson.dumps(releases)
+            }
+        )
+
+
+def get_user_stats(user_id):
+
+    with db.engine.connect() as connection:
+        result = connection.execute(sqlalchemy.text("""
+            SELECT user_id, artists, releases, recordings
+              FROM statistics.user
+             WHERE user_id = :user_id
+            """), {
+                'user_id': user_id
+            }
+        )
+        row = result.fetchone()
+        return dict(row) if row else None
diff --git a/listenbrainz/db/tests/test_stats.py b/listenbrainz/db/tests/test_stats.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+import time
+import uuid
+import json
+import os
+import listenbrainz.db.user as db_user
+import listenbrainz.db.stats as db_stats
+from listenbrainz.db.testing import DatabaseTestCase
+
+
+class StatsDatabaseTestCase(DatabaseTestCase):
+
+    TEST_DATA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'testdata')
+
+    def setUp(self):
+        DatabaseTestCase.setUp(self)
+        self.user = db_user.get_or_create('stats_user')
+
+    def path_to_data_file(self, filename):
+        # XXX(param): we have this function in IntegrationTestCase also,
+        # maybe find some way to share it
+        # ListenBrainzTestCase?
+       return os.path.join(StatsDatabaseTestCase.TEST_DATA_PATH, filename)
+
+    def test_insert_user_stats(self):
+
+        with open(self.path_to_data_file('user_top_artists.json')) as f:
+            artists = json.load(f)
+        with open(self.path_to_data_file('user_top_releases.json')) as f:
+            releases = json.load(f)
+        with open(self.path_to_data_file('user_top_recordings.json')) as f:
+            recordings = json.load(f)
+
+
+        db_stats.insert_user_stats(
+            user_id=self.user['id'],
+            artists=artists,
+            recordings=recordings,
+            releases=releases,
+            artist_count=2,
+        )
+
+        # TODO(param): test the last_updated values too
+        result = db_stats.get_user_stats(user_id=self.user['id'])
+        self.assertDictEqual(result['artists']['all_time'], artists)
+        self.assertEqual(result['artists']['count'], 2)
+        self.assertDictEqual(result['releases'], releases)
+        self.assertDictEqual(result['recordings'], recordings)
+
diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py
@@ -1,10 +1,9 @@
 # -*- coding: utf-8 -*-
-from listenbrainz.db.testing import DatabaseTestCase
-import listenbrainz.db.user as db_user
-from listenbrainz import db
 import time
 import sqlalchemy
-
+from listenbrainz import db
+from listenbrainz.db.testing import DatabaseTestCase
+import listenbrainz.db.user as db_user
 
 class UserTestCase(DatabaseTestCase):
 
@@ -59,3 +58,34 @@ def test_update_latest_import(self):
         db_user.update_latest_import(user['musicbrainz_id'], val)
         user = db_user.get_by_mb_id(user['musicbrainz_id'])
         self.assertEqual(val, int(user['latest_import'].strftime('%s')))
+
+    def test_get_recently_logged_in_users(self):
+        """Tests getting recently logged in users"""
+
+        # create two users, set one's last_login
+        # to a very old value and one's last_login
+        # to now and then call get_recently_logged_in_users
+        user1 = db_user.get_or_create('recentuser1')
+        with db.engine.connect() as connection:
+            connection.execute(sqlalchemy.text("""
+                UPDATE "user"
+                   SET last_login = to_timestamp(0)
+                 WHERE musicbrainz_id = :musicbrainz_id
+                """), {
+                    'musicbrainz_id': 'recentuser1'
+                })
+
+        user2 = db_user.get_or_create('recentuser2')
+        with db.engine.connect() as connection:
+            connection.execute(sqlalchemy.text("""
+                UPDATE "user"
+                   SET last_login = NOW()
+                 WHERE musicbrainz_id = :musicbrainz_id
+                """), {
+                    'musicbrainz_id': 'recentuser2'
+                })
+
+        recent_users = db_user.get_recently_logged_in_users()
+        self.assertEqual(len(recent_users), 1)
+        self.assertEqual(recent_users[0]['musicbrainz_id'], 'recentuser2')
+
diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py
@@ -1,10 +1,10 @@
-
-from listenbrainz import db
 import uuid
 import sqlalchemy
-from listenbrainz.db.exceptions import DatabaseException
 import logging
 import time
+from listenbrainz import db
+from listenbrainz.db.exceptions import DatabaseException
+from listenbrainz import config
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -213,3 +213,19 @@ def update_latest_import(musicbrainz_id, ts):
             except sqlalchemy.exc.ProgrammingError as e:
                 logger.error(e)
                 raise DatabaseException
+
+
+def get_recently_logged_in_users():
+    """Returns a list of users who have logged-in in the last X days, X is
+       defined in the config.
+    """
+    with db.engine.connect() as connection:
+        result = connection.execute(sqlalchemy.text("""
+                SELECT {columns}
+                  FROM "user"
+                 WHERE last_login >= NOW() - INTERVAL ':x days'
+                """.format(columns=','.join(USER_GET_COLUMNS))), {
+                    #TODO(param): think about the name of this variable in config
+                    'x': config.STATS_CALCULATE_LOGIN
+                })
+        return [dict(row) for row in result]
diff --git a/listenbrainz/stats/calculate.py b/listenbrainz/stats/calculate.py
@@ -0,0 +1,35 @@
+import listenbrainz.stats.user as stats_user
+import listenbrainz.db.user as db_user
+import listenbrainz.db.stats as db_stats
+from listenbrainz import db
+from listenbrainz import config
+from listenbrainz import stats
+
+
+def calculate_user_stats():
+    for user in db_user.get_recently_logged_in_users():
+        recordings = stats_user.get_top_recordings(musicbrainz_id=user['musicbrainz_id'])
+        artists    = stats_user.get_top_artists(musicbrainz_id=user['musicbrainz_id'])
+        releases   = stats_user.get_top_releases(musicbrainz_id=user['musicbrainz_id'])
+        artist_count = stats_user.get_artist_count(musicbrainz_id=user['musicbrainz_id'])
+
+        db_stats.insert_user_stats(
+            user_id=user['id'],
+            artists=artists,
+            recordings=recordings,
+            releases=releases,
+            artist_count=artist_count
+        )
+
+def calculate_stats():
+    calculate_user_stats()
+
+if __name__ == '__main__':
+    print('Connecting to Google BigQuery...')
+    stats.init_bigquery_connection()
+    print('Connecting to database...')
+    db.init_db_connection(config.SQLALCHEMY_DATABASE_URI)
+    print('Connected!')
+    print('Calculating statistics using Google BigQuery...')
+    calculate_stats()
+    print('Calculations done!')
diff --git a/listenbrainz/stats/user.py b/listenbrainz/stats/user.py
@@ -133,11 +133,49 @@ def get_top_artists(musicbrainz_id, time_interval=None):
 
     return stats.run_query(query, parameters)
 
+
+def get_artist_count(musicbrainz_id, time_interval=None):
+    """ Get artist count for user with given MusicBrainz ID over a particular period of time 
+
+        Args: musicbrainz_id (str): the MusicBrainz ID of the user
+              time_interval  (str): the time interval over which artist count should be returned
+                                    (defaults to all time)
+
+        Returns: artist_count (int): total number of artists listened to by the user in that
+                                     period of time
+    """
+
+    filter_clause = ""
+    if time_interval:
+        filter_clause = "AND listened_at >= TIMESTAMP_SUB(CURRENT_TIME(), INTERVAL {})".format(time_interval)
+
+    query = """SELECT COUNT(DISTINCT(artist_msid)) as artist_count
+                 FROM {dataset_id}.{table_id}
+                WHERE user_name = @musicbrainz_id
+                {time_filter_clause}
+            """.format(
+                    dataset_id=config.BIGQUERY_DATASET_ID,
+                    table_id=config.BIGQUERY_TABLE_ID,
+                    time_filter_clause=filter_clause,
+                )
+
+    parameters = [
+        {
+            'name': 'musicbrainz_id',
+            'type': 'STRING',
+            'value': musicbrainz_id,
+        }
+    ]
+
+    return stats.run_query(query, parameters)['listen_count']
+
+
+
 def get_top_releases(musicbrainz_id, time_interval=None):
     """ Get top releases for user with given MusicBrainz ID over a particular period of time
 
         Args: musicbrainz_id (str): the MusicBrainz ID of the user
-              time_interval  (str): the time interval over which top artists should be returned
+              time_interval  (str): the time interval over which top releases should be returned
                                     (defaults to all time)
 
         Returns: A sorted list of dicts with the following structure

diff --git a/listenbrainz/testdata/user_top_artists.json b/listenbrainz/testdata/user_top_artists.json
@@ -0,0 +1,14 @@
+{
+    "all_time": [
+        {
+            "listen_count": 230,
+            "artist_msid": "ed6c388e-ea65-431f-8be5-4959239d8c65",
+            "name": "Kanye West"
+        },
+        {
+            "listen_count": 229,
+            "artist_msid": "80ca06c7-07fb-4b3a-a315-ad584cc8eeb0",
+            "name": "Frank Ocean"
+        }
+    ]
+}
diff --git a/listenbrainz/testdata/user_top_recordings.json b/listenbrainz/testdata/user_top_recordings.json
@@ -0,0 +1,14 @@
+{
+    "all_time": [
+        {
+            "listen_count": 230,
+            "artist_msid": "59f4d2c6-ca76-4dc7-80de-c3fef6b51211",
+            "name": "Fade"
+        },
+        {
+            "listen_count": 229,
+            "artist_msid": "c2375e2e-e98a-4c6b-a139-3aae0a831ebf",
+            "name": "Nikes"
+        }
+    ]
+}
diff --git a/listenbrainz/testdata/user_top_releases.json b/listenbrainz/testdata/user_top_releases.json
@@ -0,0 +1,14 @@
+{
+    "all_time": [
+        {
+            "listen_count": 230,
+            "artist_msid": "19eadc02-1f64-48fe-bb4c-33732b96f7b1",
+            "name": "The Life Of Pablo"
+        },
+        {
+            "listen_count": 229,
+            "artist_msid": "155fa8f8-ef04-471d-ab14-dd21838338d7",
+            "name": "Blonde"
+        }
+    ]
+}