In [49]:
import pandas as pd
import csv
import glob
from datetime import datetime

CSV_GLOB_PATTERN = '../plays/*.csv'

POSSIBLE_TIMESTAMP_COLUMN_NAMES = ['uts', 'timestamp']
POSSIBLE_ARTIST_COLUMN_NAMES = ['artist']
POSSIBLE_TITLE_COLUMN_NAMES = ['title', 'track']
POSSIBLE_ALBUM_COLUMN_NAMES = ['album']

PLAYED_AT = 'played_at_utc'
ARTIST_NAME = 'artist_name'
TRACK_TITLE = 'track_title'
ALBUM_TITLE = 'album_title'
DF_HEADER = [PLAYED_AT, ARTIST_NAME, TRACK_TITLE, ALBUM_TITLE]


def get_column_name(df, possible_column_names):
    """Return the first candidate name that is present in ``df``.

    Candidates are tried in the order given and membership is tested with
    ``name in df``. Returns ``None`` when none of the candidates match.
    """
    matching_names = (name for name in possible_column_names if name in df)
    return next(matching_names, None)

        
def timestamp_to_datetime(timestamp):
    """Convert a POSIX timestamp to a naive ``datetime`` expressed in UTC.

    The result is stored in the ``played_at_utc`` column, so the conversion
    must not depend on the timezone of the machine running the notebook.
    ``datetime.fromtimestamp`` without ``tz`` yields local time, which would
    silently mislabel the values on any non-UTC machine.

    Args:
        timestamp: Seconds since the epoch, as an ``int`` or anything
            ``int()`` accepts (e.g. the digit strings produced by the CSV
            reader).

    Returns:
        A timezone-naive ``datetime`` holding the UTC wall-clock time.

    Raises:
        ValueError: If ``timestamp`` cannot be converted with ``int()``.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    utc_moment = datetime.fromtimestamp(int(timestamp), tz=timezone.utc)
    # Drop tzinfo so callers keep receiving naive datetimes, as before.
    return utc_moment.replace(tzinfo=None)


def load_data_frame(csv_glob_pattern=None):
    """Load every matching CSV into one frame with normalised column names.

    Each file is read with ``csv.DictReader`` (so every value stays a string
    and empty fields stay ``''`` rather than becoming NaN). The per-file
    timestamp/artist/title/album columns are renamed to the ``DF_HEADER``
    names, and the timestamp column is converted with
    ``timestamp_to_datetime``.

    Args:
        csv_glob_pattern: Glob pattern selecting the CSV files. Defaults to
            ``CSV_GLOB_PATTERN`` when omitted.

    Returns:
        A ``DataFrame`` with exactly the ``DF_HEADER`` columns. It is empty
        when no file matches or no file contains any rows. Row labels are the
        per-file row numbers, so they repeat across files.

    Raises:
        KeyError: If a file has rows but lacks one of the expected columns.
    """
    if csv_glob_pattern is None:
        csv_glob_pattern = CSV_GLOB_PATTERN

    column_candidates = (
        (POSSIBLE_TIMESTAMP_COLUMN_NAMES, PLAYED_AT),
        (POSSIBLE_ARTIST_COLUMN_NAMES, ARTIST_NAME),
        (POSSIBLE_TITLE_COLUMN_NAMES, TRACK_TITLE),
        (POSSIBLE_ALBUM_COLUMN_NAMES, ALBUM_TITLE),
    )

    frames = []
    # glob.glob returns paths in arbitrary order; sort for a reproducible row order.
    for file_path in sorted(glob.glob(csv_glob_pattern)):
        # newline='' is required by the csv module so quoted fields that
        # contain line breaks are parsed correctly.
        # NOTE(review): the text encoding is still the platform default —
        # confirm the files' encoding and pass it explicitly if it differs.
        with open(file_path, 'r', newline='') as csv_in:
            rows = list(csv.DictReader(csv_in))

        # A header-only or empty file contributes nothing; skip it instead of
        # failing on the column lookup below.
        if not rows:
            continue

        df = pd.DataFrame(rows)

        renames = {}
        for candidates, target_name in column_candidates:
            source_name = get_column_name(df, candidates)
            if source_name is None:
                raise KeyError(
                    f'{file_path}: none of the columns {candidates} found '
                    f'(needed for {target_name!r})'
                )
            renames[source_name] = target_name

        frames.append(df.rename(columns=renames)[DF_HEADER])

    if not frames:
        return pd.DataFrame(columns=DF_HEADER)

    # Concatenate once: DataFrame.append was removed in pandas 2.0, and
    # growing a frame inside the loop copies all previous rows every time.
    result = pd.concat(frames)
    result[PLAYED_AT] = result[PLAYED_AT].apply(timestamp_to_datetime)
    return result

In [50]:
# Load all play CSVs once; the aggregation cells below all read from `df`.
df = load_data_frame()

In [53]:
# Row count per (track title, artist) pair, largest first, top 100.
(
    df.groupby([TRACK_TITLE, ARTIST_NAME])[TRACK_TITLE]
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(100)
)

Unnamed: 0,track_title,artist_name,count
20071,The Damage Is Done,Foreigner,384
14782,Of a Lifetime,Journey,165
14025,Naïve,The Kooks,140
22369,Weil du mich liebst?,J-Luv,140
24008,will you miss me,Heatmakerz,139
11506,La La La,LMFAO,132
15744,Please Please Please,Shout Out Louds,130
21954,Walk These Streets,Rakim,130
23705,computiful,Cro,127
11958,Letzeee Gooo (L.A.A.S.A.V.A.S.) (Feat. Laas Un...,Kool Savas,124


In [54]:
# Row count per (album title, artist) pair, largest first, top 100.
(
    df.groupby([ALBUM_TITLE, ARTIST_NAME])[ALBUM_TITLE]
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(100)
)

Unnamed: 0,album_title,artist_name,count
11705,tru. (Deluxe Edition),Cro,1153
6720,Minecraft - Volume Alpha,C418,1017
349,,Kollegah,580
7537,Palmen & Freunde,Dexter,574
589,,Tax Awesome,548
492,,RePete,535
6564,Man on the Moon: The End of Day,Kid Cudi,509
9590,Thank Me Later,Drake,459
359,,LMFAO,431
5883,Kollegah,Kollegah,421


In [55]:
# Row count per artist, largest first, top 100.
(
    df.groupby([ARTIST_NAME])[ARTIST_NAME]
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
    .head(100)
)

Unnamed: 0,artist_name,count
2867,Kollegah,2501
1509,Drake,1984
2713,Kanye West,1789
1075,Cro,1483
2800,Kid Cudi,1396
779,C418,1148
5601,Wiz Khalifa,1102
4285,Rick Ross,1042
2776,Kendrick Lamar,996
47,50 Cent,904
