# 07_05: Simulating data

In [None]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [None]:
import random
import faker

In [None]:
# Create the Faker generator used by every simulated field in this notebook.
fake = faker.Faker()

In [None]:
# The cells below sample the individual providers used later on;
# each call returns a new random value.
fake.name()

In [None]:
fake.address()

In [None]:
# Returns a Python date (converted to a pandas date object in artist()).
fake.date_of_birth()

In [None]:
fake.city()

In [None]:
fake.state_abbr()

In [None]:
# Genres from which each simulated artist's genre is drawn.
music_genres = [
    'Rock', 'Pop', 'Jazz', 'Hip Hop', 'Country',
    'Electronic', 'Blues', 'Reggae', 'Metal', 'Folk',
    'Soul', 'R&B', 'Punk', 'Indie', 'Alternative',
]

In [None]:
def artist():
    """Generate one simulated artist record.

    Returns:
        dict with keys 'name', 'dob' (date of birth for an age between
        16 and 80, converted with pd.to_datetime), 'birthplace' (city
        followed by a state abbreviation) and 'genre' (one entry of
        music_genres).
    """
    full_name = fake.name()
    # Faker hands back a Python date; convert it to a pandas date object
    born = pd.to_datetime(fake.date_of_birth(minimum_age=16, maximum_age=80))
    hometown = f'{fake.city()} {fake.state_abbr()}'
    style = fake.random_element(music_genres)

    return dict(name=full_name, dob=born, birthplace=hometown, genre=style)

In [None]:
# Draw one sample record to check its fields.
artist()

In [None]:
# Build a table of 100 simulated artists; the default integer index
# (0..99) is named so that it can serve as the artist id.
artists = pd.DataFrame([artist() for i in range(100)])
artists.index.name = 'artist_id'

In [None]:
artists

In [None]:
def song():
    """Generate one simulated song record credited to a random artist.

    The artist is picked from the index of the `artists` table, and the
    song inherits that artist's genre.

    Returns:
        dict with keys 'title', 'release' (converted with pd.to_datetime),
        'artist_id' and 'genre'.
    """
    artist_id = fake.random_element(artists.index)

    # drop the last character of the generated text (the final period)
    title = fake.text(max_nb_chars=20)[:-1]
    # a date within the last two years
    released = pd.to_datetime(fake.past_date(start_date="-2y"))

    record = {'title': title, 'release': released, 'artist_id': artist_id}
    record['genre'] = artists.loc[artist_id, 'genre']
    return record

In [None]:
# Each call picks a new random artist, so repeated calls give different records.
song()

In [None]:
song()

In [None]:
# Build a table of 500 simulated songs; the default integer index
# (0..499) is named so that it can serve as the song id.
songs = pd.DataFrame([song() for i in range(500)])
songs.index.name = 'song_id'

In [None]:
songs

In [None]:
# Release date of the first song, used below to prototype a play history.
songs.loc[0].release

In [None]:
# Number of weeks in the simulated history: a random integer from 10 to 49
# (the upper bound of randint is exclusive).
duration = np.random.randint(10, 50)

In [None]:
# Date `duration` weeks after the release.
songs.loc[0].release + pd.Timedelta(weeks=duration)

In [None]:
# Mondays between the release and the end date. When the release itself
# falls on a Monday the range holds duration + 1 dates, so the slice trims
# it back to exactly `duration` entries.
t = pd.date_range(start=songs.loc[0].release,
                  end=songs.loc[0].release + pd.Timedelta(weeks=duration),
                  freq='W-MON')[:duration] # handle case where release is on Monday
t

In [None]:
# Plays in the first week: a random integer from 500000 to 999999.
initial_plays = np.random.randint(500000, 1000000)

In [None]:
# Linear decay from initial_plays in the first week down to
# initial_plays / duration in the last one.
cleanplays = initial_plays * np.linspace(1, 1/duration, duration)

In [None]:
# Scale every week by its own uniform random factor between 0.8 and 1.2.
noisyplays = cleanplays * np.random.uniform(0.8, 1.2, duration)

In [None]:
# Compare the smooth decay with its noisy version.
pp.figure(figsize=(5,3))
pp.plot(t, cleanplays)
pp.plot(t, noisyplays)
pp.xticks(rotation=30); # angled tick labels are more readable

In [None]:
def history(song_id):
    """Simulate the weekly play counts of one song.

    The history covers a random number of weeks (10 to 49), dated on the
    Mondays on or after the song's release. Plays start at a random level
    between 500000 and 999999 and decay linearly to 1/n_weeks of that
    level, with every week scaled by a uniform random factor in 0.8-1.2.

    Args:
        song_id: label of the song in the index of the `songs` table.

    Returns:
        DataFrame with one row per week and columns 'song_id', 'plays'
        (truncated to int) and 'date'; its default index counts the weeks
        from 0.
    """
    n_weeks = np.random.randint(10, 50)

    # With periods=, date_range starts at the first Monday on or after the
    # release and yields exactly n_weeks dates, so no trimming is needed
    # when the release itself falls on a Monday.
    weeks = pd.date_range(start=songs.loc[song_id, 'release'],
                          periods=n_weeks, freq='W-MON')

    initial_plays = np.random.randint(500000, 1000000)
    # End the decay at 1/n_weeks rather than 0, so the final week is not
    # forced to zero plays.
    decay = np.linspace(1, 1/n_weeks, n_weeks)
    noise = np.random.uniform(0.8, 1.2, n_weeks)
    plays = initial_plays * decay * noise

    return pd.DataFrame({'song_id': song_id, 'plays': plays.astype(int), 'date': weeks})

In [None]:
# Try the generator on one song.
history(1).head()

In [None]:
# Stack the histories of all songs. Each history carries its own 0-based
# index, so resetting the index turns it into a per-song week counter.
histories = pd.concat([history(song_id) for song_id in songs.index], axis=0) \
              .reset_index().rename(columns={'index': 'week'})

In [None]:
histories

In [None]:
# Keep only the entries dated within the 52 weeks up to now.
histories = histories[(histories.date > pd.Timestamp.today() - pd.Timedelta(weeks=52)) &
                      (histories.date <= pd.Timestamp.today())]

In [None]:
# Chronological order and, within each date, most-played songs first.
histories = histories.sort_values(['date', 'plays'], ascending=[True, False])

In [None]:
# Thanks to the sort above, the first 10 rows of every date are that
# week's 10 most-played songs.
topten = histories.groupby('date').head(10).set_index('date')
topten

In [None]:
# Show the chart of the most recent week. The simulated dates are relative
# to the day the notebook runs, so a hard-coded date would raise KeyError
# on other days. The list indexer always returns a DataFrame.
topten.loc[[topten.index.max()]]

In [None]:
# Rank the songs of each week by plays, 1 being the most played.
# method='first' breaks ties by row order (rows are already sorted by
# plays), so every week has exactly one entry per rank; the default
# 'average' would give tied songs fractional ranks such as 1.5 and could
# leave a week without any rank == 1 row.
topten['rank'] = topten.groupby('date').plays.rank(method='first', ascending=False)

In [None]:
topten

In [None]:
songs.head()

In [None]:
artists.head()

In [None]:
# Attach the song details by matching song_id against the songs index,
# then the artist's name by matching artist_id against the artists index.
merged = topten.merge(songs, left_on='song_id', right_index=True) \
               .merge(artists.name, left_on='artist_id', right_index=True)
merged

In [None]:
# Count the number-one entries per artist name and keep the artists with
# the most of them (head() keeps the first five).
first = merged.query('rank == 1').groupby('name').title.count().sort_values(ascending=False).head()
first

In [None]:
# All number-one entries of the artist who tops that list.
merged[(merged['rank'] == 1) & (merged['name'] == first.index[0])]

In [None]:
# Store the three tables in one HDF5 file, each under its own key.
for key, table in (('topten', merged), ('artists', artists), ('songs', songs)):
    table.to_hdf('songtables.h5', key=key)

In [None]:
h5 = pd.HDFStore('songtables.h5')

In [None]:
h5.keys()

In [None]:
h5['/artists']