In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import seaborn as sns

''' load the activity and profile dataframes, naming the columns and keeping only the ones needed '''

In [2]:
# Load the per-user play counts. header=None: the file is read as having no
# header row, so all four fields are named here and only three are kept
# (the musicbrainz artist id is dropped via usecols).
activity_data = pd.read_table(
    'usersha1-artmbid-artname-plays.tsv',
    header=None,
    names=['users', 'musicbrainz-artist-id', 'artist-name', 'plays'],
    usecols=['users', 'artist-name', 'plays'],
)

In [4]:
# Load the user profiles. The file is read as having no header row; the
# 'signup' field is parsed as dates and used as the frame's index.
profile_data = pd.read_table(
    'usersha1-profile.tsv',
    header=None,
    names=['users', 'gender', 'age', 'country', 'signup'],
    usecols=['users', 'gender', 'age', 'country', 'signup'],
    parse_dates=['signup'],
    index_col=['signup'],
)

''' sort time index of profile df '''

In [6]:
# Order the profiles by their signup index. Rebinding the sorted result
# (rather than sorting in place) keeps the cell explicit about what it
# produces and avoids mutating the object shown by earlier cells.
profile_data = profile_data.sort_index()

''' fill missing age values with the mean age, rounded to the nearest whole number '''

In [7]:
# Replace missing ages with the column mean, rounded to a whole number.
mean_age = round(profile_data['age'].mean())
profile_data['age'] = profile_data['age'].fillna(mean_age)

''' backfill missing gender values '''

In [8]:
# Back-fill: each missing gender takes the next non-missing value in row order.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form works on
# an intermediate object and does not update the frame under pandas
# Copy-on-Write, and fillna's `method=` argument is deprecated.
# NOTE: trailing rows with no later non-missing value stay NaN.
profile_data['gender'] = profile_data['gender'].bfill()

''' filter US and Canada users '''

In [9]:
# Keep only the rows whose country is the United States or Canada.
is_us_or_canada = profile_data['country'].isin(['United States', 'Canada'])
usa_profiles = profile_data.loc[is_us_or_canada]

''' remove age outliers: keep only users older than 15 and younger than 75 '''

In [10]:
# Keep ages strictly between 15 and 75; rows with a missing age fail both
# comparisons and are dropped.
age_in_range = (usa_profiles['age'] > 15) & (usa_profiles['age'] < 75)
usa_profiles = usa_profiles[age_in_range]

''' merge dataframes on users column '''

In [79]:
# Join the play counts with the filtered profiles on the shared 'users' column.
# how='inner' is pandas' default, spelled out here: only users present in both
# frames are kept. (The original line assigned `full_df` twice by mistake.)
full_df = pd.merge(activity_data, usa_profiles, on='users', how='inner')

''' rename columns of merged dataframe '''

In [82]:
# Rename by name rather than by position: 'artist-name' is the only column
# whose label changes, and a positional `full_df.columns = [...]` would
# silently mislabel data if the merged column order ever differed.
# rename() ignores missing keys by default, so re-running this cell is harmless.
full_df = full_df.rename(columns={'artist-name': 'artist'})

''' keep only the users and artist columns '''

In [85]:
# Two-column view of the merged data: which user played which artist.
user_artist = full_df.loc[:, ['users', 'artist']]

In [96]:
# Number of distinct users (missing values are not counted).
user_artist.loc[:, 'users'].nunique(dropna=True)

74617

In [97]:
# Number of distinct artists (missing values are not counted).
user_artist.loc[:, 'artist'].nunique(dropna=True)

121344

''' build a dictionary mapping each user to the list of their artists '''

In [99]:
# Group the rows by user, collect each group's artist names into a list,
# then turn the resulting Series into a plain dict keyed by user.
artists_by_user = user_artist.groupby('users')['artist'].apply(list)
user_artist_dict = artists_by_user.to_dict()

In [101]:
# Sanity check: confirm the conversion produced a plain dict.
type(user_artist_dict)

dict

In [102]:
# One key per distinct user; should equal user_artist['users'].nunique().
len(user_artist_dict)

74617

''' build a dictionary mapping each artist to the list of its users '''

In [104]:
# Group the rows by artist, collect each group's users into a list,
# then turn the resulting Series into a plain dict keyed by artist.
users_by_artist = user_artist.groupby('artist')['users'].apply(list)
artist_user_dict = users_by_artist.to_dict()

In [107]:
# Sanity check: confirm the conversion produced a plain dict.
type(artist_user_dict)

dict

In [105]:
# One key per distinct artist; should equal user_artist['artist'].nunique().
len(artist_user_dict)

121344