# Purpose

This notebook is to process the LastFM-1b UGP dataset to extract the data we are interested in. Go to http://www.cp.jku.at/datasets/LFM-1b/ to download the dataset and extract LFM-1b_UGP.zip into rawData/LFM-1b_UGP

# Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import os

In [2]:
import re
def DropAllNullColumns(data):
    columnsToDrop = []
    for column in data.columns:
        if data[column].isnull().values.all():
            columnsToDrop.append(column)

    print('Dropping all the following columns since all NaN values')
    print(columnsToDrop)
    data.drop(columnsToDrop, axis = 1, inplace = True)
    
def UpperCaseStringColumns(data):
    for column in data.columns:
        if (pd.api.types.infer_dtype(data[column]) == 'string'):
            print(column + ': Upper Casing')
            data[column] = data[column].str.upper()
            
def CompressIntegerColumns(data):
    for column in data.columns:
        if (np.issubdtype(data[column].dtype, np.integer)):       
            minValue = data[column].min()
            maxValue = data[column].max()
            
            info = np.iinfo
            if minValue >= 0:
                types = (np.uint8, np.uint16, np.uint32, np.uint64)
            else:
                types = (np.int8, np.int16, np.int32, np.int64)

            for t in types:
                if info(t).min <= minValue and maxValue <= info(t).max:
                    #print(str(column) + ': Converting to ' + t.__name__)
                    data[column] = data[column].astype(t)
                    break
                
def ConvertFloatColumnsToIntegerIfNoDataLoss(data):
    for column in data.columns:
        try:
            if (np.issubdtype(data[column].dtype, np.float)):
                temp = data[column].astype(np.int64)

                if ((temp == data[column]).all()):
                    print(column + ': Converting to ' + str(temp.dtype))
                    data[column] = temp
        except:
            pass

def ConvertStringColumnsToInt(data):
    for column in data.columns:
        if (pd.api.types.infer_dtype(data[column]) == 'string'):
            if data[column].isnull().values.any():
                continue

            if (data[column].apply(lambda x: re.match('^[0-9,-]+$', x) != None).all()):
                print(column + ': Converting to int')
                data[column] = data[column].str.replace(',', '')
                data[column] = data[column].astype(np.int64) 

def ConvertStringColumnsToFloat(data):
    for column in data.columns:
        if (pd.api.types.infer_dtype(data[column]) == 'string'):
            if data[column].isnull().values.any():
                continue

            if (data[column].apply(lambda x: re.match('^[0-9,-\.]+$', x) != None).all()):
                print(column + ': Converting to float')
                data[column] = data[column].str.replace(',', '')
                data[column] = data[column].astype(np.float64)
                                
def InspectColumnValues(data):
    for column in data.columns:
        try:
            values = data[column].unique()
            print(column + ': ' + str(len(values)))
            print(values[0:10])
            print()
        except:
            print('Error with: ' + column)
            
def SaveData(data, name):
    data = data.reset_index(drop = True)
    data.columns = data.columns.str.replace('_', ' ').str.title()
    if (os.path.exists('../../data/') == False):
        os.makedirs('../../data/')
            
    data.to_parquet('../../data/' + name + '.gzip.parquet', compression = 'gzip', index = False)
    return pd.read_parquet('../../data/' + name + '.gzip.parquet')

## Loading Artists listened to by kids

In [3]:
artists = pd.read_parquet('../../data/LastFM1bKidListeningEventsWithUsers', columns = ['Artist'])
artists = artists['Artist'].cat

## Building Artist to Freebase Genre Map
### Loading Data

In [4]:
with open('LFM-1b_artist_genres_freebase.txt', 'r', encoding='utf-8') as f:
    file_lines = f.read().splitlines()
    
data = pd.DataFrame([string.split('\t') for string in file_lines])

data = data.rename(columns = {0: 'Artist'})
for column in data.columns:
    if (column == 'Artist'):
        continue
        
    data = data.rename(columns = {column: 'Genre'})

print(len(data))
data.head(5)

585055


Unnamed: 0,Artist,Genre,Genre.1,Genre.2,Genre.3,Genre.4,Genre.5,Genre.6,Genre.7,Genre.8,...,Genre.9,Genre.10,Genre.11,Genre.12,Genre.13,Genre.14,Genre.15,Genre.16,Genre.17,Genre.18
0,Megadeth,12,1239,25,1479,50,20,39,,,...,,,,,,,,,,
1,Lil Wayne,248,1181,248,998,1120,108,20,248.0,19.0,...,,,,,,,,,,
2,Foo Fighters,68,33,1287,50,153,1463,1902,56.0,1239.0,...,,,,,,,,,,
3,Porcupine Tree,39,1271,848,20,18,1287,1487,68.0,1239.0,...,,,,,,,,,,
4,No-Man,34,1069,1487,230,848,20,78,1287.0,26.0,...,,,,,,,,,,


### Filtering based on Artist kids listen to

In [5]:
data = data[data['Artist'].str.upper().isin(artists.categories)].reset_index(drop = True)

len(data)

176607

### Shaping Data to [Artist, Genre]

In [6]:
genres = pd.read_csv('genres_freebase.txt', names=['genre'])
genres.head(5)

Unnamed: 0,genre
0,Classic rock
1,Blues
2,Black metal
3,Country
4,Christian alternative rock


In [7]:
len(genres)

1998

In [8]:
genres['genre'].str.upper().nunique()

1925

**NOTE:** Duplicate genres in Freebase

In [9]:
data = data.set_index('Artist').stack().reset_index()
data = data.rename(columns = {0: 'Genre Id'})
data = data[data['Genre Id'] != ''][['Artist', 'Genre Id']].reset_index(drop = True)
ConvertStringColumnsToInt(data)
print(len(data))
data.head(5)

Genre Id: Converting to int
1175500


Unnamed: 0,Artist,Genre Id
0,Megadeth,12
1,Megadeth,1239
2,Megadeth,25
3,Megadeth,1479
4,Megadeth,50


In [10]:
data = data.drop_duplicates().copy()
len(data)

1098367

**NOTE:** There were duplicate rows in the LFM-1b_artist_genres_freebase.txt file

In [11]:
data = data.merge(genres, left_on = 'Genre Id', right_index = True).drop(columns = ['Genre Id'])
UpperCaseStringColumns(data)
len(data)

Artist: Upper Casing
genre: Upper Casing


1098367

### Saving for later use

In [12]:
for column in data.columns:
    data[column] = data[column].astype('category')
    
data = SaveData(data, 'LastFM1bKidArtistToFreebaseGenre')
data.head(5)

Unnamed: 0,Artist,Genre
0,MEGADETH,HEAVY METAL
1,DREAM THEATER,HEAVY METAL
2,AMON AMARTH,HEAVY METAL
3,ALTER BRIDGE,HEAVY METAL
4,LIMP BIZKIT,HEAVY METAL


## Building Artist to All Music Genre Map
### Loading Data

In [13]:
with open('LFM-1b_artist_genres_allmusic.txt', 'r', encoding='utf-8') as f:
    file_lines = f.read().splitlines()
    
data = pd.DataFrame([string.split('\t') for string in file_lines])

data = data.rename(columns = {0: 'Artist'})
for column in data.columns:
    if (column == 'Artist'):
        continue
        
    data = data.rename(columns = {column: 'Genre'})

print(len(data))
data.head(5)

585055


Unnamed: 0,Artist,Genre,Genre.1,Genre.2,Genre.3,Genre.4,Genre.5,Genre.6,Genre.7,Genre.8,Genre.9,Genre.10,Genre.11,Genre.12,Genre.13,Genre.14,Genre.15,Genre.16
0,Megadeth,19,3,,,,,,,,,,,,,,,
1,Lil Wayne,3,18,0.0,,,,,,,,,,,,,,
2,Foo Fighters,16,15,18.0,,,,,,,,,,,,,,
3,Porcupine Tree,3,16,,,,,,,,,,,,,,,
4,No-Man,2,3,16.0,18.0,12.0,,,,,,,,,,,,


### Filtering based on Artist kids listen to

In [14]:
data = data[data['Artist'].str.upper().isin(artists.categories)].reset_index(drop = True)

len(data)

176607

### Shaping Data to [Artist, Genre]


In [15]:
genres = pd.read_csv('genres_allmusic.txt', names=['genre'])
genres.head(5)

Unnamed: 0,genre
0,rnb
1,rap
2,electronic
3,rock
4,new age


In [16]:
data = data.set_index('Artist').stack().reset_index()
data = data.rename(columns = {0: 'Genre Id'})
data = data[data['Genre Id'] != ''][['Artist', 'Genre Id']].reset_index(drop = True)
ConvertStringColumnsToInt(data)
print(len(data))
data.head(5)

Genre Id: Converting to int
223162


Unnamed: 0,Artist,Genre Id
0,Megadeth,19
1,Megadeth,3
2,Lil Wayne,3
3,Lil Wayne,18
4,Lil Wayne,0


In [17]:
data = data.drop_duplicates().copy()
len(data)

219112

**NOTE:** There were duplicate rows in the LFM-1b_artist_genres_allmusic.txt file

In [18]:
data = data.merge(genres, left_on = 'Genre Id', right_index = True).drop(columns = ['Genre Id'])
UpperCaseStringColumns(data)
len(data)

Artist: Upper Casing
genre: Upper Casing


219112

### Saving for later use

In [19]:
for column in data.columns:
    data[column] = data[column].astype('category')

data = SaveData(data, 'LastFM1bKidArtistToAllMusicGenre')
data.head(5)

Unnamed: 0,Artist,Genre
0,MEGADETH,HEAVY METAL
1,DREAM THEATER,HEAVY METAL
2,AMON AMARTH,HEAVY METAL
3,ALTER BRIDGE,HEAVY METAL
4,LIMP BIZKIT,HEAVY METAL
