In [17]:
#Import Packages

import pandas as pd
import numpy as np
import seaborn as sns

import os

%matplotlib inline
pd.set_option('display.max_columns', 500)

# Users

In [2]:
#Import the full data
dfsmall = pd.read_csv("./lastfm-dataset-360k/usersha1-profile.tsv",sep='\t',names=['userId','gender','age','country','signupDate'])
# data = pd.read_csv("./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv",sep='\t',names=['userId','artistId','artist','plays'])

## Make sure data makes sense

In [3]:
# Gender
dfsmall.gender.describe()

count     326572
unique         2
top            m
freq      241642
Name: gender, dtype: object

In [4]:
# Age
dfsmall.age.describe()

count    284447.000000
mean         25.098046
std          21.665742
min       -1337.000000
25%          20.000000
50%          23.000000
75%          28.000000
max        1002.000000
Name: age, dtype: float64

In [5]:
dfsmall.loc[(dfsmall['age'] < 0) | (dfsmall['age'] > 122),['age']] = np.nan #assign NaN in place of unrealistic values

In [6]:
dfsmall.age.describe()

count    284388.000000
mean         25.365364
std           9.631621
min           1.000000
25%          20.000000
50%          23.000000
75%          28.000000
max         121.000000
Name: age, dtype: float64

In [7]:
#country
dfsmall.country.describe()

count            359347
unique              239
top       United States
freq              67044
Name: country, dtype: object

In [8]:
#signupdate
dfsmall.loc['signupDate'] = pd.to_datetime(dfsmall['signupDate'],infer_datetime_format=True)
dfsmall.signupDate.describe()

count           359347
unique            2298
top       Jan 24, 2008
freq               569
Name: signupDate, dtype: object

### Missing Values
I don't think we have enough information to find the missing values. So, we need to keep the dataframe as it is.

In [9]:
for column in list(dfsmall.columns):
    print ("{}% of the data from {} column is missing".format(round(dfsmall[column].isnull().sum() * 100 / len(dfsmall[column]),2), column))

0.0% of the data from userId column is missing
9.12% of the data from gender column is missing
20.86% of the data from age column is missing
0.0% of the data from country column is missing
0.0% of the data from signupDate column is missing


# Listening Data

In [10]:
#Import the full data
# dfsmall = pd.read_csv("./lastfm-dataset-360k/usersha1-profile.tsv",sep='\t',names=['userId','gender','age','country','signupDate'])
data = pd.read_csv("./lastfm-dataset-360k/usersha1-artmbid-artname-plays.tsv",sep='\t',names=['userId','artistId','artist','plays'])

In [11]:
data[['userId','artistId','artist']].describe()

Unnamed: 0,userId,artistId,artist
count,433920,428363,433920
unique,8875,46553,52939
top,03e5179fca5c4f8fe6cefd95822a24610c830fdd,a74b1b7f-71a5-4011-9441-d0b5e4122711,radiohead
freq,103,1904,1904


In [12]:
data.plays.describe()

count    433919.000000
mean        216.018722
std         602.478277
min           1.000000
25%          34.000000
50%          94.000000
75%         224.000000
max      135392.000000
Name: plays, dtype: float64

## Missing Values

In [13]:
for column in list(data.columns):
    print ("{}% of the data from {} column is missing".format(round(data[column].isnull().sum() * 100 / len(data[column]),2), column))

0.0% of the data from userId column is missing
1.28% of the data from artistId column is missing
0.0% of the data from artist column is missing
0.0% of the data from plays column is missing


In [14]:
# Let's drop the rows where artistId is missing
data = data.loc[data['artistId'].notnull()]

In [15]:
for column in list(data.columns):
    print ("{}% of the data from {} column is missing".format(round(data[column].isnull().sum() * 100 / len(data[column]),2), column))

0.0% of the data from userId column is missing
0.0% of the data from artistId column is missing
0.0% of the data from artist column is missing
0.0% of the data from plays column is missing


## Merge the datasets

In [22]:
# Merge the data
data = pd.merge(data, dfsmall, on='userId', how='left')
print (len(data))
data.head(2)

directory = 'lastfm-dataset-360K-cleaned'

if not os.path.exists(directory):
    os.makedirs(directory)

#Save Cleaned Data
data.to_csv('./lastfm-dataset-360k-cleaned/merged-data.csv')

630492


In [23]:
# Subset

dfsmall = data.iloc[np.random.choice(data.index,size=10000,replace=False)]
print (data.shape)
print (dfsmall.shape)

(630492, 23)
(10000, 23)


In [24]:
directory = 'lastfm-dataset-360k-small'

if not os.path.exists(directory):
    os.makedirs(directory)

dfsmall.to_csv('./lastfm-dataset-360k-small/merged-subset.csv')