In [2]:
import numpy as np
import pandas as pd

# Exploratory Data Analysis (EDA)

## Songs
- There are 15 songs with no title

In [3]:
# Read the pre-cleaned song table and print its column schema and null counts.
song_pickle_path = 'data/song_cleaned.pkl'
df_song = pd.read_pickle(song_pickle_path)
df_song.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999056 entries, 0 to 999055
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   song_num     999056 non-null  int32 
 1   title        999041 non-null  object
 2   release      999056 non-null  object
 3   artist_name  999056 non-null  object
 4   year         999056 non-null  int32 
dtypes: int32(2), object(3)
memory usage: 30.5+ MB


### Categorical columns

In [4]:
# Summarise only the object-typed columns: count, unique, most frequent value and its frequency.
df_song.select_dtypes(include='object').describe()

Unnamed: 0,title,release,artist_name
count,999041,999056,999056
unique,702834,149689,72656
top,Intro,Greatest Hits,Johnny Cash
freq,1510,1989,191


### Column: Year
- should be treated as categorical
- 0 is a missing value, about 48% of the data

In [5]:
# Summary statistics for the numeric columns.
df_song.describe(include=[np.number])

Unnamed: 0,song_num,year
count,999056.0,999056.0
mean,499528.5,1029.754741
std,288402.769605,998.760386
min,1.0,0.0
25%,249764.75,0.0
50%,499528.5,1969.0
75%,749292.25,2002.0
max,999056.0,2011.0


In [6]:
# Songs whose `year` is 0, shown as (row count, share of all songs).
year_zero = df_song.loc[df_song['year'].eq(0)]
n_year_zero, n_songs = len(year_zero), len(df_song)
(n_year_zero, n_year_zero / n_songs)

(484251, 0.48470856488525166)

In [7]:
# Keep only songs whose `year` is not 0.
# The save cell at the end of the notebook refers to `df_song_clean`, so the
# filtered frame is bound under that name; `df_clean` is kept as an alias so
# any code that already uses the old name continues to work.
df_song_clean = df_song[df_song['year'] != 0]
df_clean = df_song_clean

## Song Plays

In [10]:
# Read the pre-cleaned play-count table and print its column schema.
play_pickle_path = 'data/play_cleaned.pkl'
df_play = pd.read_pickle(play_pickle_path)
df_play.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 3 columns):
 #   Column      Dtype
---  ------      -----
 0   user_num    int32
 1   song_num    int32
 2   play_count  int64
dtypes: int32(2), int64(1)
memory usage: 30.5 MB


### Object columns
- Object columns need to be converted to ordinal (integer index); the play table loaded above has only integer columns

In [11]:
# Summary statistics for the play table, with the default quartiles spelled out.
df_play.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,user_num,song_num,play_count
count,2000000.0,2000000.0,2000000.0
mean,38409.45,499977.4,3.045485
std,22054.88,287595.4,6.57972
min,1.0,218.0,1.0
25%,19244.0,251146.0,1.0
50%,38596.0,504954.0,1.0
75%,57587.0,753796.0,3.0
max,76353.0,998962.0,2213.0


### Column: Play Count
- No zero values, but there are some outliers

In [12]:
# Extend the summary into the upper tail to see where large play counts begin.
upper_percentiles = [0.5, 0.75, 0.9, 0.95, 0.99]
df_play.play_count.describe(percentiles=upper_percentiles)

count    2.000000e+06
mean     3.045485e+00
std      6.579720e+00
min      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
90%      6.000000e+00
95%      1.000000e+01
99%      2.600000e+01
max      2.213000e+03
Name: play_count, dtype: float64

In [13]:
# Count outliers
# Two candidate cut-offs: mean + 3 standard deviations, and the 99th percentile.
play_counts = df_play['play_count']
zscore_threshold = play_counts.mean() + 3 * play_counts.std()
iq_threshold = play_counts.quantile(0.99)
print(zscore_threshold, iq_threshold)
# Rows above the larger cut-off; .count() reports the non-null count per column.
outlier_cutoff = max(zscore_threshold, iq_threshold)
df_play.loc[play_counts.gt(outlier_cutoff)].count()

22.78464405048612 26.0


user_num      18995
song_num      18995
play_count    18995
dtype: int64

## Impute outliers in play_count

In [20]:
# Cap play_count at the larger of the two outlier thresholds.
# Work on a copy so df_play itself is left unchanged.
play_count_cap = max(zscore_threshold, iq_threshold)
df_play_clean = df_play.copy()
df_play_clean.loc[:, 'play_count'] = df_play['play_count'].clip(upper=play_count_cap)

## Save data frames to pickle files

In [17]:
# Saving is disabled: both to_pickle calls below are commented out.
#df_song_clean.to_pickle('./data/songs.pkl')
#df_play_clean.to_pickle('./data/plays.pkl')