In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
from datetime import datetime

In [2]:
# Load the statipy results CSV into stat_df and print its column/dtype summary.
stat_df = pd.read_csv(filepath_or_buffer='data/statipy_results.csv')
stat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 7 columns):
Unnamed: 0     991 non-null int64
Track          991 non-null object
Explicit       991 non-null bool
Tpopularity    991 non-null int64
Artist         991 non-null object
Album          991 non-null object
Rdate          991 non-null object
dtypes: bool(1), int64(2), object(4)
memory usage: 47.5+ KB


###### i. drops unnamed axis

In [3]:
# Work on a new frame without the leftover 'Unnamed: 0' column; stat_df itself is not modified.
statidate_i = stat_df.drop(columns=['Unnamed: 0'])

###### ii. converts the Rdate column to datetime

In [4]:
# Parse the Rdate values into datetimes and write them back to the Rdate
# column. Only the column is replaced; the frame's index is not changed.
# freq='infer' asks pandas to infer a frequency for the temporary DatetimeIndex.
statidate_i['Rdate']= pd.DatetimeIndex(statidate_i['Rdate'], freq='infer')

###### iii. rdate index + artsongcnt + songfreq

In [5]:
# artsongcnt: for every row, the number of Track entries that share the row's Artist.
tracks_by_artist = statidate_i.groupby('Artist')['Track']
statidate_i['artsongcnt'] = tracks_by_artist.transform('count')

In [6]:
# songfreq: for every row, how many rows carry the same Track name.
rows_by_track = statidate_i.groupby('Track')['Track']
statidate_i['songfreq'] = rows_by_track.transform('count')

## I. statidate_i 
### rdate index + artsongcnt + songfreq

In [None]:
# Print the column/dtype summary of statidate_i (now including artsongcnt and songfreq).
statidate_i.info()

## II. stat_allfreq
### rdate index + artsongcnt + songfreq + datesongcnt

In [7]:
# Independent (deep) copy so the per-date column added next does not touch statidate_i.
stat_allfreq = statidate_i.copy(deep=True)

In [9]:
# datesongcnt: for every row, the number of tracks sharing the same full release date.
tracks_by_date = stat_allfreq.groupby('Rdate')['Track']
stat_allfreq['datesongcnt'] = tracks_by_date.transform('count')

In [10]:
# Preview the first five rows of stat_allfreq.
stat_allfreq.head(n=5)

Unnamed: 0,Track,Explicit,Tpopularity,Artist,Album,Rdate,artsongcnt,songfreq,datesongcnt
0,Paradise,False,49,Voyage,Paradise,2018-02-14,1,1,1
1,Tech Noir,False,56,Gunship,GUNSHIP,2015-07-24,1,1,1
2,Yes (Symmetry Remix),False,38,Chromatics,Yes (Love Theme From Lost River),2015-02-03,1,1,2
3,Night,False,39,John Carpenter,Lost Themes,2015-02-03,1,1,2
4,Depth Charge,False,5,Flume,Skin Companion EP II,2017-02-17,1,1,1


## III. statipy_yr
### rdate YEAR + songfreq + artsongcnt + yrsongcnt

In [11]:
# Independent (deep) copy that will hold the year-level view of the data.
statipy_yr = statidate_i.copy(deep=True)

In [12]:
# Replace the full release date with its year.
# The years are derived from statidate_i (statipy_yr is a row-for-row copy of
# it) rather than from statipy_yr itself, so re-running this cell is safe.
# Parsing the already-converted integer years again would interpret them as
# nanoseconds since 1970 and silently turn every year into 1970.
statipy_yr['Rdate'] = pd.DatetimeIndex(statidate_i['Rdate']).year

In [14]:
# yrsongcnt: for every row, the number of tracks released in the same year.
tracks_by_year = statipy_yr.groupby('Rdate')['Track']
statipy_yr['yrsongcnt'] = tracks_by_year.transform('count')

In [15]:
# Preview the first five rows of statipy_yr.
statipy_yr.head(n=5)

Unnamed: 0,Track,Explicit,Tpopularity,Artist,Album,Rdate,artsongcnt,songfreq,yrsongcnt
0,Paradise,False,49,Voyage,Paradise,2018,1,1,118
1,Tech Noir,False,56,Gunship,GUNSHIP,2015,1,1,61
2,Yes (Symmetry Remix),False,38,Chromatics,Yes (Love Theme From Lost River),2015,1,1,61
3,Night,False,39,John Carpenter,Lost Themes,2015,1,1,61
4,Depth Charge,False,5,Flume,Skin Companion EP II,2017,1,1,142


******************************************
******************************************

# START:
## Clean data
- `stat_allfreq` = rdate index + artsongcnt + songfreq + datesongcnt
- `statipy_yr` = rdate year + artsongcnt + songfreq + yrsongcnt

# 1.  DATE
**********************************

## GROUP A - yearly
#### Rdate Year full statistics

In [None]:
# Per-year mean of every numeric/boolean column.
# numeric_only=True is required because statipy_yr also holds text columns
# (Track, Artist, Album); recent pandas versions raise a TypeError instead of
# silently dropping them when asked for their mean.
yearly = statipy_yr.groupby(['Rdate']).mean(numeric_only=True)

## GROUP B - tr3y, tr2y tr1y
#### Rdate Year most repeat tracks (song freq)

In [None]:
# tr3y = statipy_yr[statipy_yr['songfreq']>=3]
# tr2y = statipy_yr[statipy_yr['songfreq']==2]
# tr1y = statipy_yr[statipy_yr['songfreq']<=1]

In [None]:
# tr3y.groupby(['Rdate'])['Tpopularity'].mean().plot.line(stacked=True, alpha=1)
# tr2y.groupby(['Rdate'])['Tpopularity'].mean().plot.line(stacked=True, alpha=0.50)
# tr1y.groupby(['Rdate'])['Tpopularity'].mean().plot.line(stacked=True, alpha=0.25)

## GROUP C
#### Rdate Year most tracks per year
#### date_i = year-date (most freq - yrsongcnt) / date_max = year-date (top >=15 only - yrsongcnt)

In [None]:
# Three row subsets of statipy_yr, selected by how many tracks the row's release year has.
tracks_in_year = statipy_yr['yrsongcnt']
freqdates = statipy_yr.loc[tracks_in_year >= 8]      # years with at least 8 tracks
maxfreqdates = statipy_yr.loc[tracks_in_year >= 15]  # years with at least 15 tracks
minfreqdates = statipy_yr.loc[tracks_in_year == 1]   # years with exactly one track

In [None]:
# Yearly means of explicit share, popularity and track count for the freqdates subset.
freq_stat_cols = ['Explicit', 'Tpopularity', 'yrsongcnt']
date_i = freqdates.groupby('Rdate')[freq_stat_cols].mean()

In [None]:
# Yearly means of explicit share, popularity and track count for the maxfreqdates subset.
max_stat_cols = ['Explicit', 'Tpopularity', 'yrsongcnt']
date_max = maxfreqdates.groupby('Rdate')[max_stat_cols].mean()

In [None]:
# Line chart of the date_i yearly means (one line per column).
date_i.plot.line()

In [None]:
# Line chart of the date_max yearly means (one line per column).
date_max.plot.line()

## GROUP D
#### Rdate Year stats:: explicit avg x pop avg x yrsongcnt per YR

In [None]:
# Yearly means of track count, explicit share and popularity, drawn as three
# successive line plots from one shared grouping.
per_year = statipy_yr.groupby('Rdate')
per_year['yrsongcnt'].mean().plot.line(stacked=True, grid=True)
per_year['Explicit'].mean().plot.line(stacked=True, grid=True)
per_year['Tpopularity'].mean().plot.line(stacked=True, grid=True)

#####  explicit avg only / year

In [None]:
# Share of explicit tracks per release year (mean of the boolean Explicit flag).
explicit_share_by_year = statipy_yr.groupby('Rdate')['Explicit'].mean()
explicit_share_by_year.plot.line(stacked=True, grid=True)

## GROUP E
#### Rdate Year X yrsongcount 1 (mindate)

In [None]:
# Rows whose release year contains exactly one track.
is_single_track_year = statipy_yr['yrsongcnt'] == 1
mindate = statipy_yr.loc[is_single_track_year]
mindate.head()

#### Rdate Year X yrsongcount 1 (mindate) / Tpopularity
### high pop | lowpop || tpdm> | lpdm<

In [None]:
# Split the single-track-year rows (mindate) at a popularity of 43.
# The low group uses a strict "<" so the two groups are disjoint; with ">="
# and "<=" a track at exactly 43 would be counted in both.
POP_SPLIT = 43
toppop_datemin = mindate[mindate['Tpopularity'] >= POP_SPLIT]
lowpop_datemin = mindate[mindate['Tpopularity'] < POP_SPLIT]
# Short aliases used by the plotting cells below.
tpdm = toppop_datemin
lpdm = lowpop_datemin

In [None]:
# scatter plot: avg popularity / explicit track content x song repeats

In [None]:
# highpop: one marker per release year for the high-popularity group (tpdm).
tpdm_by_year = tpdm.groupby('Rdate')
# Explicit is a boolean flag, so sum() gives the number of explicit tracks;
# count() would only return the number of rows regardless of the flag.
x = tpdm_by_year['Explicit'].sum()
y = tpdm_by_year['Tpopularity'].mean()
# Marker size: number of rows (tracks) in the year.
s = tpdm_by_year['songfreq'].count()

fig, ax = plt.subplots(figsize=(24, 12))
ax.scatter(x, y, s*100, alpha=0.60);
# The data is grouped by Rdate (release year), so the axes are labelled per year.
ax.set_xlabel('Explicit Tracks per Release Year')
ax.set_ylabel('Avg Popularity per Release Year')

plt.show()

In [None]:
# lowpop: one marker per release year for the low-popularity group (lpdm).
lpdm_by_year = lpdm.groupby('Rdate')
# Explicit is a boolean flag, so sum() gives the number of explicit tracks;
# count() would only return the number of rows regardless of the flag.
x = lpdm_by_year['Explicit'].sum()
y = lpdm_by_year['Tpopularity'].mean()
# Marker size: number of rows (tracks) in the year.
s = lpdm_by_year['songfreq'].count()

fig, ax = plt.subplots(figsize=(24, 12))
ax.scatter(x, y, s*100, alpha=0.60);
# The data is grouped by Rdate (release year), so the axes are labelled per year.
ax.set_xlabel('Explicit Tracks per Release Year')
ax.set_ylabel('Avg Popularity per Release Year')

plt.show()

#### Rdate Year X yrsongcount 1 (mindate) / Tpopularity XX songfreq
#### tpdm> | lpdm< ||  songfreq (repeated track frequency) per year

In [None]:
# Rows per release year for the high- (tpdm) and low-popularity (lpdm) groups,
# drawn as two line plots; the high-popularity line is faded.
tpdm_rows_per_year = tpdm.groupby('Rdate')['songfreq'].count()
lpdm_rows_per_year = lpdm.groupby('Rdate')['songfreq'].count()
tpdm_rows_per_year.plot.line(stacked=True, alpha=.25)
lpdm_rows_per_year.plot.line(stacked=True)

#### Rdate Yr plots albums by date by track popularity over explicit content.

In [None]:
# Overall mean track popularity.
# NOTE(review): the original read from `statipy_df`, which no cell in this
# notebook defines (NameError on a fresh kernel). statidate_i holds the same
# per-track rows and is used instead; confirm this is the intended frame.
statidate_i['Tpopularity'].mean()

In [None]:
# Tracks at or above the 43.71 popularity cut-off.
# NOTE(review): the original filtered `statipy_df`, which no cell in this
# notebook defines; statidate_i has every column used here. Confirm the frame.
popular_tracks = statidate_i[statidate_i['Tpopularity']>=43.71]

album_groups = popular_tracks.groupby(['Album'])
x = album_groups['Tpopularity'].mean()
# sum() counts the explicit tracks; count() would just count rows per album.
y = album_groups['Explicit'].sum()
s = album_groups['Track'].count()

fig, ax = plt.subplots(figsize=(16, 16))
# Explicit-track count on the x axis, mean popularity on the y axis,
# marker size proportional to the album's track count.
ax.scatter(y, x, s*100, alpha=0.10);
ax.set_ylabel('Avg Album Popularity')
ax.set_xlabel('Explicit Tracks per Album')

plt.show()

#### Rdate by track popularity over explicit content. with number of tracks to size

In [None]:
# One marker per release date.
# NOTE(review): the original used `statipy_df`, which no cell in this notebook
# defines; statidate_i has every column used here. Confirm the frame.
date_groups = statidate_i.groupby(['Rdate'])
x = date_groups['Tpopularity'].mean()
# sum() counts the explicit tracks; count() would just count rows per date.
y = date_groups['Explicit'].sum()
# Marker sizes must have one value per marker. The original passed the
# per-row artsongcnt column, whose length differs from the per-date x/y.
sizes = date_groups['artsongcnt'].mean() * 5

fig, ax = plt.subplots(figsize=(20, 16))
ax.scatter(x=x, y=y, s=sizes);
# The grouping is by release date, not album.
ax.set_xlabel('Avg Track Popularity per Release Date')
ax.set_ylabel('Explicit Tracks per Release Date')

plt.show()

# 2. ARTISTS
************************************

## GROUP A - artstatsy
#### artists full statistics per clean data 
top - Tpopularity, Explicit, Songfreq

In [None]:
#art_statsy = statipy_yr.groupby(['Artist', 'Rdate']).mean()

# 3. SONGFREQ (#of times song is featured in playlist)
*****************************

## GROUP A
### Divide by song frequency
#### Song Freq = NO_SOLO + SOLO_DF

In [None]:
# Tracks that appear at least twice (songfreq >= 2).
# Restored from commented-out code because a later cell reads `no_solo`.
# NOTE(review): the commented code filtered `statipycounts_df`, which is not
# defined in this notebook; statipy_yr (year-level Rdate) is assumed here.
no_solo = statipy_yr[statipy_yr['songfreq']>=2]
no_solo.info()

In [None]:
# Tracks that appear only once (songfreq <= 1).
# Restored from commented-out code because later cells read `solo_df`.
# NOTE(review): the commented code filtered `statipycounts_df`, which is not
# defined in this notebook; statipy_yr (year-level Rdate) is assumed here.
solo_df = statipy_yr[statipy_yr['songfreq']<=1]
solo_df.info()

## GROUP B
### song frequency X artist tracks (artsongcnt)
#### artist with most non repeat tracks || artist_mostsolo

In [None]:
# Rows of solo_df whose artist has at least five tracks (artsongcnt >= 5).
has_five_plus_tracks = solo_df['artsongcnt'] >= 5
artist_mostsolo = solo_df.loc[has_five_plus_tracks]
artist_mostsolo

###### plot : artist_mostsolo / artsongcnt XX popularity / dates (year)

In [None]:
# Mean artist track count and mean popularity per Rdate, drawn one after the other.
mostsolo_by_date = artist_mostsolo.groupby(['Rdate'])
mostsolo_by_date['artsongcnt'].mean().plot()
mostsolo_by_date['Tpopularity'].mean().plot()

## GROUP Bb
### artists most solo / explicit tracks
#### pg13artists_ms || artsongcnt_ms

In [None]:
# Split artist_mostsolo by the Explicit flag.
# The original assigned the explicit rows (Explicit == 1.0) to the "pg13"
# name and the clean rows to the "x" name. Elsewhere in this notebook "pg"
# means Explicit == False, so the two filters are swapped to match the names.
pg13artists_ms = artist_mostsolo[artist_mostsolo['Explicit']==0.0]  # non-explicit tracks
xartists_ms = artist_mostsolo[artist_mostsolo['Explicit']==1.0]     # explicit tracks

####### plot :: Tpopularity XX artsongcnt XX Explicit ?? (level of 0 - 2)

In [None]:
# Line plots of popularity, explicit flag and artist track count for each subset.
ms_plot_cols = ['Tpopularity', 'Explicit', 'artsongcnt']
pg13artists_ms[ms_plot_cols].plot.line()
xartists_ms[ms_plot_cols].plot.line()

## GROUP C
### SOLO_DF >43 POP only  |  popsolo

In [None]:
# Per release date: artists' track count and average track popularity.
# The statements were indented at top level, which is an IndentationError.
# NOTE(review): the section heading says ">43 POP only" but the original
# filter was "<= 43.17", which keeps the unpopular tracks; the comparison is
# flipped here to match the heading. The 43.17 literal is kept as written.
popart_ms = artist_mostsolo[artist_mostsolo['Tpopularity']>=43.17]
# plot
popart_ms.groupby('Rdate')['artsongcnt'].mean().plot(stacked=True)
popart_ms.groupby('Rdate')['Tpopularity'].mean().plot(grid=True)

In [None]:
### per release date - artists track count // avg track popularity

In [None]:
# plot (unfilled step histograms of artist track count and track popularity)
ibins = 30
x = popart_ms.Tpopularity
x1 = popart_ms.artsongcnt
colors = ['blue', 'orange', 'green']
# The original passed the colour list as `label`, so both histograms appeared
# in the legend as "blue". Each histogram now gets its own label and one of
# the listed colours.
plt.hist(x1, ibins, histtype='step', stacked=True, fill=False,
         color=colors[0], label='artsongcnt')
plt.hist(x, ibins, histtype='step', stacked=True, fill=False,
         color=colors[1], label='Tpopularity')
plt.legend(loc="upper right")
plt.title('Stack Step (unfilled)')
plt.show()

## GROUP D
##### ALL TRACKS POP x EXPLICIT / dates
##### RECALL STATIPY_DF ( + track columns)
scatter - popularity / explicit count *** s=statipy_df['artsongcnt']*5*

plot

In [None]:
# One marker per release date.
# NOTE(review): the original used `statipy_df`, which no cell in this notebook
# defines; statidate_i has every column used here. Confirm the frame.
date_groups = statidate_i.groupby(['Rdate'])
x = date_groups['Tpopularity'].mean()
# sum() counts the explicit tracks; count() would just count rows per date.
y = date_groups['Explicit'].sum()
# Marker sizes must have one value per marker. The original passed the
# per-row artsongcnt column, whose length differs from the per-date x/y.
sizes = date_groups['artsongcnt'].mean() * 5

fig, ax = plt.subplots(figsize=(26, 12))
ax.scatter(x=x, y=y, s=sizes);

# The grouping is by release date, not album.
ax.set_xlabel('Avg Track Popularity per Release Date')
ax.set_ylabel('Explicit Tracks per Release Date')

plt.show()

# 4. POPULARITY
*****************************

In [None]:
## GROUP A 
#### statipy_yr pop >43 | popular_tracks
# scatter plot: marker size s = track count

In [None]:
# Tracks at or above the 43.71 popularity cut-off.
# NOTE(review): the original filtered `statipy_df`, which no cell in this
# notebook defines; statidate_i has every column used here. Confirm the frame.
popular_tracks = statidate_i[statidate_i['Tpopularity']>=43.71]

#plot
#### scatterplot
album_groups = popular_tracks.groupby(['Album'])
x = album_groups['Tpopularity'].mean()
# sum() counts the explicit tracks; count() would just count rows per album.
y = album_groups['Explicit'].sum()
s = album_groups['Track'].count()

fig, ax = plt.subplots(figsize=(26, 12))
# Marker size is proportional to the album's track count.
ax.scatter(x, y, s*100, alpha=0.10);
ax.set_xlabel('Avg Album Popularity')
ax.set_ylabel('Explicit Tracks per Album')

plt.show()

In [None]:
## GROUP B
# Most popular Artist w/ most solos
# highrank_artistsolo ---- high pop && freq
# (The prose lines are now comments and the code is de-indented; the original
# cell could not be parsed.)
popular_solo = solo_df[solo_df['Tpopularity']>=43]
freq_popularsolo = popular_solo[popular_solo['artsongcnt']>=8]
# 'Artist' is the group key, so it is not selected again: taking the mean of
# a text column fails on recent pandas.
highrank_artistsolo = freq_popularsolo.groupby(['Artist'])[['Tpopularity',
 'artsongcnt']].mean()

# plot
highrank_artistsolo.plot.bar()

In [None]:
## GROUP C
# ARTIST: HIGH POP, HIGH SONG REPEAT
# GROUP VII
# mostfreq_pop ---- high pop && freq
# (The prose lines are now comments and the code is de-indented; the original
# cell could not be parsed.)
mostfreq_pop = no_solo[no_solo['songfreq']>3]
# numeric_only=True skips the text columns, whose mean is undefined.
mostfreq_pop.groupby('Artist').mean(numeric_only=True).plot.bar()


In [None]:
## GROUP D
# MOST POP ARTIST BY YEAR GROUP
# Narrowing down
# (The prose lines are now comments and the code is de-indented; the original
# cell could not be parsed.)
# NOTE(review): the original filtered `statipy_df`, which no cell in this
# notebook defines; statidate_i has every column used here. Confirm the frame.
most_popular = statidate_i[statidate_i['Tpopularity']>=43]
# .copy() makes most_popular_y an independent frame, so the Rdate assignment
# below does not write into a slice of most_popular (SettingWithCopyWarning).
most_popular_y = most_popular[['Track', 'Explicit', 'Tpopularity', 'Artist', 'Rdate', 'artsongcnt', 'songfreq']].copy()

# Reduce the release date to its year.
most_popular_y['Rdate'] = pd.DatetimeIndex(most_popular_y['Rdate']).year

# Value pasted into the original cell after most_popular_y['artsongcnt'].mean():
#   10.678638941398866

# plot
popart_mostfreq = most_popular_y[most_popular_y['artsongcnt']>=10]
popart_mostfreq.groupby('Artist')[['Tpopularity', 'artsongcnt', 'Explicit']].mean().plot.bar()


# 5. EXPLICIT CONTENT
*****************************

In [None]:
## GROUP A
#### EXPLICIT x SONG REPEATS

# `statipy_year` is not defined anywhere in this notebook; the year-level
# frame built above is named statipy_yr.
explicit_tracks = statipy_yr[statipy_yr['Explicit']==True]
freqex_tracks = explicit_tracks[explicit_tracks['songfreq']>=5]
# plot: mean popularity and artist track count per artist
top_freqartex = freqex_tracks.groupby('Artist')[['Tpopularity', 'artsongcnt']].mean()
top_freqartex.plot.barh(grid=True)

In [None]:
## GROUP B
#### PG13 x SONG REPEATS
# `statipy_year` is not defined anywhere in this notebook; the year-level
# frame built above is named statipy_yr.
pg_tracks = statipy_yr[statipy_yr['Explicit']==False]
freqpg_tracks = pg_tracks[pg_tracks['songfreq']>=3]

# artist pg13 mean (most repeats)
top_freqartpg = freqpg_tracks.groupby('Artist')[['Tpopularity', 'artsongcnt', 'Explicit']].mean()

# plot
top_freqartpg.plot.barh(grid=True)


In [None]:
## GROUP C
#### artist pg13 mean (most pop)
# (The bare "plot" line is now a comment and the code is de-indented; the
# original cell could not be parsed.)
pgpop_tracks = pg_tracks[pg_tracks['Tpopularity']>=43]
# Grouped selection only; the mean is taken when plotting.
pg_art_info = pgpop_tracks.groupby(['Artist', 'songfreq'])[['Tpopularity']]

# plot: the 15 (Artist, songfreq) groups with the highest songfreq
pg_art_info.mean().sort_values('songfreq').tail(15).plot.bar()


In [None]:
## GROUP D
# NOTE(review): `topartcnt_singles` is not assigned by any live cell in this
# notebook, so it must be defined before this cell can run.
# The original bound the "pg13" name to Explicit == 1.0 and the "ex" name to
# Explicit <= 0.0; elsewhere in this notebook "pg" means non-explicit, so the
# two filters are swapped to match the names.
pg13_artists = topartcnt_singles[topartcnt_singles['Explicit']<=0.0]
# - plots popularity artsongcnt over date (explicit should flat line)
pg13_artists.plot.line()

ex_artists = topartcnt_singles[topartcnt_singles['Explicit']==1.0]
# - plots explicit artists stats per album date
ex_artists.plot.line()


# 6. DATE II / YEAR
#### YEAR x old / new

*****************************

In [None]:
## GROUP A
#### (oldest dates)

In [None]:
# First 15 (year, artist) groups in sorted order, i.e. the earliest release years.
# `statipy_year` is not defined in this notebook (the frame is statipy_yr), and
# numeric_only=True skips the text columns, whose mean is undefined.
old_tracks = statipy_yr.groupby(['Rdate', 'Artist']).mean(numeric_only=True).head(15)
# plot
old_tracks.plot(kind='barh',stacked=True)

In [None]:
## GROUP B
#### (newest dates)

In [None]:
# (newest dates)
# Last 15 (year, artist) groups in sorted order, i.e. the latest release years.
# `statipy_year` is not defined in this notebook (the frame is statipy_yr), and
# numeric_only=True skips the text columns, whose mean is undefined.
newest_tracks = statipy_yr.groupby(['Rdate', 'Artist']).mean(numeric_only=True).tail(15)

# plot
newest_tracks.plot(kind='barh',stacked=True)

# 7. ARTIST II
*****************************

In [None]:
#artistdf1 = statipy1_df.groupby(['Artist', 'Rdate'])[['Tpopularity', 'artsongcnt',
#'songfreq', 'Explicit']].mean()
#artistdf1
#topartcnt_singles = artistdf1[artistdf1['artsongcnt']>=5]
#topartcnt_single

In [None]:
#- plots artists with more than 5 tracks album stats over dates
#topartcnt_singles.plot.line()

#- plots frequent songs with no repeats popularity over dates
#popsongcount_single.groupby(['Rdate'])['Tpopularity'].mean().plot()

#- plots frequent songs with no repeats artists track count over dates
#popsongcount_single.groupby(['Rdate'])['artsongcnt'].mean().plot()



# 8.
*****************************