In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
%config InlineBackend.figure_format ='retina'

### Load and Read Data 

In [37]:
df = pd.read_csv('../data/df.csv')

In [38]:
df.head(1)

Unnamed: 0,position,track_name,artist,streams,date,region,spotify_id
0,1,Sunflower - Spider-Man: Into the Spider-Verse,Post Malone,1867789.0,2019-01-01,USA,3KkXRkHbMCARz0aVfEt68P


In [39]:
df.shape

(729757, 7)

In [40]:
df.dtypes

position        int64
track_name     object
artist         object
streams       float64
date           object
region         object
spotify_id     object
dtype: object

In [42]:
# Change date column to be datetime dtype
df.set_index(pd.to_datetime(df['date']), inplace = True)

In [43]:
df.drop(columns =['date'], inplace = True)

In [44]:
df.sort_index(inplace = True)

In [45]:
#rank_one =df[df['position'] ==1][['artist', 'track_name', 'streams', 'region', 'spotify_id']]
#rank_one = rank_one.groupby(['Track Name', 'region'])['Streams'].unique()


In [48]:
#rank_one_duration = rank_one.groupby(['artist', 'region']).count()
#rank_one_duration.mean()

# Pre-processing 

In [47]:
def earliest_streams_position(position):
    rank_one_df = df[df['position']==position]
    rank_one_df['artist_track_name'] = rank_one_df['artist'] + rank_one_df['track_name']
    earliest_rank = []
    for artist_track in rank_one_df['artist_track_name'].unique():
        earliest_rank.append(rank_one_df[rank_one_df['artist_track_name'] == artist_track].sort_values('date', ascending = True).head(1))
    earliest_rank_one = pd.concat(earliest_rank)
    earliest_rank_one = earliest_rank_one.drop(columns = 'position')
    earliest_rank_one.head()
    position_duration = rank_one_df.groupby('artist_track_name').sum()[['position']].reset_index()
    earliest_rank_one = pd.merge(earliest_rank_one, position_duration, on='artist_track_name')
    earliest_rank_one.rename(columns = {'position': f'duration_resist_rank_{position}'}, inplace = True)
    return earliest_rank_one

In [49]:
df_modeling1 = earliest_streams_position(1)
df_modeling3 = earliest_streams_position(3)
df_modeling5 = earliest_streams_position(5)
df_modeling10 = earliest_streams_position(10)
df_modeling20 = earliest_streams_position(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
X = df_modeling10[['streams']] # setting up X 
#X.hist(figsize= (8,5));

In [20]:
y = df_modeling10['duration_resist_rank_10'] # setting target 
#y.hist(figsize = (8,5));

In [50]:
# split data into train/test sets 
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   random_state = 42)

In [22]:
X_train.shape

(479, 1)

In [51]:
y_train.shape

(479,)

# Modeling 

In [52]:
# instantiate and fit the model
lm = RandomForestRegressor() 
lm.fit(X_train, y_train)
print(f'Training Score: {round(lm.score(X_train, y_train),4)}.')
print(f'Testing Score: {round(lm.score(X_test, y_test),4)}.')

Training Score: 0.6802.
Testing Score: -0.9231.




This indicates that choosing only one attribute (# of Streams ) is not suffcient to predict the duration of song remain on the current rank. 

### EDA - for initial insights on average duration / streams for songs with rank position 1, 3, 5, 10 , 20 

In [25]:

df1 = pd.DataFrame(df_modeling1[['duration_resist_rank_1', 'streams']].mean())
df1 = df1.T
df1.rename(columns={"streams": "streams_rank1"}, inplace = True)
df1.apply(np.int64)

Unnamed: 0,duration_resist_rank_1,streams_rank1
0,21,916819


In [26]:
df2 = pd.DataFrame(df_modeling3[['duration_resist_rank_3', 'streams']].mean())
df2 = df2.T
df2.rename(columns={"streams": "streams_rank3"}, inplace = True)
df2.apply(np.int64)

Unnamed: 0,duration_resist_rank_3,streams_rank3
0,31,555755


In [27]:
df5 = pd.DataFrame(df_modeling5[['duration_resist_rank_5', 'streams']].mean())
df5 = df5.T
df5.rename(columns={"streams": "streams_rank5"}, inplace = True)
df5.apply(np.int64)

Unnamed: 0,duration_resist_rank_5,streams_rank5
0,39,418941


In [28]:
df5 = pd.DataFrame(df_modeling10[['duration_resist_rank_10', 'streams']].mean())
df5 = df5.T
df5.rename(columns={"streams": "streams_rank5"}, inplace = True)
df5.apply(np.int64)

Unnamed: 0,duration_resist_rank_10,streams_rank5
0,57,316415


In [29]:
df5 = pd.DataFrame(df_modeling20[['duration_resist_rank_20', 'streams']].mean())
df5 = df5.T
df5.rename(columns={"streams": "streams_rank5"}, inplace = True)
df5.apply(np.int64)

Unnamed: 0,duration_resist_rank_20,streams_rank5
0,83,224583


In [30]:
#rank_one_duration = rank_one.groupby(['track_name', 'region']).count()
#rank_one_streams = rank_one.groupby('track_name').sum()
#rank_one_streams

In [31]:
#for example song of Adriane Grande 7 rings stayed on rank 1 for 167 days - 
#song dropped from top chart between 02/8 t0 02/14 but then againg became No1 hit starting on 2/15 
#one_song=rank_one[(rank_one['Track Name'] == '7 rings')]
#one_song['Streams'].plot(figsize =(12,5));
#print('Duration song resists on Rank 1:',len(one_song))