In [13]:
#Import dependencies
import pandas as pd

In [14]:
#Importing data
spotify_data = pd.read_csv('https://spotifydataproject.s3.us-east-2.amazonaws.com/Dataset_1.csv')
spotify_data2 = pd.read_csv('https://spotifydataproject.s3.us-east-2.amazonaws.com/Dataset_2.csv')
spotify_data_df = pd.DataFrame(spotify_data)
spotify_data2_df = pd.DataFrame(spotify_data2)


spotify_data_df.head()

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,release_date
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814,2009-04-06
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816,2015-10-27
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368,2008-02-05
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227,1998
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39,2017-01-06


In [15]:
#Check dtypes
spotify_data_df.dtypes

genre                object
artist_name          object
track_name           object
track_id             object
popularity            int64
acousticness        float64
danceability        float64
duration_ms           int64
energy              float64
instrumentalness    float64
key                  object
liveness            float64
loudness            float64
mode                 object
speechiness         float64
tempo               float64
time_signature       object
valence             float64
release_date         object
dtype: object

In [16]:
#Converting to datetime
spotify_data_df['release_date'] = pd.to_datetime(spotify_data_df['release_date'], errors='coerce', format='%Y-%m-%d')
spotify_data_df.dtypes

genre                       object
artist_name                 object
track_name                  object
track_id                    object
popularity                   int64
acousticness               float64
danceability               float64
duration_ms                  int64
energy                     float64
instrumentalness           float64
key                         object
liveness                   float64
loudness                   float64
mode                        object
speechiness                float64
tempo                      float64
time_signature              object
valence                    float64
release_date        datetime64[ns]
dtype: object

In [17]:
#Creating year column
spotify_data_df['year'] = pd.DatetimeIndex(spotify_data_df['release_date']).year.fillna(0.0).astype(int)

In [18]:
#Filtering the dataset
filtered_data_df = spotify_data_df[spotify_data_df['year'] >= 2010]
filtered_data_df = filtered_data_df[filtered_data_df['genre'] != 'Movie']
filtered_data_df = filtered_data_df[filtered_data_df['genre'] != 'Classical']
filtered_data_df = filtered_data_df[filtered_data_df['genre'] != 'Opera']
filtered_data_df = filtered_data_df[filtered_data_df['genre'] != 'Anime']
len(filtered_data_df)

131805

In [19]:
#Creating binary popularity measure
bi_popularity = []

for row in filtered_data_df['popularity']:
    if row >= 60:
        bi_popularity.append(1)
    else:
        bi_popularity.append(0)

In [20]:
#Creating popularity dataframe
popularity_df = pd.DataFrame(list(zip(filtered_data_df['track_id'], bi_popularity)), columns=['track_id', 'bi_popularity'])
popularity_df.head()

Unnamed: 0,track_id,bi_popularity
0,6KFaHC9G178beAp7P0Vi5S,1
1,6muW8cSjJ3rusKJ0vH5olw,1
2,7yHqOZfsXYlicyoMt62yC6,1
3,4XzgjxGKqULifVf7mnDIQK,1
4,7KdRu0h7PQ0Ecfa37rUBzW,1


In [21]:
#Adding bi popularity
filtered_data_df['bi_popularity'] = bi_popularity
len(filtered_data_df)

131805

In [22]:
#Dropping duplicates
filtered_data_df = filtered_data_df.drop_duplicates(subset='track_id')
len(filtered_data_df)

94180

In [23]:
#Exporting data
filtered_data_df.to_csv('../../Data/filtered_data.csv', index=False)

In [24]:
filtered_data_df.dtypes

genre                       object
artist_name                 object
track_name                  object
track_id                    object
popularity                   int64
acousticness               float64
danceability               float64
duration_ms                  int64
energy                     float64
instrumentalness           float64
key                         object
liveness                   float64
loudness                   float64
mode                        object
speechiness                float64
tempo                      float64
time_signature              object
valence                    float64
release_date        datetime64[ns]
year                         int64
bi_popularity                int64
dtype: object

In [38]:
len(spotify_data2_df)

174389

In [39]:
#Filtering data 2
filtered_data2_df = spotify_data2_df[spotify_data_df['year'] >= 2010]
filtered_data2_df = spotify_data2_df.drop_duplicates()
filtered_data2_df = spotify_data2_df.drop_duplicates(subset = 'id')
filtered_data2_df = filtered_data2_df.drop(columns= ['artists', 'name'])
len(filtered_data2_df)

172230

In [35]:
#Creating binary popularity measure
bi_popularity2 = []

for row in filtered_data2_df['popularity']:
    if row >= 60:
        bi_popularity2.append(1)
    else:
        bi_popularity2.append(0)

In [36]:
#Add bi popularity
filtered_data2_df['bi_popularity'] = bi_popularity2

In [37]:
#Exporting data 2
filtered_data2_df.to_csv('../../Data/filtered_data2.csv', index=False)