In [1]:
### Load the required packages in the required format
import pandas as pd
import os
import warnings

%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")

## Read the data

In [15]:
### Read the dataset from the given location and give its columns clean names
path = "/content/"
filename = "spotify_dataset.csv"
### Malformed rows are skipped silently via on_bad_lines='skip'. This replaces the
### error_bad_lines=False / warn_bad_lines=False pair, which was deprecated in
### pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
### As it is an experimental project on a large dataset, dropping 100-200 bad rows
### is not expected to impact any results.
### NOTE(review): escapechar='.' presumably makes the parser consume every '.' in a
### field as an escape character — confirm this is intended and not a workaround.
print ("Reading the data")
spotify_data = pd.read_csv(
    os.path.join(path, filename),
    escapechar='.',
    on_bad_lines='skip',
)
print ("Read Succesful with shape {}".format(spotify_data.shape))
### The column names in the file were not very clean, so assign them manually
spotify_data.columns = ['user_id','artistname','trackname','playlistname']

Reading the data
Read Succesful with shape (2203460, 4)


Stats about the data

In [23]:
# Preview the first rows to sanity-check the load and the manual column names.
spotify_data.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [16]:
# Print the index range, per-column dtypes and memory usage of the frame.
spotify_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2203460 entries, 0 to 2203459
Data columns (total 4 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   artistname    object
 2   trackname     object
 3   playlistname  object
dtypes: object(4)
memory usage: 67.2+ MB


In [37]:
# Summarise the object-dtype columns: non-null count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
spotify_data.describe(include ='object')

Unnamed: 0,user_id,artistname,trackname,playlistname
count,2203417,2203417,2203417,2203417
unique,2964,109241,644904,33433
top,99deafd9b792af8e6a535483088faef2,Daft Punk,Starred,Starred
freq,65909,6829,1696,241335


In [20]:
# Report how many rows were loaded (len of a DataFrame is its row count).
print("The number of rows in the datasets are as follows :", len(spotify_data))

The number of rows in the datasets are as follows : 2203460


In [21]:
# Show the column Index to confirm the manually assigned names are in place.
print (" The columns in the data are as follows :",spotify_data.columns)


 The columns in the data are as follows : Index(['user_id', 'artistname', 'trackname', 'playlistname'], dtype='object')


# Data checks to perform
* Check Missing values
* Check Duplicates
* Check data types
* Check the number of unique values of each column
* Check statistics of data set
* Check various categories present in the different categorical column

## Handle missing values

In [24]:
# Count missing entries per column (isna is the canonical name; isnull is its alias).
spotify_data.isna().sum()

user_id             0
artistname       5050
trackname          30
playlistname    23604
dtype: int64

Fill missing values with explicit placeholders: "Unknown" for artist names, "Not available" for track names, and "Unknown Playlist" for playlist names.

In [29]:
# Replace missing text fields with explicit per-column placeholders.
# A single frame-level fillna with a column -> value mapping is used instead of
# `spotify_data[col].fillna(..., inplace=True)`: that chained in-place form acts on
# a temporary Series and does not update the frame under pandas Copy-on-Write
# (FutureWarning in pandas 2.2, default behaviour from pandas 3.0).
# Rebinding the name also keeps this cell idempotent on re-run.
spotify_data = spotify_data.fillna(
    value={
        'artistname': "Unknown",
        'trackname': "Not available",
        'playlistname': "Unknown Playlist",
    }
)

In [30]:
# Re-count nulls per column after the fill; the header line and the counts are
# emitted by one print call, separated by a newline.
print(
    "Checking for missing values after handling:",
    spotify_data.isna().sum(),
    sep="\n",
)

Checking for missing values after handling:
user_id         0
artistname      0
trackname       0
playlistname    0
dtype: int64


## Checking duplicates

In [33]:
# Count rows that exactly repeat an earlier row across all columns
# (duplicated() flags every occurrence after the first).
spotify_data.duplicated().sum()

0

## Checking the number of unique values of each column

In [34]:
# Number of distinct values in each column (missing values are excluded by default).
spotify_data.nunique()

user_id           2964
artistname      109241
trackname       644904
playlistname     33433
dtype: int64

# Exploring Data

In [27]:
# Show the first rows again as a reference point before looking at each column.
spotify_data.head()

Unnamed: 0,user_id,artistname,trackname,playlistname
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [28]:
# Print the distinct values of every column. Each label is kept exactly as
# written (including its trailing padding) so the emitted text is unchanged;
# end=" " keeps the value array on the same line as its label.
category_labels = (
    ("Categories in 'user_id' variable:     ", 'user_id'),
    ("Categories in 'artistname' variable:  ", 'artistname'),
    ("Categories in 'trackname' variable:", 'trackname'),
    ("Categories in 'playlistname' variable:     ", 'playlistname'),
)
for label, column in category_labels:
    print(label, end=" ")
    print(spotify_data[column].unique())

Categories in 'user_id' variable:      ['9cc0cfd4d7d7885102480dd99e7a90d6' '07f0fc3be95dcd878966b1f9572ff670'
 '944c80d26922ae634d6ce445b1fdff7f' ... '4235c4f11386850b79e5c6bff732a494'
 '9d90242856d051f07229bd60b0b7976c' 'd0a7e55e5736f047b541dc7e40d9e529']
Categories in 'artistname' variable:   ['Elvis Costello' 'Elvis Costello & The Attractions' 'Tiffany Page' ...
 'Miriam Ramos' 'Baby Lores' 'Eme Alfonso']
Categories in 'trackname' variable: ['(The Angels Wanna Wear My) Red Shoes'
 "(What's So Funny 'Bout) Peace, Love And Understanding"
 '7 Years Too Late' ... 'Payasas guapas' 'Robbie Williams'
 'Todo lo contrario']
Categories in 'playlistname' variable:      ['HARD ROCK 2010' 'IOW 2012' nan ... 'Eme Alfonso ' 'Feria'
 'First Aid Kit']
