# Import Packages

In [1]:
import pandas as pd
import numpy as np

# Import Dataset

In [19]:
Hot100 = pd.read_csv('Hot100.csv')

Hot100.head(5)

Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks
0,2020-02-01,The Box,Roddy Ricch,1,1,7
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2,2
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3,1
3,2020-02-01,Circles,Post Malone,4,1,21
4,2020-02-01,Memories,Maroon 5,5,2,18


# Data Exploration

In [20]:
Hot100.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105000 entries, 0 to 104999
Data columns (total 6 columns):
Date      105000 non-null object
Title     105000 non-null object
Artist    105000 non-null object
Rank      105000 non-null int64
Peak      105000 non-null int64
Weeks     105000 non-null int64
dtypes: int64(3), object(3)
memory usage: 4.8+ MB


In [21]:
# Unique Artists
UA = len(Hot100.Artist.value_counts())

# Unique Songs
US = len(Hot100.groupby(['Title', 'Artist']))

print('Number of unique artists on the charts:', UA, '\nNumber of unique songs on the charts:', US)

Number of unique artists on the charts: 3696 
Number of unique songs on the charts: 8034


In [22]:
from datetime import datetime

Change the Date column from object type to datetype.

In [23]:
# Fix this
Hot100['Date'] = pd.to_datetime(Hot100['Date'])
Hot100['Date'] = [date.date() for date in Hot100['Date']]

# Feature Engineering

## Create colums that can give more insight about the data:  
* **PeakPosition** - Highest position the song ranked during its run on the Hot 100 chart  
* **MeanRank** - Average position of the song during its run on the Hot 100 chart  
* **WeeksOnChart** - Total time the song was on the chart regardless of position (in weeks)

In [24]:
# PeakPosition
Hot100['PeakPosition'] = Hot100.groupby(['Title', 'Artist'])['Peak'].transform('min')

# WeeksAtPeak
Hot100['WeeksAtPeak'] = np.where((Hot100['Rank'] == Hot100['PeakPosition']), 1, 0)
Hot100['WeeksAtPeak'] = Hot100.groupby(['Title', 'Artist'])['WeeksAtPeak'].transform(sum)

# MeanRank
Hot100['MeanRank'] = Hot100.groupby(['Title', 'Artist'])['Rank'].transform('mean')

# WeeksOnChart
Hot100['WeeksOnChart'] = Hot100.groupby(['Title', 'Artist'])['Weeks'].transform('max')

Hot100.head(20)

Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks,PeakPosition,WeeksAtPeak,MeanRank,WeeksOnChart
0,2020-02-01,The Box,Roddy Ricch,1,1,7,1,3,12.0,7
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2,2,2,2,2.0,2
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3,1,3,1,3.0,1
3,2020-02-01,Circles,Post Malone,4,1,21,1,3,3.857143,21
4,2020-02-01,Memories,Maroon 5,5,2,18,2,1,9.444444,18
5,2020-02-01,"10,000 Hours",Dan + Shay & Justin Bieber,6,4,16,4,1,9.6875,16
6,2020-02-01,Dance Monkey,Tones And I,7,7,16,7,3,25.9375,16
7,2020-02-01,Someone You Loved,Lewis Capaldi,8,1,37,1,3,20.297297,37
8,2020-02-01,Roxanne,Arizona Zervas,9,4,12,4,1,11.083333,12
9,2020-02-01,Lose You To Love Me,Selena Gomez,10,1,14,1,1,9.071429,14


In [29]:
Hot100['WeeksAtPeak'].describe()

count    105000.000000
mean          1.714533
std           1.623189
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          19.000000
Name: WeeksAtPeak, dtype: float64

In [34]:
Hot100[Hot100['Artist'] == 0]

Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks,PeakPosition,WeeksAtPeak,MeanRank,WeeksOnChart
441,2020-01-04,White Christmas,Bing Crosby,42,12,16,12,0,41.333333,16
1243,2019-11-09,Thriller,Michael Jackson,44,4,19,4,0,39.400000,19
5633,2019-01-05,White Christmas,Bing Crosby,34,12,15,12,0,41.333333,16
5747,2018-12-29,White Christmas,Bing Crosby,48,12,14,12,0,41.333333,16
6149,2018-12-01,Bohemian Rhapsody,Queen,50,2,44,2,0,41.000000,44
...,...,...,...,...,...,...,...,...,...,...
104990,2000-01-01,Give You What You Want (Fa Sure),Chico DeBarge,91,71,11,71,0,90.666667,13
104991,2000-01-01,Music Of My Heart,'N Sync & Gloria Estefan,92,2,18,2,0,94.333333,20
104996,2000-01-01,Gotta Man,Eve,97,26,16,26,0,97.000000,16
104998,2000-01-01,Steam,Ty Herndon,99,83,12,83,0,99.000000,12


Create a new column **Hot100['DateAtPeak']** that will input the most recent date that the song was at its peak.
Some songs move up and down the charts so we want to observe the most recent date that it was at its peak position (e.g. Mariah Carey - All I Want For Christmas frequently climbs the chart during the Christmas season).

groupby(['Title', 'Artist'] since some duplicate titles in 2 different songs
Will eventually get rid of non-unique records (each song will appear once in the dataframe after all feature engineering has happened)


In [28]:
indices = Hot100.groupby(['Title', 'Artist'])['Peak'].idxmin
Hot100 = Hot100.loc[indices].sort_index()
Hot100.reset_index(drop = True, inplace = True)


Hot100.info()
Hot100.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8034 entries, 0 to 8033
Data columns (total 9 columns):
Date            8034 non-null object
Title           8034 non-null object
Artist          8034 non-null object
Rank            8034 non-null int64
Peak            8034 non-null int64
Weeks           8034 non-null int64
PeakPosition    8034 non-null int64
MeanRank        8034 non-null float64
WeeksOnChart    8034 non-null int64
dtypes: float64(1), int64(5), object(3)
memory usage: 565.0+ KB


Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks,PeakPosition,MeanRank,WeeksOnChart
0,2020-02-01,The Box,Roddy Ricch,1,1,7,1,12.0,7
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2,2,2,2.0,2
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3,1,3,3.0,1
3,2020-02-01,Circles,Post Malone,4,1,21,1,3.857143,21
4,2020-02-01,Memories,Maroon 5,5,2,18,2,9.444444,18
5,2020-02-01,"10,000 Hours",Dan + Shay & Justin Bieber,6,4,16,4,9.6875,16
6,2020-02-01,Dance Monkey,Tones And I,7,7,16,7,25.9375,16
7,2020-02-01,Someone You Loved,Lewis Capaldi,8,1,37,1,20.297297,37
8,2020-02-01,Roxanne,Arizona Zervas,9,4,12,4,11.083333,12
9,2020-02-01,Lose You To Love Me,Selena Gomez,10,1,14,1,9.071429,14


To tidy things up a little bit, I can remove colums that I no longer need in the analysis. The following columns will be dropped:  
   * Rank  
   * Peak
   * Weeks

In [29]:
Hot100.drop(['Rank', 'Peak', 'Weeks'], axis = 1, inplace = True)

Hot100.head(5)

Unnamed: 0,Date,Title,Artist,PeakPosition,MeanRank,WeeksOnChart
0,2020-02-01,The Box,Roddy Ricch,1,12.0,7
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2.0,2
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3.0,1
3,2020-02-01,Circles,Post Malone,1,3.857143,21
4,2020-02-01,Memories,Maroon 5,2,9.444444,18


In [30]:
Hot100.head(200)

Unnamed: 0,Date,Title,Artist,PeakPosition,MeanRank,WeeksOnChart
0,2020-02-01,The Box,Roddy Ricch,1,12.000000,7
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2.000000,2
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3.000000,1
3,2020-02-01,Circles,Post Malone,1,3.857143,21
4,2020-02-01,Memories,Maroon 5,2,9.444444,18
...,...,...,...,...,...,...
195,2019-12-14,Who Needs Love,Trippie Redd,58,70.000000,2
196,2019-12-14,Follow God,Kanye West,7,50.666667,6
197,2019-12-14,F.N,Lil Tjay,56,78.785714,14
198,2019-12-14,Into The Unknown,Panic! At The Disco,98,98.500000,2


In [48]:
Hot100['Artist'].value_counts().sort_values(ascending = False)[:25]

Glee Cast           183
Drake                90
Taylor Swift         84
Eminem               41
Kenny Chesney        40
Justin Bieber        38
Keith Urban          36
Tim McGraw           36
Rascal Flatts        35
Toby Keith           34
Kanye West           33
Beyonce              33
Brad Paisley         32
Future               32
Jason Aldean         31
Carrie Underwood     30
The Weeknd           30
Lil Wayne            30
One Direction        29
Blake Shelton        29
J. Cole              29
Luke Bryan           28
George Strait        28
Chris Brown          28
Ariana Grande        28
Name: Artist, dtype: int64

In [46]:
Hot100.groupby('Artist')['WeeksOnChart'].sum().sort_values(ascending = False)[:25]

Artist
Taylor Swift           935
Drake                  707
Kenny Chesney          681
Keith Urban            653
Rascal Flatts          602
Tim McGraw             585
Rihanna                566
P!nk                   560
Brad Paisley           546
Carrie Underwood       528
Toby Keith             524
Jason Aldean           522
Kelly Clarkson         501
Blake Shelton          501
Beyonce                490
Maroon 5               478
Luke Bryan             475
George Strait          459
Chris Brown            427
Nickelback             425
Dierks Bentley         418
Imagine Dragons        409
Katy Perry             387
The Black Eyed Peas    382
Bruno Mars             381
Name: WeeksOnChart, dtype: int64

In [75]:
Hot100(Hot100['Rank'] == Hot100['PeakPosition'])

#Hot100.head(10)


#.map(Hot100.groupby(['Title', 'Artist'])['Rank'].value_counts())
               
               #== Hot100['PeakPosition']))


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/ZEE/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-75-559a9b1c9d5e>", line 1, in <module>
    Hot100(Hot100['Rank'] == Hot100['PeakPosition'])
TypeError: 'DataFrame' object is not callable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ZEE/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2034, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'TypeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ZEE/anaconda3/lib/python3.7/site-packages/IPython/core/ultratb.py", line 1151, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/

TypeError: 'DataFrame' object is not callable

In [33]:
Hot100.head(10)

Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks,PeakPosition,MeanRank,WeeksOnChart,WeeksAtPeak
0,2020-02-01,The Box,Roddy Ricch,1,1,7,1,12.0,7,1050
1,2020-02-01,Life Is Good,Future Featuring Drake,2,2,2,2,2.0,2,1050
2,2020-02-01,Godzilla,Eminem Featuring Juice WRLD,3,3,1,3,3.0,1,1050
3,2020-02-01,Circles,Post Malone,4,1,21,1,3.857143,21,1050
4,2020-02-01,Memories,Maroon 5,5,2,18,2,9.444444,18,1050
5,2020-02-01,"10,000 Hours",Dan + Shay & Justin Bieber,6,4,16,4,9.6875,16,1050
6,2020-02-01,Dance Monkey,Tones And I,7,7,16,7,25.9375,16,1050
7,2020-02-01,Someone You Loved,Lewis Capaldi,8,1,37,1,20.297297,37,1050
8,2020-02-01,Roxanne,Arizona Zervas,9,4,12,4,11.083333,12,1050
9,2020-02-01,Lose You To Love Me,Selena Gomez,10,1,14,1,9.071429,14,1050


In [51]:
a = Hot100[Hot100['PeakPosition'] == 1]

In [52]:
a = a[a['WeeksAtPeak'] > 0]

In [72]:
a.groupby(['Title', 'Artist'])['WeeksAtPeak'].max().sort_values(ascending = False)[:50]

Title                         Artist                                           
Old Town Road                 Lil Nas X Featuring Billy Ray Cyrus                  19
Despacito                     Luis Fonsi & Daddy Yankee Featuring Justin Bieber    16
We Belong Together            Mariah Carey                                         14
Uptown Funk!                  Mark Ronson Featuring Bruno Mars                     14
I Gotta Feeling               The Black Eyed Peas                                  14
Boom Boom Pow                 The Black Eyed Peas                                  12
Shape Of You                  Ed Sheeran                                           12
Blurred Lines                 Robin Thicke Featuring T.I. + Pharrell               12
Closer                        The Chainsmokers Featuring Halsey                    12
Lose Yourself                 Eminem                                               12
Yeah!                         Usher Featuring Lil Jon & Luda

In [66]:
a.head(200)

Unnamed: 0,Date,Title,Artist,Rank,Peak,Weeks,PeakPosition,WeeksAtPeak,MeanRank,WeeksOnChart
0,2020-02-01,The Box,Roddy Ricch,1,1,7,1,3,12.000000,7
3,2020-02-01,Circles,Post Malone,4,1,21,1,3,3.857143,21
7,2020-02-01,Someone You Loved,Lewis Capaldi,8,1,37,1,3,20.297297,37
9,2020-02-01,Lose You To Love Me,Selena Gomez,10,1,14,1,1,9.071429,14
16,2020-02-01,HIGHEST IN THE ROOM,Travis Scott,17,1,16,1,1,15.375000,16
...,...,...,...,...,...,...,...,...,...,...
1918,2019-09-21,Sucker,Jonas Brothers,19,1,28,1,1,14.238095,42
1939,2019-09-21,Without Me,Halsey,40,1,49,1,2,12.250000,52
2000,2019-09-14,Truth Hurts,Lizzo,1,1,18,1,7,13.921053,38
2001,2019-09-14,Senorita,Shawn Mendes & Camila Cabello,2,1,11,1,1,7.483871,31
