# KV Project 2: Exploring the billboard Top 100, circa 2000
## Step 1: Data exploration


In [1]:
# Load libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import random
import datetime
import scipy.stats as stats
%matplotlib inline

In [2]:
# Read the Billboard CSV (cp1252 encoding) into a DataFrame and print its (rows, columns)

csv_path = "billboard.csv"
bb_100 = pd.read_csv(csv_path, encoding="cp1252")
print(bb_100.shape)

(317, 83)


In [3]:
# Preview the first 15 rows of the raw table

bb_100.head(n=15)

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,
5,2000,Janet,Doesn't Really Matter,4:17,Rock,2000-06-17,2000-08-26,59,52.0,43.0,...,,,,,,,,,,
6,2000,Destiny's Child,Say My Name,4:31,Rock,1999-12-25,2000-03-18,83,83.0,44.0,...,,,,,,,,,,
7,2000,"Iglesias, Enrique",Be With You,3:36,Latin,2000-04-01,2000-06-24,63,45.0,34.0,...,,,,,,,,,,
8,2000,Sisqo,Incomplete,3:52,Rock,2000-06-24,2000-08-12,77,66.0,61.0,...,,,,,,,,,,
9,2000,Lonestar,Amazed,4:25,Country,1999-06-05,2000-03-04,81,54.0,44.0,...,,,,,,,,,,


##### Write a brief description of your data, and any interesting observations you've made thus far. 

The dataset contains 317 tracks/singles that made it to the billboard top 100 AROUND the year 2000, along with the artist name, track length and genre. The billboard top 100 provides weekly rankings of singles based on the amount of radio play, online streaming, and physical and digital sales (source: Wikipedia). Given that the data is from circa 2000, one can assume that radio play and physical sales contributed predominantly to the rankings. 

The dataset also includes the dates that a track entered and peaked in the top 100, as well as its position/rank every week for 76 weeks, starting the week that it made it to the top 100.

Here are a couple of observations thus far:

(1) There seems to be a predominance of Rock tracks in the top 100

(2) Some tracks seem to have entered the top 100 before the first week of Dec 1999, which is the official start of the 2000 chart "year" (first week of Dec 1999 through the last week of Nov 2000)  

## Step 2: Data cleaning

##### 2.1: Rename columns

In [4]:
# Rename artist.inverted to artist
bb_100 = bb_100.rename(columns={'artist.inverted': 'artist'})

# Rename the weekly-rank columns ("x1st.week" ... "x76th.week") to their week number ("1" ... "76").
# Columns are picked by name rather than by position, and the week number is taken from the
# digits in the name, so re-running this cell after other columns have been added is a no-op
# instead of renaming unrelated columns.
new_col_dict = {
    col: "".join(ch for ch in col if ch.isdigit())
    for col in bb_100.columns
    if col.startswith("x") and col.endswith(".week")
}
bb_100 = bb_100.rename(columns=new_col_dict)

bb_100.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,1,2,3,...,67,68,69,70,71,72,73,74,75,76
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,,,,
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,,,,


##### 2.2: Date and time manipulations

In [5]:
# Check data types (the trailing semicolon suppresses this cell's displayed output)
bb_100.dtypes;

In [6]:
# Calculate track length in seconds from the "m:ss" strings, and add it to the table
temp_time = bb_100["time"].str.split(":")
bb_100["time(sec)"] = [int(parts[0]) * 60 + int(parts[1]) for parts in temp_time]

# Parse date.entered and date.peaked as datetimes.
# pd.to_datetime is used because astype("datetime64") without a unit is rejected by pandas >= 2.0.
bb_100["date.entered"] = pd.to_datetime(bb_100["date.entered"])
bb_100["date.peaked"] = pd.to_datetime(bb_100["date.peaked"])

# Calculate time to peak in whole days.
# .dt.days replaces astype('timedelta64[D]'), which pandas >= 2.0 no longer supports;
# the result is cast to float64 to keep the dtype that the older cast produced.
time_to_peak = bb_100["date.peaked"] - bb_100["date.entered"]
bb_100["Time_to_peak (days)"] = time_to_peak.dt.days.astype("float64")

bb_100.dtypes;

##### 2.3: Calculate number of weeks and average ranking

In [7]:
# Find out the number of weeks each track has been on the billboard, and average rating for each track.
# The weekly-rank columns are the ones whose names are plain week numbers ("1" ... "76").
# Selecting them by name (rather than by position) keeps this cell correct when it is re-run
# after num_of_weeks / av_ranking have already been appended to the frame.
col_list = [col for col in bb_100.columns if str(col).isdigit()]
data_weeks = bb_100[col_list]

# Add these quantities to the dataframe:
# count() ignores NaN, so it gives the number of weeks with a recorded rank;
# mean() likewise averages only the weeks with a recorded rank.
bb_100["num_of_weeks"] = data_weeks.count(axis=1)
bb_100["av_ranking"] = data_weeks.mean(axis=1)

bb_100.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,1,2,3,...,71,72,73,74,75,76,time(sec),Time_to_peak (days),num_of_weeks,av_ranking
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,78,63.0,49.0,...,,,,,,,218,56.0,28,14.821429
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,15,8.0,6.0,...,,,,,,,258,56.0,26,10.5
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,71,48.0,43.0,...,,,,,,,247,98.0,33,17.363636
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,41,23.0,18.0,...,,,,,,,225,35.0,24,13.458333
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,57,47.0,45.0,...,,,,,,,218,70.0,21,19.952381


##### Using Pandas' built-in `melt` function, pivot the weekly ranking data to be long rather than wide. As a result, you will have removed the 76 'week' columns and replaced them with two: Week and Ranking. There will now be multiple entries for each song, one for each week on the Billboard rankings.

In [8]:
# Reshape from wide to long: every column not listed as an identifier below
# is unpivoted into a (Week, Ranking) pair, giving one row per track per week column

id_columns = [
    "year", "artist", "track", "time", "genre",
    "date.entered", "date.peaked", "time(sec)", "Time_to_peak (days)",
    "num_of_weeks", "av_ranking",
]
bb_100_cleaned = bb_100.melt(id_vars=id_columns, var_name="Week", value_name="Ranking")

bb_100_cleaned.head()

Unnamed: 0,year,artist,track,time,genre,date.entered,date.peaked,time(sec),Time_to_peak (days),num_of_weeks,av_ranking,Week,Ranking
0,2000,Destiny's Child,Independent Women Part I,3:38,Rock,2000-09-23,2000-11-18,218,56.0,28,14.821429,1,78.0
1,2000,Santana,"Maria, Maria",4:18,Rock,2000-02-12,2000-04-08,258,56.0,26,10.5,1,15.0
2,2000,Savage Garden,I Knew I Loved You,4:07,Rock,1999-10-23,2000-01-29,247,98.0,33,17.363636,1,71.0
3,2000,Madonna,Music,3:45,Rock,2000-08-12,2000-09-16,225,35.0,24,13.458333,1,41.0
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),3:38,Rock,2000-08-05,2000-10-14,218,70.0,21,19.952381,1,57.0


## Step 3: Data visualization

In [21]:
# Write the wide, one-row-per-track dataframe to a CSV file for visualization in Tableau

export_path = "bb_100_cleaned1.csv"
bb_100.to_csv(export_path)

There are 2 categories of questions that can be visualized using graphs on the cleaned data:

1) Which track and which artist spent the most time in the top 100?

2) What factors correlate to number of weeks spent in the top 100?

##### 3.1 Who and what in the top 100?

Higher, by Creed, spent the most time (57 weeks) in the top 100
<img src="Weeksbytrack.png" width="350" height="250">
Creed's tracks spent the most time in total (104 weeks) in the top 100
<img src="Weeksbyartist.png" width="350" height="250">

##### 3.2 Does anything correlate with number of weeks spent in top 100?

There is no correlation between track length and number of weeks
<img src="Weeksbytracklength.png" width="350" height="250">
While tracks that entered in Q2 and Q3 of 1999 seem to have stayed longer in the top 100, very few tracks [less than 5 out of 317 total] entered the billboard during that period. More data [>30 tracks] from that time period would be helpful to further evaluate this hypothesis.
<img src="Weeksbydate.png" width="350" height="250">
Average time spent in the top 100 seems to be genre dependent, but most genres did not have enough tracks (samples). It may be interesting to explore whether the average number of weeks for Rock is significantly different from that for Country or Rap
<img src="Weeksbygenre.png" width="350" height="250">

## Step 4: Problem Statement


Determine whether the average time spent by a Rock track in the top 100 is significantly different than the average time spent by a Country track or a Rap track.

## Step 5: Approach


1) Plot the distribution of number of weeks for Rock, Country and Rap separately

2) If they are not normally distributed, **ASSUME** that the number of weeks for each track is an independent random variable, so that each genre's sample mean will be normal or nearly normal per the Central Limit Theorem (number of samples > 50 for each, in this case). 

3) Create arrays for number of weeks for each genre of interest. Check if sample variances are similar or different

4) Use scipy.stats to determine p-values for each pairwise comparison. Use Welch's t-test instead of the Student's t-test if sample variances are very different

5) Use a significance level of 0.05 to determine whether to reject, or fail to reject the null hypothesis

##### 5.1 and 5.2: Plot distributions. See if they are normally distributed


None of the distributions is normal; all of them are positively skewed. Assume the sample means are approximately normally distributed, since each sample size is >50
#### Rock
<img src="Rock.png" width="300" height="125">
#### Country
<img src="Country.png" width="300" height="125">
#### Rap
<img src="Rap.png" width="300" height="125">

##### 5.3: Create arrays of number of weeks for Rock, Country and Rap

In [14]:
# Weeks-on-chart samples for the three genres being compared, selected with a boolean mask
genre_col = bb_100["genre"]
rock = bb_100.loc[genre_col == "Rock", "num_of_weeks"]
country = bb_100.loc[genre_col == "Country", "num_of_weeks"]
rap = bb_100.loc[genre_col == "Rap", "num_of_weeks"]

# np.var is called with its default ddof=0, i.e. these are population variances
variance_report = "Variances - Rock: {}, Country: {}, Rap: {}"
print(variance_report.format(np.var(rock), np.var(country), np.var(rap)))

Variances - Rock: 103.18344078000965, Country: 52.14243973703431, Rap: 75.48662306777649


##### 5.4: Run Welch's t-test on each pair

In [17]:
# Pairwise Welch's t-tests: equal_var=False drops the equal-variance assumption,
# and nan_policy='omit' ignores NaN values in the samples
welch_kwargs = {"equal_var": False, "nan_policy": "omit"}
t_rock_country = stats.ttest_ind(rock, country, **welch_kwargs)
t_rock_rap = stats.ttest_ind(rock, rap, **welch_kwargs)
t_country_rap = stats.ttest_ind(country, rap, **welch_kwargs)

##### 5.5: Evaluate p-values to draw conclusions

In [20]:
# Report the p-value (second element of each test result) for every pairwise comparison
p_value_report = "P-values ... Rock-Country: {}, Rock-Rap: {}, Country-Rap: {}"
p_values = [result[1] for result in (t_rock_country, t_rock_rap, t_country_rap)]
print(p_value_report.format(*p_values))

P-values ... Rock-Country: 0.029174606128373227, Rock-Rap: 0.002512230707036884, Country-Rap: 0.21384007424877874


### Conclusions 

Rock tracks spent more time in the top 100 on average (18.9 weeks) than Country tracks (16.2 weeks) or Rap tracks (14.4 weeks), circa 2000, and both differences are statistically significant at the 0.05 level (p = 0.029 and p = 0.003, respectively). The difference between Country tracks and Rap tracks is not statistically significant (p = 0.21), so we fail to reject the hypothesis that they spent the same amount of time in the top 100. It would be interesting to see if this trend holds true for other years. It would also be interesting to see if the billboard Top 100 is biased towards Rock tracks (radio stations playing more Rock to cater to certain audiences, for example), resulting in these trends.

## Step 6: Blog Post

## https://karthik-33.github.io/billboard/