### Project 2 Requirements

1. Include a problem statement
2. State the risks and assumptions of your data
3. Import data using the Pandas library
4. Perform exploratory data analysis
5. Use Tableau and/or Python plotting modules to visualize data
6. Observe correlations in the data
7. Evaluate a hypothesis
8. Present results in a polished companion blog post of at least 500 words (& 1-2 graphics!)

### Project 2 Deliverables
1. Jupyter Notebook analysis
2. Blog Post

### Path
* Rename columns:
    * 'date.entered' and 'date.peaked'
    * 'artist.inverted' - maybe
    * 'x_.week'
* Convert data types from dtype object:
    * 'time' to seconds (timedelta)
    * 'date.entered' and 'date.peaked' to dates (timedate)

In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [203]:
df = pd.read_csv('/users/kristensu/dropbox/GA-DSI/DSI-copy/projects/projects-weekly/project-02/assets/billboard.csv')

In [204]:
df.columns

Index([u'year', u'artist.inverted', u'track', u'time', u'genre',
       u'date.entered', u'date.peaked', u'x1st.week', u'x2nd.week',
       u'x3rd.week', u'x4th.week', u'x5th.week', u'x6th.week', u'x7th.week',
       u'x8th.week', u'x9th.week', u'x10th.week', u'x11th.week', u'x12th.week',
       u'x13th.week', u'x14th.week', u'x15th.week', u'x16th.week',
       u'x17th.week', u'x18th.week', u'x19th.week', u'x20th.week',
       u'x21st.week', u'x22nd.week', u'x23rd.week', u'x24th.week',
       u'x25th.week', u'x26th.week', u'x27th.week', u'x28th.week',
       u'x29th.week', u'x30th.week', u'x31st.week', u'x32nd.week',
       u'x33rd.week', u'x34th.week', u'x35th.week', u'x36th.week',
       u'x37th.week', u'x38th.week', u'x39th.week', u'x40th.week',
       u'x41st.week', u'x42nd.week', u'x43rd.week', u'x44th.week',
       u'x45th.week', u'x46th.week', u'x47th.week', u'x48th.week',
       u'x49th.week', u'x50th.week', u'x51st.week', u'x52nd.week',
       u'x53rd.week', u'x54th.week', u'x

In [179]:
df.index

RangeIndex(start=0, stop=317, step=1)

In [180]:
df.head()

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,"3,38,00 AM",Rock,"September 23, 2000","November 18, 2000",78,63,49,...,*,*,*,*,*,*,*,*,*,*
1,2000,Santana,"Maria, Maria","4,18,00 AM",Rock,"February 12, 2000","April 8, 2000",15,8,6,...,*,*,*,*,*,*,*,*,*,*
2,2000,Savage Garden,I Knew I Loved You,"4,07,00 AM",Rock,"October 23, 1999","January 29, 2000",71,48,43,...,*,*,*,*,*,*,*,*,*,*
3,2000,Madonna,Music,"3,45,00 AM",Rock,"August 12, 2000","September 16, 2000",41,23,18,...,*,*,*,*,*,*,*,*,*,*
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),"3,38,00 AM",Rock,"August 5, 2000","October 14, 2000",57,47,45,...,*,*,*,*,*,*,*,*,*,*


In [181]:
df.dtypes

year                int64
artist.inverted    object
track              object
time               object
genre              object
date.entered       object
date.peaked        object
x1st.week           int64
x2nd.week          object
x3rd.week          object
x4th.week          object
x5th.week          object
x6th.week          object
x7th.week          object
x8th.week          object
x9th.week          object
x10th.week         object
x11th.week         object
x12th.week         object
x13th.week         object
x14th.week         object
x15th.week         object
x16th.week         object
x17th.week         object
x18th.week         object
x19th.week         object
x20th.week         object
x21st.week         object
x22nd.week         object
x23rd.week         object
                    ...  
x47th.week         object
x48th.week         object
x49th.week         object
x50th.week         object
x51st.week         object
x52nd.week         object
x53rd.week         object
x54th.week  

In [205]:
#Clean data step 1: Replace all the '*' values with NaN
def clean_data_1(x):
    if x == '*':
        return np.nan
    else:
        return x

In [206]:
df.applymap(clean_data_1)

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,"3,38,00 AM",Rock,"September 23, 2000","November 18, 2000",78,63,49,...,,,,,,,,,,
1,2000,Santana,"Maria, Maria","4,18,00 AM",Rock,"February 12, 2000","April 8, 2000",15,8,6,...,,,,,,,,,,
2,2000,Savage Garden,I Knew I Loved You,"4,07,00 AM",Rock,"October 23, 1999","January 29, 2000",71,48,43,...,,,,,,,,,,
3,2000,Madonna,Music,"3,45,00 AM",Rock,"August 12, 2000","September 16, 2000",41,23,18,...,,,,,,,,,,
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),"3,38,00 AM",Rock,"August 5, 2000","October 14, 2000",57,47,45,...,,,,,,,,,,
5,2000,Janet,Doesn't Really Matter,"4,17,00 AM",Rock,"June 17, 2000","August 26, 2000",59,52,43,...,,,,,,,,,,
6,2000,Destiny's Child,Say My Name,"4,31,00 AM",Rock'n'roll,"December 25, 1999","March 18, 2000",83,83,44,...,,,,,,,,,,
7,2000,"Iglesias, Enrique",Be With You,"3,36,00 AM",Latin,"April 1, 2000","June 24, 2000",63,45,34,...,,,,,,,,,,
8,2000,Sisqo,Incomplete,"3,52,00 AM",Rock'n'roll,"June 24, 2000","August 12, 2000",77,66,61,...,,,,,,,,,,
9,2000,Lonestar,Amazed,"4,25,00 AM",Country,"June 5, 1999","March 4, 2000",81,54,44,...,,,,,,,,,,


In [207]:
df['year'].dtypes

dtype('int64')

In [208]:
#Clean data step 2: Convert time into seconds
df_dict = {}
df_dict.update({x: df[x].dtypes for x in df.columns})

In [209]:
df['time'].dtypes

dtype('O')

In [210]:
df['time'] = df['time'].str.replace(',',':')

In [211]:
df['time'].head()

0    3:38:00 AM
1    4:18:00 AM
2    4:07:00 AM
3    3:45:00 AM
4    3:38:00 AM
Name: time, dtype: object

In [189]:
df['time'] = df['time'].str.strip('AM')

In [190]:
df['time'].head()

0    3:38:00 
1    4:18:00 
2    4:07:00 
3    3:45:00 
4    3:38:00 
Name: time, dtype: object

In [191]:
df['time'] = pd.to_timedelta(df['time'], unit = 's')

In [192]:
df['time'].head()

0   03:38:00
1   04:18:00
2   04:07:00
3   03:45:00
4   03:38:00
Name: time, dtype: timedelta64[ns]

In [193]:
#Clean data step 4: Convert column names with '.' character
###Why doesn't this work??
df.replace(df.columns, [lambda x: x.replace('_', '.') for x in df.columns])

Unnamed: 0,year,artist.inverted,track,time,genre,date.entered,date.peaked,x1st.week,x2nd.week,x3rd.week,...,x67th.week,x68th.week,x69th.week,x70th.week,x71st.week,x72nd.week,x73rd.week,x74th.week,x75th.week,x76th.week
0,2000,Destiny's Child,Independent Women Part I,03:38:00,Rock,"September 23, 2000","November 18, 2000",78,63,49,...,*,*,*,*,*,*,*,*,*,*
1,2000,Santana,"Maria, Maria",04:18:00,Rock,"February 12, 2000","April 8, 2000",15,8,6,...,*,*,*,*,*,*,*,*,*,*
2,2000,Savage Garden,I Knew I Loved You,04:07:00,Rock,"October 23, 1999","January 29, 2000",71,48,43,...,*,*,*,*,*,*,*,*,*,*
3,2000,Madonna,Music,03:45:00,Rock,"August 12, 2000","September 16, 2000",41,23,18,...,*,*,*,*,*,*,*,*,*,*
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),03:38:00,Rock,"August 5, 2000","October 14, 2000",57,47,45,...,*,*,*,*,*,*,*,*,*,*
5,2000,Janet,Doesn't Really Matter,04:17:00,Rock,"June 17, 2000","August 26, 2000",59,52,43,...,*,*,*,*,*,*,*,*,*,*
6,2000,Destiny's Child,Say My Name,04:31:00,Rock'n'roll,"December 25, 1999","March 18, 2000",83,83,44,...,*,*,*,*,*,*,*,*,*,*
7,2000,"Iglesias, Enrique",Be With You,03:36:00,Latin,"April 1, 2000","June 24, 2000",63,45,34,...,*,*,*,*,*,*,*,*,*,*
8,2000,Sisqo,Incomplete,03:52:00,Rock'n'roll,"June 24, 2000","August 12, 2000",77,66,61,...,*,*,*,*,*,*,*,*,*,*
9,2000,Lonestar,Amazed,04:25:00,Country,"June 5, 1999","March 4, 2000",81,54,44,...,*,*,*,*,*,*,*,*,*,*


In [194]:
#Clean data step 4: Convert column names with '.' character
df.rename(columns = lambda x: x.replace('.', '_'), inplace=True)

In [195]:
#Clean data step 5: Convert 'date_entered' and 'date_peaked' to datetime

In [196]:
df[['date_entered', 'date_peaked']].head()

Unnamed: 0,date_entered,date_peaked
0,"September 23, 2000","November 18, 2000"
1,"February 12, 2000","April 8, 2000"
2,"October 23, 1999","January 29, 2000"
3,"August 12, 2000","September 16, 2000"
4,"August 5, 2000","October 14, 2000"


In [198]:
df[['date_entered', 'date_peaked']].dtypes

date_entered    object
date_peaked     object
dtype: object

In [200]:
df['date_entered'] = pd.to_datetime(df['date_entered'], infer_datetime_format=True)

In [201]:
df['date_entered'].head()

0   2000-09-23
1   2000-02-12
2   1999-10-23
3   2000-08-12
4   2000-08-05
Name: date_entered, dtype: datetime64[ns]

In [202]:
df.head()

Unnamed: 0,year,artist_inverted,track,time,genre,date_entered,date_peaked,x1st_week,x2nd_week,x3rd_week,...,x67th_week,x68th_week,x69th_week,x70th_week,x71st_week,x72nd_week,x73rd_week,x74th_week,x75th_week,x76th_week
0,2000,Destiny's Child,Independent Women Part I,03:38:00,Rock,2000-09-23,"November 18, 2000",78,63,49,...,*,*,*,*,*,*,*,*,*,*
1,2000,Santana,"Maria, Maria",04:18:00,Rock,2000-02-12,"April 8, 2000",15,8,6,...,*,*,*,*,*,*,*,*,*,*
2,2000,Savage Garden,I Knew I Loved You,04:07:00,Rock,1999-10-23,"January 29, 2000",71,48,43,...,*,*,*,*,*,*,*,*,*,*
3,2000,Madonna,Music,03:45:00,Rock,2000-08-12,"September 16, 2000",41,23,18,...,*,*,*,*,*,*,*,*,*,*
4,2000,"Aguilera, Christina",Come On Over Baby (All I Want Is You),03:38:00,Rock,2000-08-05,"October 14, 2000",57,47,45,...,*,*,*,*,*,*,*,*,*,*
