In [1]:
import pandas

In [2]:
pew = pandas.read_csv('../data/pew.csv')


In [3]:
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


### Reshape with one identifier variables (id_vars) 

In [4]:
pew_long = pandas.melt(pew, id_vars='religion')


In [5]:
pew_long.head()

Unnamed: 0,religion,variable,value
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


In [7]:
# rename measured variables (value_vars)
pew_long = pandas.melt(pew, id_vars='religion', 
                   var_name='income', value_name='count')


In [8]:
pew_long.head()

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15


### Use more columns as identifier variables (id_vars) 

In [9]:
billboard = pandas.read_csv('../data/billboard.csv')


In [10]:
billboard.head()


Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,


In [13]:
billboard_long = pandas.melt(billboard, 
                             id_vars=["year","artist","track","time","date.entered"],
                             var_name='week', value_name="rating")

In [14]:
billboard_long.head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0


In [15]:
billboard.shape

(317, 81)

In [17]:
billboard_long.shape

(24092, 7)

### Selecting subsets and reshaping

In [23]:
billboard_long[billboard_long['track'] == 'Open My Heart'].head()

Unnamed: 0,year,artist,track,time,date.entered,week,rating
9,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk1,76.0
326,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk2,76.0
643,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk3,74.0
960,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk4,69.0
1277,2000,"Adams, Yolanda",Open My Heart,5:30,2000-08-26,wk5,68.0


In [24]:
billboard_songs = billboard[['year', 'artist', 'track', 'time']]


In [25]:
billboard_songs.head()

Unnamed: 0,year,artist,track,time
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22
1,2000,2Ge+her,The Hardest Part Of ...,3:15
2,2000,3 Doors Down,Kryptonite,3:53
3,2000,3 Doors Down,Loser,4:24
4,2000,504 Boyz,Wobble Wobble,3:35


In [26]:
billboard_songs['id'] = range(len(billboard_songs))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [27]:
billboard_songs.head()

Unnamed: 0,year,artist,track,time,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,1
2,2000,3 Doors Down,Kryptonite,3:53,2
3,2000,3 Doors Down,Loser,4:24,3
4,2000,504 Boyz,Wobble Wobble,3:35,4


### Merge dataframe on common columns

In [28]:
billboard_ratings = billboard_long.merge(
    billboard_songs, on=['year', 'artist', 'track', 'time']
)

In [29]:
billboard_ratings.head()


Unnamed: 0,year,artist,track,time,date.entered,week,rating,id
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0,0
1,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk2,82.0,0
2,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk3,72.0,0
3,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk4,77.0,0
4,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk5,87.0,0


In [30]:
billboard_ratings.shape

(24092, 8)

In [31]:
billboard_ratings = billboard_ratings[['id', 'date.entered', 'week', 'rating']]


In [32]:
billboard_songs.to_csv('../data/billboard_songs.csv', index=False)


In [33]:
billboard_ratings.to_csv('../data/billboard_ratings.csv', index=False)
