# Songs exploration and analysis
The objective of this notebook is to build a songs dataset with five per-song features computed from the user ratings: frequency, popularity, mean, standard deviation and median. This will allow us to dive deeper into our data.

In [82]:
import numpy as np
import pandas as pd

### Loading the dataset

In [60]:
def load_data(path="data/songsDataset.csv"):
    """Load the raw ratings CSV into a DataFrame.

    The first row of the file is treated as a header and discarded; the
    columns are renamed to ``user_id``, ``song_id`` and ``rating``.

    Args:
        path: Location of the ratings CSV. Defaults to the dataset path
            used throughout this notebook.

    Returns:
        DataFrame with columns ``["user_id", "song_id", "rating"]``.
    """
    return pd.read_csv(
        path,
        header=0,  # skip the file's own header row; `names` replaces it
        names=["user_id", "song_id", "rating"],
        encoding="utf-8",
    )

### Analytics functions

In [79]:
def frequency_analytics(data=None):
    """Count how many times each song has been rated.

    Args:
        data: DataFrame with at least ``song_id`` and ``rating`` columns,
            or ``None``.

    Returns:
        DataFrame with columns ``["song_id", "frequency"]``, one row per
        distinct ``song_id``; ``frequency`` is the number of non-null
        ratings for that song. When ``data`` is ``None`` an empty frame
        with the same columns is returned.
    """
    if data is None:
        # The original fell through to `return frequency_df` with the name
        # unbound, raising UnboundLocalError; return an empty result instead.
        return pd.DataFrame(columns=["song_id", "frequency"])

    return (
        data.groupby("song_id")["rating"]
        .count()
        .reset_index(name="frequency")
    )

def popularity_analytics(data=None):
    """Sum the ratings received by each song.

    Args:
        data: DataFrame with at least ``song_id`` and ``rating`` columns,
            or ``None``.

    Returns:
        DataFrame with columns ``["song_id", "popularity"]``, one row per
        distinct ``song_id``; ``popularity`` is the sum of that song's
        ratings. When ``data`` is ``None`` an empty frame with the same
        columns is returned.
    """
    if data is None:
        # The original fell through to `return popularity_df` with the name
        # unbound, raising UnboundLocalError; return an empty result instead.
        return pd.DataFrame(columns=["song_id", "popularity"])

    return (
        data.groupby("song_id")["rating"]
        .sum()
        .reset_index(name="popularity")
    )


def mean_analytics(data=None):
    """Compute the mean rating of each song.

    Args:
        data: DataFrame with at least ``song_id`` and ``rating`` columns,
            or ``None``.

    Returns:
        DataFrame with columns ``["song_id", "mean"]``, one row per
        distinct ``song_id``. When ``data`` is ``None`` an empty frame
        with the same columns is returned.
    """
    if data is None:
        # The original fell through to `return mean_df` with the name
        # unbound, raising UnboundLocalError; return an empty result instead.
        return pd.DataFrame(columns=["song_id", "mean"])

    return (
        data.groupby("song_id")["rating"]
        .mean()
        .reset_index(name="mean")
    )

def std_analytics(data=None):
    """Compute the standard deviation of the ratings of each song.

    Uses pandas' default for grouped ``std`` (sample standard deviation,
    ``ddof=1``), so a song with a single rating gets ``NaN``.

    Args:
        data: DataFrame with at least ``song_id`` and ``rating`` columns,
            or ``None``.

    Returns:
        DataFrame with columns ``["song_id", "std"]``, one row per
        distinct ``song_id``. When ``data`` is ``None`` an empty frame
        with the same columns is returned.
    """
    if data is None:
        # The original fell through to `return std_df` with the name
        # unbound, raising UnboundLocalError; return an empty result instead.
        return pd.DataFrame(columns=["song_id", "std"])

    return (
        data.groupby("song_id")["rating"]
        .std()
        .reset_index(name="std")
    )


def median_analytics(data=None):
    """Compute the median rating of each song.

    Args:
        data: DataFrame with at least ``song_id`` and ``rating`` columns,
            or ``None``.

    Returns:
        DataFrame with columns ``["song_id", "median"]``, one row per
        distinct ``song_id``. When ``data`` is ``None`` an empty frame
        with the same columns is returned.
    """
    if data is None:
        # The original fell through to `return median_df` with the name
        # unbound, raising UnboundLocalError; return an empty result instead.
        return pd.DataFrame(columns=["song_id", "median"])

    return (
        data.groupby("song_id")["rating"]
        .median()
        .reset_index(name="median")
    )

### Merging all the dataframes

In [80]:
def merge_all(frequency_df,popularity_df,mean_df,std_df,median_df):
    """Join the five per-song frames into a single frame keyed on song_id.

    The frames are merged left to right in argument order with pandas'
    default (inner) join, so the result keeps only the songs present in
    every input and its columns follow the order of the arguments.
    """
    merged = frequency_df
    # Fold the remaining frames onto the running result one at a time.
    for other in (popularity_df, mean_df, std_df, median_df):
        merged = merged.merge(other, on='song_id')
    return merged

### Main Execution

In [83]:
# End-to-end run: load the ratings, compute the per-song statistics,
# merge them into one frame, persist it, and preview the result.

# ------------ Loading the data ------------
# Reads the ratings CSV at load_data's default path.
data = load_data()

# ------------ Performing analytics ------------
# Each call returns a frame keyed on song_id plus one statistic column.
freq = frequency_analytics(data)
pop = popularity_analytics(data)
std = std_analytics(data)
mean = mean_analytics(data)
median = median_analytics(data)


# ------------ Merging the dataframes ------------
# NOTE(review): merge_all is declared as
# (frequency_df, popularity_df, mean_df, std_df, median_df), but `std` and
# `mean` are passed here in the opposite order. The merges are keyed on
# song_id and each frame already carries its own column name, so no values
# are mixed up; the only effect is that `std` precedes `mean` in the output.
# Left unchanged so the column order of the written CSV stays the same.
songs = merge_all(freq,pop,std,mean,median)

# Store the result in a CSV file.
# NOTE(review): no `index=` argument is given, so pandas' default applies and
# the row index is written as an unnamed first column; pass index=False if
# downstream readers do not need it.
songs.to_csv("data/songs.csv", encoding="utf-8")

# Preview the first 10 rows (last expression of the cell).
songs.head(10)

Unnamed: 0,song_id,frequency,popularity,std,mean,median
0,0,5,14,1.788854,2.8,3.0
1,1,2,5,2.12132,2.5,2.5
2,2,4,20,0.0,5.0,5.0
3,3,18,47,1.419979,2.611111,2.5
4,4,2,9,0.707107,4.5,4.5
5,6,2,7,0.707107,3.5,3.5
6,7,9,27,1.802776,3.0,3.0
7,8,4,19,0.5,4.75,5.0
8,9,4,15,1.892969,3.75,4.5
9,10,2,8,1.414214,4.0,4.0
