## Análisis Descriptivo: ¿Quién llegará al número uno en Billboard 100?

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

In [None]:
pwd

## Cargamos los datos de entrada

In [None]:
# Load the chart records; the path is relative to the working directory
# and the file is decoded as latin1.
artists_billboard = pd.read_csv(
    'artists_billboard_fix3.csv',
    sep=",",
    encoding='latin1',
)

In [None]:
artists_billboard.shape

In [None]:
artists_billboard.head()

In [None]:
artists_billboard.info()

## ¿Cuántos alcanzaron el número 1?

In [None]:
artists_billboard.groupby('top').size()

In [None]:
# Relative frequency of each `top` class (shares instead of raw counts).
artists_billboard['top'].value_counts(normalize=True)

In [None]:
# Number of records in each `top` class.
# The frame is passed through `data=` like every other countplot call in this
# notebook; older seaborn releases do not accept it as the first positional argument.
sns.countplot(x='top', data=artists_billboard)

In [None]:
artists_billboard.describe()

In [None]:
# Distribution of song duration per artist type, split by `top` (before any capping).
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(
    data=artists_billboard,
    x="artist_type",
    y="durationSeg",
    hue="top",
    ax=ax,
)

In [None]:
def outlier_capping(x, upper_quantile=0.95):
    """Cap the values of ``x`` at one of its own upper quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to cap (any object exposing ``quantile`` and ``clip``).
    upper_quantile : float, default 0.95
        Quantile of ``x`` used as the ceiling. The default keeps the
        behaviour of the original hard-coded 95th percentile.

    Returns
    -------
    Same type as ``x``
        A new object in which every value above the ceiling is replaced by
        the ceiling; ``x`` itself is not modified.
    """
    ceiling = x.quantile(upper_quantile)
    return x.clip(upper=ceiling)

In [None]:
# Cap `durationSeg` column-wise; the double brackets keep a one-column DataFrame
# so the result can be concatenated back onto the frame.
artists_billboard_tratamiento = artists_billboard[['durationSeg']].apply(outlier_capping)

In [None]:
artists_billboard_tratamiento.head(3)

In [None]:
# Swap the raw duration for its capped version; the capped column ends up last.
artists_billboard = pd.concat(
    [
        artists_billboard.drop(columns=['durationSeg']),
        artists_billboard_tratamiento,
    ],
    axis=1,
)

In [None]:
artists_billboard.head(5)

In [None]:
artists_billboard.describe()

In [None]:
# Same duration-by-artist-type box plot, now drawn on the capped durations.
fig, ax = plt.subplots(figsize=(8, 4))
sns.boxplot(
    data=artists_billboard,
    x="artist_type",
    y="durationSeg",
    hue="top",
    ax=ax,
)

In [None]:
# Song duration per genre, split by `top`.
fig, ax = plt.subplots(figsize=(16, 4))
sns.boxplot(
    data=artists_billboard,
    x="genre",
    y="durationSeg",
    hue="top",
    ax=ax,
)

In [None]:
# Song duration per mood, split by `top`; the wide figure leaves room for the mood labels.
fig, ax = plt.subplots(figsize=(20, 4))
sns.boxplot(
    data=artists_billboard,
    x="mood",
    y="durationSeg",
    hue="top",
    ax=ax,
)

### Visualizamos los años de nacimiento de los artistas

In [None]:
# Number of records per birth year.
fig, ax = plt.subplots(figsize=(20, 4))
sns.countplot(data=artists_billboard, x='anioNacimiento', ax=ax)

In [None]:
# Count plot of birth years with the x tick labels drawn vertically.
# The rotation must come after the plot: calling plt.xticks() first only touches
# the placeholder ticks of a still-empty axes, and the category ticks seaborn
# adds afterwards are not guaranteed to inherit it.
sns.countplot(x='anioNacimiento', data=artists_billboard)
plt.xticks(rotation=90);


In [None]:
# Horizontal bar chart built from the birth-year variable.
# With orient="h", `top` supplies the categories on the y axis and each bar is
# seaborn's default aggregate (the mean) of `anioNacimiento` for that class.
# NOTE(review): when this cell runs, unknown birth years are still stored as 0,
# so they pull the bars down — confirm this is the intended chart.
sns.barplot(
    data=artists_billboard,
    x="anioNacimiento",
    y="top",
    orient="h",
)


In [None]:
# Horizontal bars: number of records per birth year, most frequent year first.
artists_billboard['anioNacimiento'].value_counts().plot.barh()

### Calculamos promedio de edad y asignamos a los registros Nulos

In [None]:
artists_billboard.head(2)

In [None]:
def calcula_edad(anio, cuando):
    """Return the age reached in the year of a chart date.

    Parameters
    ----------
    anio : number
        Birth year. A value equal to 0 is the placeholder for "unknown".
    cuando : any
        Chart date; it is converted to ``str`` and its first four characters
        are read as the year.

    Returns
    -------
    number or None
        ``year(cuando) - anio``, or ``None`` when ``anio`` is 0.
    """
    if anio == 0.0:
        return None
    chart_year = int(str(cuando)[:4])
    return chart_year - anio

# Row by row, derive the age at chart time from the birth year and the chart date.
# calcula_edad returns None for a birth year of 0, so those rows end up missing.
artists_billboard['edad_en_billboard']=artists_billboard.apply(lambda x: calcula_edad(x['anioNacimiento'],x['chart_date']), axis=1);

In [None]:
artists_billboard.head()

In [None]:
artists_billboard.describe()

In [None]:
def edad_fix(anio):
    """Turn the placeholder birth year 0 into ``None``; return any other value unchanged."""
    return None if anio == 0 else anio

# Replace the placeholder birth year 0 with a missing value, row by row.
# Note: this runs after the age column was computed, so the ages are not affected.
artists_billboard['anioNacimiento']=artists_billboard.apply(lambda x: edad_fix(x['anioNacimiento']), axis=1);

In [None]:
artists_billboard['edad_en_billboard'].isnull().sum()

In [None]:
# Fill the missing ages with random integers drawn from [mean - std, mean + std).
age_avg = artists_billboard['edad_en_billboard'].mean()
age_std = artists_billboard['edad_en_billboard'].std()

# Build the null mask once and reuse it for the count and for the assignment.
conValoresNulos = artists_billboard['edad_en_billboard'].isna()
age_null_count = conValoresNulos.sum()

# Integer bounds are spelled out (they match the interval printed below), and the
# generator is seeded so Restart & Run All reproduces the same imputed ages.
rng = np.random.default_rng(42)
age_null_random_list = rng.integers(int(age_avg - age_std), int(age_avg + age_std), size=age_null_count)

artists_billboard.loc[conValoresNulos, 'edad_en_billboard'] = age_null_random_list

# No missing values remain, so the column can be stored as integers.
artists_billboard['edad_en_billboard'] = artists_billboard['edad_en_billboard'].astype(int)

print("Edad Promedio: " + str(age_avg))
print("Desvió Std Edad: " + str(age_std))
print("Intervalo para asignar edad aleatoria: " + str(int(age_avg - age_std)) + " a " + str(int(age_avg + age_std)))

In [None]:
artists_billboard.describe()

In [None]:
#sns.factorplot('anioNacimiento',data=artists_billboard,kind="count", aspect=3)

# Comparemos los Top y los No-top

### Buscamos si hay alguna relación evidente entre Año y duración de Canción

In [None]:
artists_billboard.head(3)

In [None]:
artists_billboard.describe()

In [None]:
# Age at chart time per artist type, split by `top`.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=artists_billboard,
    x="artist_type",
    y="edad_en_billboard",
    hue="top",
    ax=ax,
)

## Visualicemos los Atributos de entrada

In [None]:
# Records per artist type: bar chart plus the relative frequencies as a table.
sns.countplot(data=artists_billboard, x='artist_type')
artists_billboard['artist_type'].value_counts(normalize=True)

In [None]:
# `top` classes broken down by artist type, then count / sum / mean of `top` per type.
sns.countplot(data=artists_billboard, x='top', hue='artist_type')
(
    artists_billboard[['artist_type', 'top']]
    .groupby(['artist_type'], as_index=False)
    .agg(['count', 'sum', 'mean'])
)

In [None]:
# Records per mood: bar chart plus the relative frequencies as a table.
sns.countplot(data=artists_billboard, x='mood')
artists_billboard['mood'].value_counts(normalize=True)

In [None]:
# `top` classes broken down by mood, then count / sum / mean of `top` per mood.
sns.countplot(data=artists_billboard, x='top', hue='mood')
(
    artists_billboard[['mood', 'top']]
    .groupby(['mood'], as_index=False)
    .agg(['count', 'sum', 'mean'])
)

In [None]:
# Records per tempo split by `top`, then count / sum / mean of `top` per tempo.
sns.countplot(data=artists_billboard, x='tempo', hue='top')
(
    artists_billboard[['tempo', 'top']]
    .groupby(['tempo'], as_index=False)
    .agg(['count', 'sum', 'mean'])
)

In [None]:
# Records per genre, then count / sum / mean of `top` per genre.
sns.countplot(data=artists_billboard, x='genre')
(
    artists_billboard[['genre', 'top']]
    .groupby(['genre'], as_index=False)
    .agg(['count', 'sum', 'mean'])
)

In [None]:
# Mood mapping: collapse the mood labels into integer codes 0-6.
# Blank cells come out of read_csv as NaN, which the '' key below can never match;
# fillna('') routes them to that key so astype(int) does not fail on them.
# A label missing from the dict still raises at astype(int) rather than being
# silently assigned a code.
artists_billboard['moodEncoded'] = artists_billboard['mood'].fillna('').map( {'Energizing': 6,
                                        'Empowering': 6,
                                        'Cool': 5,
                                        'Yearning': 4, # longing, desire
                                        'Excited': 5, # thrilled
                                        'Defiant': 3,
                                        'Sensual': 2,
                                        'Gritty': 3, # courage
                                        'Sophisticated': 4,
                                        'Aggressive': 4, # provocative
                                        'Fiery': 4, # strong-tempered
                                        'Urgent': 3,
                                        'Rowdy': 4, # noisy, boisterous
                                        'Sentimental': 4,
                                        'Easygoing': 1, # simple
                                        'Melancholy': 4,
                                        'Romantic': 2,
                                        'Peaceful': 1,
                                        'Brooding': 4, # melancholic
                                        'Upbeat': 5, # optimistic, cheerful
                                        'Stirring': 5, # thrilling
                                        'Lively': 5, # animated
                                        'Other': 0,'':0} ).astype(int)

In [None]:
# Tempo mapping: integer code per tempo label.
# Blank cells are read from the CSV as NaN, which the '' key would never match;
# fillna('') sends them to that key so astype(int) does not fail on them.
# Unknown labels still raise at astype(int).
artists_billboard['tempoEncoded'] = artists_billboard['tempo'].fillna('').map(
    {'Fast Tempo': 0, 'Medium Tempo': 2, 'Slow Tempo': 1, '': 0}
).astype(int)

In [None]:
# Genre mapping: integer code per genre label (several genres share a code).
# Blank cells are read from the CSV as NaN, which the '' key would never match;
# fillna('') sends them to that key so astype(int) does not fail on them.
# Unknown labels still raise at astype(int).
artists_billboard['genreEncoded'] = artists_billboard['genre'].fillna('').map( {'Urban': 4,
                                          'Pop': 3,
                                          'Traditional': 2,
                                          'Alternative & Punk': 1,
                                          'Electronica': 1,
                                          'Rock': 1,
                                          'Soundtrack': 0,
                                          'Jazz': 0,
                                          'Other':0,'':0}
                                       ).astype(int)

In [None]:
# artist_type mapping: integer code per artist type.
# Blank cells are read from the CSV as NaN, which the '' key would never match;
# fillna('') sends them to that key so astype(int) does not fail on them.
# Unknown labels still raise at astype(int).
artists_billboard['artist_typeEncoded'] = artists_billboard['artist_type'].fillna('').map(
    {'Female': 2, 'Male': 3, 'Mixed': 1, '': 0}
).astype(int)


In [None]:
# Encode the age at chart time as an ordinal bucket:
#   <= 21 -> 0 | (21, 26] -> 1 | (26, 30] -> 2 | (30, 40] -> 3 | > 40 -> 4
# pd.cut intervals are right-closed, so every bucket includes its upper bound.
# The codes are stored as floats.
artists_billboard['edadEncoded'] = pd.cut(
    artists_billboard['edad_en_billboard'],
    bins=[-np.inf, 21, 26, 30, 40, np.inf],
    labels=False,
).astype(float)

In [None]:
# Encode the song duration (seconds) as an ordinal bucket, one step per 30 s:
#   <= 150 -> 0 | (150, 180] -> 1 | (180, 210] -> 2 | (210, 240] -> 3
#   (240, 270] -> 4 | (270, 300] -> 5 | > 300 -> 6
# pd.cut intervals are right-closed, so every bucket includes its upper bound.
# The codes are stored as floats.
artists_billboard['durationEncoded'] = pd.cut(
    artists_billboard['durationSeg'],
    bins=[-np.inf, 150, 180, 210, 240, 270, 300, np.inf],
    labels=False,
).astype(float)

In [None]:
artists_billboard.head()

In [None]:
# Drop identifiers and the raw columns that now have an encoded counterpart;
# what remains is expected to be `top` plus the *Encoded features.
drop_elements = [
    'id', 'title', 'artist',
    'mood', 'tempo', 'genre', 'artist_type',
    'chart_date', 'anioNacimiento', 'durationSeg', 'edad_en_billboard',
]
artists_encoded = artists_billboard.drop(columns=drop_elements)

### Analizamos nuestros datos de Entrada Categóricos

In [None]:
artists_encoded.head(10)

In [None]:
artists_encoded.describe()

In [None]:
artists_encoded.head(1)

In [None]:
# Annotated heatmap of the pairwise Pearson correlations between all encoded columns.
colormap = plt.cm.viridis
plt.figure(figsize=(6, 6))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    artists_encoded.astype(float).corr(),
    annot=True,
    cmap=colormap,
    square=True,
    vmax=1.0,
    linewidths=0.1,
    linecolor='white',
)

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
# Separate the features from the target, then keep the three features that
# SelectKBest scores highest with its default scoring function.
X = artists_encoded.drop(columns=['top'])
y = artists_encoded['top']

best = SelectKBest(k=3)
X_new = best.fit_transform(X, y)

# Map the positions of the retained columns back to their names.
selected = best.get_support(indices=True)
print(X.columns[selected])

In [None]:
# Mean, count and sum of `top` for each genre code.
(
    artists_encoded[['genreEncoded', 'top']]
    .groupby(['genreEncoded'], as_index=False)
    .agg(['mean', 'count', 'sum'])
)

In [None]:
# Mean, count and sum of `top` for each artist-type code.
(
    artists_encoded[['artist_typeEncoded', 'top']]
    .groupby(['artist_typeEncoded'], as_index=False)
    .agg(['mean', 'count', 'sum'])
)

In [None]:
# NOTE(review): exact repeat of the genreEncoded summary two cells above;
# possibly meant to summarise a different column (e.g. moodEncoded) — confirm.
artists_encoded[['genreEncoded', 'top']].groupby(['genreEncoded'], as_index=False).agg(['mean', 'count', 'sum'])

In [None]:
# Mean, count and sum of `top` for each tempo code.
(
    artists_encoded[['tempoEncoded', 'top']]
    .groupby(['tempoEncoded'], as_index=False)
    .agg(['mean', 'count', 'sum'])
)

In [None]:
# Mean, count and sum of `top` for each duration bucket.
(
    artists_encoded[['durationEncoded', 'top']]
    .groupby(['durationEncoded'], as_index=False)
    .agg(['mean', 'count', 'sum'])
)

In [None]:
# Mean, count and sum of `top` for each age bucket.
(
    artists_encoded[['edadEncoded', 'top']]
    .groupby(['edadEncoded'], as_index=False)
    .agg(['mean', 'count', 'sum'])
)