In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the Spotify track table; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer='data/spotify_data.csv')

## Distribution analysis

In [None]:
# Count tracks per genre (most frequent first) and draw one bar per genre.
genre_distribution = (
    df['genre']
    .value_counts()
    .rename_axis('Genre')
    .reset_index(name='Count')
)
fig = px.bar(
    genre_distribution,
    x='Genre',
    y='Count',
    title='Distribution of Songs Across Genres',
)
fig.update_layout(xaxis_title='Genre', yaxis_title='Count')
fig.show()

#### The chart above is hard to read, so we look only at the 15 genres with the most songs.

In [None]:
# Keep only the 15 most frequent genres so the bar chart stays legible.
# value_counts() sorts by descending count, so head(15) yields the top 15.
genre_distribution = df['genre'].value_counts().head(15).reset_index()
genre_distribution.columns = ['Genre', 'Count']
# The bars are genres ranked by how many songs they contain, so the title
# names genres rather than songs.
fig = px.bar(
    genre_distribution,
    x='Genre',
    y='Count',
    title='Top 15 Genres by Number of Songs',
)
fig.update_layout(xaxis_title='Genre', yaxis_title='Count')
fig.show()

### Black-metal, gospel and ambient are the three genres with the most songs

In [None]:
# Numeric columns whose distributions are inspected below.
numerical_attributes = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'popularity',
]

# Draw one histogram per numeric column.
for attribute in numerical_attributes:
    fig = px.histogram(data_frame=df, x=attribute, title='Histogram of ' + attribute)
    fig.show()


### Popularity is unevenly distributed. We therefore keep only tracks with a popularity from 1 to 40; the intention is to rescale these values to a 1-10 range.

In [None]:
# Keep the rows whose popularity is greater than 0 and less than 41.
df_popular = df.loc[df['popularity'].gt(0) & df['popularity'].lt(41)]
df_popular

In [None]:
# Re-plot popularity using only the filtered subset.
fig = px.histogram(
    data_frame=df_popular,
    x='popularity',
    title='Histogram of popularity',
)
fig.show()

### The new distribution of popularity looks more even

In [None]:
# `genre_counts` was displayed here without ever being defined, so the cell
# raised NameError on a fresh kernel.
# NOTE(review): assumed to be the number of tracks per genre within the
# popularity subset `df_popular` -- confirm against the original analysis.
genre_counts = df_popular['genre'].value_counts()
genre_counts

### We should eliminate the 'songwriter' genre

## Correlation analysis

In [None]:
# Columns included in the pairwise correlation matrix.
numerical_attributes = [
    'popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]
correlation_matrix = df.loc[:, numerical_attributes].corr()

# Annotated heatmap of the correlation matrix on an explicitly created axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Numerical Variables')
plt.show()

### Energy appears fairly strongly correlated with acousticness and loudness. Whether to drop any of these features will be decided by the feature selection analysis.

## Outlier analysis

![outlier analysis](https://di.samizdat.co/2020/content/images/2020/01/image-3.png)

In [None]:
# Columns covered by the quartile / outlier / skewness report.
numerical_attributes = [
    'popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

def analyze_numerical_attribute(attribute, data=None):
    """Print quartiles, the IQR-based outlier share and skewness of a column.

    Parameters
    ----------
    attribute : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level ``df``
        is used, so existing calls with a single argument keep working.

    Returns
    -------
    None
        The report is written to standard output.

    Notes
    -----
    Missing values are dropped before any statistic is computed, because
    ``np.percentile`` returns NaN as soon as one NaN is present. The outlier
    percentage is therefore relative to the non-missing values.
    """
    frame = df if data is None else data
    values = frame[attribute].dropna()

    print(f'Attribute: {attribute}')
    if values.empty:
        # Nothing to summarise; also avoids dividing by zero below.
        print('No non-missing values to analyze.\n')
        return

    Q1, Q2, Q3 = np.percentile(values, [25, 50, 75])
    IQR = Q3 - Q1
    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (values < lower_bound) | (values > upper_bound)
    percentage_outliers = is_outlier.sum() / len(values) * 100
    skewness = values.skew()

    if np.isnan(skewness):
        # pandas reports NaN when skewness cannot be computed (e.g. too few
        # values); do not label that case as any particular shape.
        skew_type = 'Undefined'
    elif skewness > 1:
        skew_type = 'Right-skewed'
    elif skewness < -1:
        skew_type = 'Left-skewed'
    else:
        skew_type = 'Normal'

    print(f'Q1 (25th percentile): {Q1:.2f}')
    print(f'Median (Q2, 50th percentile): {Q2:.2f}')
    print(f'Q3 (75th percentile): {Q3:.2f}')
    print(f'IQR (Interquartile Range): {IQR:.2f}')
    print(f'Percentage of data outside outlier range (Q1-1.5*IQR and Q3+1.5*IQR): {percentage_outliers:.2f}%')
    print(f'Skewness: {skewness:.2f} ({skew_type})\n')

# Print the quartile / outlier / skewness report for every listed column.
for attribute in numerical_attributes:
    analyze_numerical_attribute(attribute=attribute)


In [None]:
# Mean popularity of each genre, highest first.
genre_popularity = (
    df.groupby('genre')['popularity']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the per-genre means on an explicitly created axes.
popularity_fig, popularity_ax = plt.subplots(figsize=(12, 6))
genre_popularity.plot(kind='bar', color='skyblue', ax=popularity_ax)
popularity_ax.set_title('Average Popularity by Genre')
popularity_ax.set_xlabel('Genre')
popularity_ax.set_ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.tight_layout()

plt.show()


In [None]:
# Mean popularity per genre, restricted to the ten highest means.
mean_popularity_by_genre = df.groupby('genre')['popularity'].mean()
genre_popularity = mean_popularity_by_genre.sort_values(ascending=False).iloc[:10]

plt.figure(figsize=(12, 6))
top_genre_ax = genre_popularity.plot.bar(color='skyblue')
top_genre_ax.set(
    title='Average Popularity by Genre',
    xlabel='Genre',
    ylabel='Average Popularity',
)
plt.xticks(rotation=90)
plt.tight_layout()

plt.show()


### The pop genre has the highest average popularity

In [None]:
# Count the missing entries in every column and print them under a heading.
missing_data = df.isna().sum()
print("Number of missing data in each variable:", missing_data, sep="\n")

### The data does not have missing values. We do not use the artist_name column in our analysis.

In [None]:
# Summary statistics for every column; include='all' also covers
# non-numeric columns.
descriptive_stats = df.describe(
    include='all',
)
descriptive_stats