In [1]:
#All librariers necessary
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Call in the raw.csv scraped file
billboard = pd.read_csv("billboard_raw.csv", delimiter=',', encoding='utf-8')

In [3]:
# Display the dataframe
display(billboard)

Unnamed: 0,song_name,artists,rank
0,Uptown Funk!,Mark Ronson Featuring Bruno Mars,1
1,Party Rock Anthem,LMFAO Featuring Lauren Bennett & GoonRock,2
2,Shape Of You,Ed Sheeran,3
3,Closer,The Chainsmokers Featuring Halsey,4
4,Girls Like You,Maroon 5 Featuring Cardi B,5
...,...,...,...
95,Panda,Desiigner,96
96,Break Your Heart,Taio Cruz Featuring Ludacris,97
97,In My Feelings,Drake,98
98,Wrecking Ball,Miley Cyrus,99


In [4]:
# Call in the Kaggle dataset
top10 = pd.read_csv("top10s.csv", delimiter=',', encoding='ISO-8859-1')

FileNotFoundError: [Errno 2] No such file or directory: 'top10s.csv'

In [None]:
# Display the Kaggle Dataset
display(top10)

In [None]:
# Change title to song_name
top10 = top10.rename(columns= {"title": "song_name"})

In [None]:
# Merge the two dataframes into one on song_name using an inner join to only use the songs in both dataframes
merged_df = pd.merge(top10, billboard, on=['song_name'], how='inner')

In [None]:
# Display the merged dataframe
display(merged_df)

In [None]:
# Change all column names to follow the names in the final report data dictionary
merged_df = merged_df.rename(columns= {"nrgy": "energy"})
merged_df = merged_df.rename(columns= {"dnce": "dance"})
merged_df = merged_df.rename(columns= {"top genre": "top_genre"})
merged_df = merged_df.rename(columns= {"dur": "duration"})
merged_df = merged_df.rename(columns= {"acous": "acoustic"})
merged_df = merged_df.rename(columns= {"spch": "speech"})

# Only keep one artist page
merged_df = merged_df.drop(columns= {"artist"})
merged_df = merged_df.rename(columns= {"artists": "artist"})

In [None]:
# Create a binary column: 1 for multiple artists, 0 for a single artist
merged_df['multiple_artists'] = merged_df['artist'].apply(lambda x: 1 if any(keyword in x for keyword in ['Featuring', '+', '&', ',']) else 0)

# Display the updated DataFrame
print(merged_df[['artist', 'multiple_artists']].head())

In [None]:
duplicates = merged_df.duplicated(subset=['song_name'])

# Display the rows that have duplicate song names
print(merged_df[duplicates])

In [None]:
# Drop duplicate song names, keeping the first occurrence
merged_df_no_duplicates = merged_df.drop_duplicates(subset=['song_name'], keep='first')

In [None]:
final_df = merged_df_no_duplicates.reset_index(drop=True)

# Display the updated DataFrame with the new index
display(final_df)

In [None]:
# Create energy bins to use for later analysis
final_df['energy_bin'] = pd.cut(final_df['energy'], bins=[0, 50, 75, 100], labels=['Low', 'Medium', 'High'])

In [None]:
# Display final dataset to use
display(final_df)

## Question 1

What attributes have the strongest influence on a highly ranked song on Billboard1? Are there any attributes that have little to no influence on a songs’ ranking?

In [None]:
# Select only the relevant columns for correlation analysis
columns_of_interest = ['year', 'bpm', 'energy', 'dance', 'dB', 'val', 'duration', 'acoustic', 'speech', 'pop', 'rank']
data_subset = final_df[columns_of_interest]

# Calculate the correlation matrix
correlation_matrix = data_subset.corr()

# Create a heatmap to visualize the correlations
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

# Title for the heatmap
plt.title('Correlation Heatmap of Attributes vs Rank')

#Show heatmap
plt.show()

# Identify attributes with the highest and lowest correlation to rank
strong_correlation = correlation_matrix['rank'].abs().sort_values(ascending=False)

#Print correlation for each attribute
print("Attributes with the strongest influence on Rank:")
print(strong_correlation)


<br>

## Question 2

Do songs with multiple artists have any effect on song rank and song popularity? Is song rank or popularity affected more?

In [None]:
# Variables of interest
variables_of_interest = ['multiple_artists', 'rank', 'pop']
data_subset = final_df[variables_of_interest]

# Compare means for rank and popularity
grouped_data = data_subset.groupby('multiple_artists').mean()
mean_rank_multiple = grouped_data.loc[1, 'rank'] if 1 in grouped_data.index else float('nan')
mean_rank_single = grouped_data.loc[0, 'rank'] if 0 in grouped_data.index else float('nan')
mean_pop_multiple = grouped_data.loc[1, 'pop'] if 1 in grouped_data.index else float('nan')
mean_pop_single = grouped_data.loc[0, 'pop'] if 0 in grouped_data.index else float('nan')

# Calculate differences in means for visualization
rank_difference = mean_rank_multiple - mean_rank_single
pop_difference = mean_pop_multiple - mean_pop_single

# Print results
print("Mean Comparison:")
print(f"Rank (Multiple Artists): {mean_rank_multiple:.2f}, Rank (Single Artist): {mean_rank_single:.2f}")
print(f"Popularity (Multiple Artists): {mean_pop_multiple:.2f}, Popularity (Single Artist): {mean_pop_single:.2f}")
print(f"Difference in Rank Means: {rank_difference:.2f}")
print(f"Difference in Popularity Means: {pop_difference:.2f}")

# Visualize the differences
plt.figure(figsize=(12, 6))

# Boxplot for Rank
plt.subplot(1, 2, 1)
sns.boxplot(x='multiple_artists', y='rank', data=data_subset)

# Title
plt.title('Rank Distribution by Artist Type')

# X-axis
plt.xlabel('Multiple Artists (0 = Single, 1 = Multiple)')

# Y-axis
plt.ylabel('Rank')

# Boxplot for Popularity
plt.subplot(1, 2, 2)
sns.boxplot(x='multiple_artists', y='pop', data=data_subset)

# Title
plt.title('Popularity Distribution by Artist Type')

# X-label
plt.xlabel('Multiple Artists (0 = Single, 1 = Multiple)')

# Y-label
plt.ylabel('Popularity')

# Print box plots
plt.show()


<br>

## Question 3

Are there specific years in the dataset where popularity or rank deviates significantly?

In [None]:
# Group data by year and calculate mean values for the specified columns
grouped_by_year = final_df.groupby('year')[['pop', 'rank', 'bpm', 'energy', 'dance', 'dB', 'val', 'duration', 'acoustic', 'speech']].mean()

# Visualize popularity and rank over the years
plt.figure(figsize=(14, 6))

# Line plot for Popularity
plt.subplot(1, 2, 1)
plt.plot(grouped_by_year.index, grouped_by_year['pop'], marker='o')

# Title
plt.title('Average Popularity by Year')

# X-axis
plt.xlabel('Year')

# Y-axis
plt.ylabel('Popularity')

# True
plt.grid(True)

# Line plot for Rank
plt.subplot(1, 2, 2)
plt.plot(grouped_by_year.index, grouped_by_year['rank'], marker='o', color='orange')

# Title
plt.title('Average Rank by Year')

#X-axis
plt.xlabel('Year')

# Y-axis
plt.ylabel('Rank')
plt.grid(True)

# Show the graphs
plt.show()


<br>

## Question 4

Do songs with higher energy consistently have better popularity across genres? 

In [None]:
# Group by energy bins and genres to calculate average popularity
energy_bins_genre_popularity = final_df.groupby(['energy_bin', 'top_genre'], observed=False)['pop'].mean().reset_index()

# Plot energy bins versus popularity across genres
plt.figure(figsize=(12, 8))
sns.barplot(x='energy_bin', y='pop', hue='top_genre', data=energy_bins_genre_popularity, errorbar=None)

# Title of graph
plt.title('Energy Level and Popularity Across Genres', pad=20)

# X-axis
plt.xlabel('Energy Bins')

# Y-axis
plt.ylabel('Average Popularity')

# Lengend
plt.legend(title='Top Genre', bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the graph
plt.show()



<br>

## Question 5

Has danceability, duration, energy increased throughout the decade? 

In [None]:
# Variables of interest
columns_of_interest = ['year', 'dance', 'duration', 'energy']
data = final_df[columns_of_interest]

# Group by year and calculate mean values for danceability, duration, and energy
grouped_by_year = data.groupby('year').mean().reset_index()

# Visualize trends over the decade
plt.figure(figsize=(14, 8))

# Line plot for Duration
plt.plot(grouped_by_year['year'], grouped_by_year['duration'], marker='o', label='Duration', color='orange')

# Title
plt.title('Trends in Duration Over the Decade')

# X label
plt.xlabel('Year')

# Y label
plt.ylabel('Average Duration')

# Legend
plt.legend()
plt.grid(True)

# Show duartion graph
plt.show()

# Create a separate plot for Danceability and Energy
plt.figure(figsize=(14, 8))

# Line plot for Danceability
plt.plot(grouped_by_year['year'], grouped_by_year['dance'], marker='o', label='Danceability', color='blue')

# Line plot for Energy
plt.plot(grouped_by_year['year'], grouped_by_year['energy'], marker='o', label='Energy', color='green')

# Add titles 
plt.title('Trends in Danceability and Energy Over the Decade')

# Add labels
plt.xlabel('Year')
plt.ylabel('Average Values')
plt.legend()
plt.grid(True)

# Print graph
plt.show()

# Print the grouped data for inspection
print("Average values by year:")
print(grouped_by_year)


<br>

## Question 6

What genre or genres are considered to have the most highly ranked songs on Billboard1?

In [None]:
# Count the occurrences of each genre in the final dataset
genre_counts = final_df['top_genre'].value_counts()

# Visualize the top genres with the most highly ranked songs
plt.figure(figsize=(14, 8))
genre_counts.head(10).plot(kind='bar', color='skyblue')

# Add title
plt.title('Top Genres with Most Highly Ranked Songs on Billboard')

# Add labels
plt.xlabel('Genre')
plt.ylabel('Count')

# Make it easy to read
plt.xticks(rotation=45)

# Show graph
plt.show()

# Print the genre counts for inspection
print("Top genres by count:")
print(genre_counts)
