In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [None]:
# Load the track-level dataset. The CSV location can be overridden with the
# MUSIC_INFO_CSV environment variable; when it is unset, the original
# machine-specific path is used, so existing runs behave exactly as before.
MUSIC_INFO_CSV = os.environ.get(
    "MUSIC_INFO_CSV",
    "C:/Users/DELL/projects/Music Info/Music Info.csv",
)
music_data = pd.read_csv(MUSIC_INFO_CSV)
# Preview the first two rows to confirm the load and eyeball the columns.
music_data.head(2)

In [None]:
# Rank artists by how often they occur in the 'artist' column, i.e. "popular"
# here means "has the most rows in the dataset".
artist_counts = music_data['artist'].value_counts()
top_artists = artist_counts.head(10)
print("Top 10 Most Popular Artists:", top_artists, sep="\n")


In [None]:
# Treat every comma-separated entry of the 'tags' column as a genre label.
# Non-string cells (e.g. missing values) are skipped.
all_genres = [
    genre.strip()
    for tag_string in music_data['tags']
    if isinstance(tag_string, str)
    for genre in tag_string.split(',')
]

# Frequency table of the individual labels, most common first.
genre_counts = pd.Series(all_genres).value_counts()
print("\nTop 10 Most Popular Genres:", genre_counts.head(10), sep="\n")

In [None]:
# Bar chart of the ten most frequent tag labels across all tracks.
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.head(10).plot.bar(ax=ax)
ax.set_title('Top 10 Genres', fontweight='bold')
# Tilt the category labels so longer names do not overlap.
plt.xticks(rotation=45, ha='right');

In [None]:
# Bar chart of the ten artists with the most rows in the dataset.
fig, ax = plt.subplots(figsize=(12, 6))
artist_counts.head(10).plot.bar(ax=ax)
ax.set_title('Top 10 Artists')
# Tilt the artist names so they stay readable.
plt.xticks(rotation=45, ha='right');

In [None]:
# Music trends over time: one row per release year holding the mean of each
# audio feature listed below.
trend_features = ['danceability', 'energy', 'loudness', 'valence', 'tempo', 'acousticness']

yearly_trends = (
    music_data
    .groupby('year')
    .agg(dict.fromkeys(trend_features, 'mean'))  # {feature: 'mean'} for every feature
    .reset_index()                               # bring 'year' back as a regular column
)

In [None]:
# Audio features to chart; this list is reused by later cells.
features = ['danceability', 'energy', 'loudness', 'valence', 'tempo', 'acousticness']

# 3x2 grid with a shared x-axis: one scatter-plus-regression panel per feature.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15), sharex=True)
fig.suptitle('Evolution of Music Characteristics Over Time', fontsize=16)

# axes.flat walks the grid row by row, matching the order of `features`.
for panel, feature in zip(axes.flat, features):
    pretty_name = feature.capitalize()
    sns.regplot(data=yearly_trends, x='year', y=feature, ax=panel)
    panel.set_title(f'Average {pretty_name} Over Time')
    panel.set_xlabel('Year')
    panel.set_ylabel(pretty_name)

In [None]:
# Analyze genre distribution
# Bucket each track's year into its decade (e.g. 1994 -> 1990).
music_data['decade'] = (music_data['year'] // 10) * 10

# Rows are decades, columns are genres, values are track counts.
genre_by_decade = music_data.groupby('decade')['genre'].value_counts().unstack()

# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# previously left the 15x8 figure empty and drew the chart at the default size.
# Creating the Axes first and passing it in keeps everything on one figure.
fig, ax = plt.subplots(figsize=(15, 8))
genre_by_decade.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Genre Distribution by Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Number of Songs')
# Anchor the legend just outside the right edge so it does not cover the bars.
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1));

In [None]:
# Song duration trend: per-year mean and standard deviation of 'duration_ms',
# converted from milliseconds to minutes for display.
MS_PER_MINUTE = 1000 * 60

yearly_duration = (
    music_data
    .groupby('year')['duration_ms']
    .agg(['mean', 'std'])
    .reset_index()
)
yearly_duration['mean_minutes'] = yearly_duration['mean'] / MS_PER_MINUTE

years = yearly_duration['year']
# One-standard-deviation band around the yearly mean, in minutes.
band_low = (yearly_duration['mean'] - yearly_duration['std']) / MS_PER_MINUTE
band_high = (yearly_duration['mean'] + yearly_duration['std']) / MS_PER_MINUTE

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(years, yearly_duration['mean_minutes'])
ax.fill_between(years, band_low, band_high, alpha=0.2)
ax.set_title('Average Song Duration Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Duration (minutes)')
ax.grid(True, alpha=0.5)


In [None]:
# Columns that enter the correlation analysis.
selected_columns = [
    'duration_ms', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence',
    'tempo', 'time_signature',
]

# Restrict the frame to those columns and compute their pairwise correlations.
music_info = music_data[selected_columns]
correlation_matrix = music_info.corr()

# Annotated heatmap; each cell shows the coefficient with two decimals.
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Music Features', fontweight='bold')
plt.xticks(rotation=45, ha='right');

In [None]:
# Load the user listening history. The CSV location can be overridden with the
# LISTENING_HISTORY_CSV environment variable; when it is unset, the original
# machine-specific path is used, so existing runs behave exactly as before.
LISTENING_HISTORY_CSV = os.environ.get(
    "LISTENING_HISTORY_CSV",
    "C:/Users/DELL/projects/User Listening History/User Listening History.csv",
)
listening_history = pd.read_csv(LISTENING_HISTORY_CSV)

# Preview the first rows to confirm the load.
listening_history.head()


In [None]:
# Number of listening-history rows per track_id.
# NOTE(review): this counts rows, not plays. If the history file carries a
# per-row play-count column, the value is "number of rows/listeners" rather
# than total plays. TODO confirm against the file's schema.
play_counts = listening_history["track_id"].value_counts()

# Attach the count to each track; tracks absent from the history get 0.
rows_per_track = music_data["track_id"].map(play_counts)
music_data["popularity"] = rows_per_track.fillna(0)


In [None]:
# "Popular" = popularity at or above the 75th percentile of all tracks.
# NOTE(review): if that percentile is 0 (most tracks never appear in the
# history), the >= comparison selects every track.
popularity_threshold = music_data["popularity"].quantile(0.75)
is_popular = music_data["popularity"] >= popularity_threshold
popular_songs = music_data[is_popular]


In [None]:
# Flatten the comma-separated tags of the popular songs into one list of
# stripped, lower-cased tags. Non-string cells (e.g. missing values) are skipped.
popular_tags = [
    tag.strip().lower()
    for tag_string in popular_songs["tags"]
    if isinstance(tag_string, str)
    for tag in tag_string.split(",")
]

# Occurrences of each tag among popular songs, most common first.
tag_frequency = pd.Series(popular_tags).value_counts()

In [None]:
# Bar chart of the 20 most frequent tags among the popular songs.
fig, ax = plt.subplots(figsize=(15, 8))
tag_frequency.head(20).plot.bar(ax=ax)
ax.set_title("Top 20 Keywords In Popular Songs", fontweight="bold")
ax.set_xlabel("Keywords")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45, ha="right");

In [None]:
# Print a heading, then let the notebook display the 20 most frequent tags.
top_keywords = tag_frequency.head(20)
print("Top 20 Keywords in Popular Songs: ")
top_keywords

In [None]:
# The five most frequent tags among popular songs (already stripped and lower-cased).
top_tags = list(tag_frequency.head(5).index)

# Normalise every track's tags the same way the frequency table was built
# (split on commas, strip, lower-case) so each tag is matched as a whole tag.
# A substring/regex test on the raw string would also count a tag found
# inside a longer one (e.g. "rock" inside "hard rock"), would miss tags that
# differ only in case, and would treat characters like "+" or "(" as regex syntax.
track_tag_sets = [
    {t.strip().lower() for t in tags.split(",")} if isinstance(tags, str) else set()
    for tags in music_data["tags"]
]

print("Average Audio Feature for Songs with Top Tags:")
for tag in top_tags:
    # Boolean mask aligned to music_data: True where the track carries this exact tag.
    has_tag = pd.Series(
        [tag in tag_set for tag_set in track_tag_sets],
        index=music_data.index,
        dtype=bool,
    )
    songs_with_tags = music_data[has_tag]

    print(f"\nTag: {tag}")
    # `features` is the audio-feature list defined in the trend-plot cell.
    for feat in features:
        avg_value = songs_with_tags[feat].mean()
        print(f"{feat}: {avg_value:.3f}")

In [None]:
# Count how often each unordered pair of tags appears together on a popular song.
tag_pairs = []
for tag_string in popular_songs['tags']:
    if not isinstance(tag_string, str):
        continue  # skip missing / non-text tag cells
    tag_list = [tag.strip().lower() for tag in tag_string.split(',')]
    for position, first in enumerate(tag_list):
        for second in tag_list[position + 1:]:
            # Sort so (a, b) and (b, a) are tallied as the same pair.
            tag_pairs.append(tuple(sorted((first, second))))

pair_freq = pd.Series(tag_pairs).value_counts()

print("\nMost Common Tag Combinations in Popular Songs:", pair_freq.head(10), sep="\n")

# Heatmap of the pairwise correlations between audio features of popular songs.
# `features` is the audio-feature list defined in the trend-plot cell.
fig, ax = plt.subplots(figsize=(10, 5))
correlation = popular_songs[features].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation of Audio Features in Popular Songs');

In [None]:
# NOTE(review): this cell and the ones after it repeat the earlier analysis
# cells. Reloading here discards the 'decade' and 'popularity' columns added
# above until the repeated cells recompute them.
# The CSV location can be overridden with the MUSIC_INFO_CSV environment
# variable; when it is unset, the original machine-specific path is used.
MUSIC_INFO_CSV = os.environ.get(
    "MUSIC_INFO_CSV",
    "C:/Users/DELL/projects/Music Info/Music Info.csv",
)
music_data = pd.read_csv(MUSIC_INFO_CSV)
# Preview the first two rows to confirm the load and eyeball the columns.
music_data.head(2)

In [None]:
# Rank artists by how often they occur in the 'artist' column, i.e. "popular"
# here means "has the most rows in the dataset".
artist_counts = music_data['artist'].value_counts()
top_artists = artist_counts.head(10)
print("Top 10 Most Popular Artists:", top_artists, sep="\n")


In [None]:
# Treat every comma-separated entry of the 'tags' column as a genre label.
# Non-string cells (e.g. missing values) are skipped.
all_genres = [
    genre.strip()
    for tag_string in music_data['tags']
    if isinstance(tag_string, str)
    for genre in tag_string.split(',')
]

# Frequency table of the individual labels, most common first.
genre_counts = pd.Series(all_genres).value_counts()
print("\nTop 10 Most Popular Genres:", genre_counts.head(10), sep="\n")

In [None]:
# Bar chart of the ten most frequent tag labels across all tracks.
fig, ax = plt.subplots(figsize=(12, 6))
genre_counts.head(10).plot.bar(ax=ax)
ax.set_title('Top 10 Genres', fontweight='bold')
# Tilt the category labels so longer names do not overlap.
plt.xticks(rotation=45, ha='right');

In [None]:
# Bar chart of the ten artists with the most rows in the dataset.
fig, ax = plt.subplots(figsize=(12, 6))
artist_counts.head(10).plot.bar(ax=ax)
ax.set_title('Top 10 Artists')
# Tilt the artist names so they stay readable.
plt.xticks(rotation=45, ha='right');

In [None]:
# Music trends over time: one row per release year holding the mean of each
# audio feature listed below.
trend_features = ['danceability', 'energy', 'loudness', 'valence', 'tempo', 'acousticness']

yearly_trends = (
    music_data
    .groupby('year')
    .agg(dict.fromkeys(trend_features, 'mean'))  # {feature: 'mean'} for every feature
    .reset_index()                               # bring 'year' back as a regular column
)

In [None]:
# Audio features to chart; this list is reused by later cells.
features = ['danceability', 'energy', 'loudness', 'valence', 'tempo', 'acousticness']

# 3x2 grid with a shared x-axis: one scatter-plus-regression panel per feature.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 15), sharex=True)
fig.suptitle('Evolution of Music Characteristics Over Time', fontsize=16)

# axes.flat walks the grid row by row, matching the order of `features`.
for panel, feature in zip(axes.flat, features):
    pretty_name = feature.capitalize()
    sns.regplot(data=yearly_trends, x='year', y=feature, ax=panel)
    panel.set_title(f'Average {pretty_name} Over Time')
    panel.set_xlabel('Year')
    panel.set_ylabel(pretty_name)

In [None]:
# Analyze genre distribution
# Bucket each track's year into its decade (e.g. 1994 -> 1990).
music_data['decade'] = (music_data['year'] // 10) * 10

# Rows are decades, columns are genres, values are track counts.
genre_by_decade = music_data.groupby('decade')['genre'].value_counts().unstack()

# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# previously left the 15x8 figure empty and drew the chart at the default size.
# Creating the Axes first and passing it in keeps everything on one figure.
fig, ax = plt.subplots(figsize=(15, 8))
genre_by_decade.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Genre Distribution by Decade')
ax.set_xlabel('Decade')
ax.set_ylabel('Number of Songs')
# Anchor the legend just outside the right edge so it does not cover the bars.
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1));

In [None]:
# Song duration trend: per-year mean and standard deviation of 'duration_ms',
# converted from milliseconds to minutes for display.
MS_PER_MINUTE = 1000 * 60

yearly_duration = (
    music_data
    .groupby('year')['duration_ms']
    .agg(['mean', 'std'])
    .reset_index()
)
yearly_duration['mean_minutes'] = yearly_duration['mean'] / MS_PER_MINUTE

years = yearly_duration['year']
# One-standard-deviation band around the yearly mean, in minutes.
band_low = (yearly_duration['mean'] - yearly_duration['std']) / MS_PER_MINUTE
band_high = (yearly_duration['mean'] + yearly_duration['std']) / MS_PER_MINUTE

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(years, yearly_duration['mean_minutes'])
ax.fill_between(years, band_low, band_high, alpha=0.2)
ax.set_title('Average Song Duration Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Duration (minutes)')
ax.grid(True, alpha=0.5)


In [None]:
# Columns that enter the correlation analysis.
selected_columns = [
    'duration_ms', 'danceability', 'energy', 'key',
    'loudness', 'mode', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence',
    'tempo', 'time_signature',
]

# Restrict the frame to those columns and compute their pairwise correlations.
music_info = music_data[selected_columns]
correlation_matrix = music_info.corr()

# Annotated heatmap; each cell shows the coefficient with two decimals.
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Music Features', fontweight='bold')
plt.xticks(rotation=45, ha='right');

In [None]:
# Load the user listening history. The CSV location can be overridden with the
# LISTENING_HISTORY_CSV environment variable; when it is unset, the original
# machine-specific path is used, so existing runs behave exactly as before.
LISTENING_HISTORY_CSV = os.environ.get(
    "LISTENING_HISTORY_CSV",
    "C:/Users/DELL/projects/User Listening History/User Listening History.csv",
)
listening_history = pd.read_csv(LISTENING_HISTORY_CSV)

# Preview the first rows to confirm the load.
listening_history.head()


In [None]:
# Number of listening-history rows per track_id.
# NOTE(review): this counts rows, not plays. If the history file carries a
# per-row play-count column, the value is "number of rows/listeners" rather
# than total plays. TODO confirm against the file's schema.
play_counts = listening_history["track_id"].value_counts()

# Attach the count to each track; tracks absent from the history get 0.
rows_per_track = music_data["track_id"].map(play_counts)
music_data["popularity"] = rows_per_track.fillna(0)


In [None]:
# "Popular" = popularity at or above the 75th percentile of all tracks.
# NOTE(review): if that percentile is 0 (most tracks never appear in the
# history), the >= comparison selects every track.
popularity_threshold = music_data["popularity"].quantile(0.75)
is_popular = music_data["popularity"] >= popularity_threshold
popular_songs = music_data[is_popular]


In [None]:
# Flatten the comma-separated tags of the popular songs into one list of
# stripped, lower-cased tags. Non-string cells (e.g. missing values) are skipped.
popular_tags = [
    tag.strip().lower()
    for tag_string in popular_songs["tags"]
    if isinstance(tag_string, str)
    for tag in tag_string.split(",")
]

# Occurrences of each tag among popular songs, most common first.
tag_frequency = pd.Series(popular_tags).value_counts()

In [None]:
# Bar chart of the 20 most frequent tags among the popular songs.
fig, ax = plt.subplots(figsize=(15, 8))
tag_frequency.head(20).plot.bar(ax=ax)
ax.set_title("Top 20 Keywords In Popular Songs", fontweight="bold")
ax.set_xlabel("Keywords")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45, ha="right");

In [None]:
# Print a heading, then let the notebook display the 20 most frequent tags.
top_keywords = tag_frequency.head(20)
print("Top 20 Keywords in Popular Songs: ")
top_keywords

In [None]:
# The five most frequent tags among popular songs (already stripped and lower-cased).
top_tags = list(tag_frequency.head(5).index)

# Normalise every track's tags the same way the frequency table was built
# (split on commas, strip, lower-case) so each tag is matched as a whole tag.
# A substring/regex test on the raw string would also count a tag found
# inside a longer one (e.g. "rock" inside "hard rock"), would miss tags that
# differ only in case, and would treat characters like "+" or "(" as regex syntax.
track_tag_sets = [
    {t.strip().lower() for t in tags.split(",")} if isinstance(tags, str) else set()
    for tags in music_data["tags"]
]

print("Average Audio Feature for Songs with Top Tags:")
for tag in top_tags:
    # Boolean mask aligned to music_data: True where the track carries this exact tag.
    has_tag = pd.Series(
        [tag in tag_set for tag_set in track_tag_sets],
        index=music_data.index,
        dtype=bool,
    )
    songs_with_tags = music_data[has_tag]

    print(f"\nTag: {tag}")
    # `features` is the audio-feature list defined in the trend-plot cell.
    for feat in features:
        avg_value = songs_with_tags[feat].mean()
        print(f"{feat}: {avg_value:.3f}")

In [None]:
# Count how often each unordered pair of tags appears together on a popular song.
tag_pairs = []
for tag_string in popular_songs['tags']:
    if not isinstance(tag_string, str):
        continue  # skip missing / non-text tag cells
    tag_list = [tag.strip().lower() for tag in tag_string.split(',')]
    for position, first in enumerate(tag_list):
        for second in tag_list[position + 1:]:
            # Sort so (a, b) and (b, a) are tallied as the same pair.
            tag_pairs.append(tuple(sorted((first, second))))

pair_freq = pd.Series(tag_pairs).value_counts()

print("\nMost Common Tag Combinations in Popular Songs:", pair_freq.head(10), sep="\n")

# Heatmap of the pairwise correlations between audio features of popular songs.
# `features` is the audio-feature list defined in the trend-plot cell.
fig, ax = plt.subplots(figsize=(10, 5))
correlation = popular_songs[features].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation of Audio Features in Popular Songs');