# EDA on Streaming History
### General EDA
#### 1. Import Libraries and Load Cleaned Data

In [None]:
# import libraries
import pandas as pd

# Read the two cleaned streaming-history CSV exports into DataFrames.
# Paths are relative to the directory the notebook is run from.
music_csv_path = './Cleaned_Data/Music_Streaming_History.csv'
podcast_csv_path = './Cleaned_Data/Podcast_Streaming_History.csv'

music_tracks_df = pd.read_csv(music_csv_path)
podcast_episodes_df = pd.read_csv(podcast_csv_path)

#### 2. Understand data structure and content

In [None]:
# Preview the first few rows of each dataset.
print(music_tracks_df.head())
print(podcast_episodes_df.head())

# Structural and statistical summary of each dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.

# music tracks
music_tracks_df.info()
print(music_tracks_df.describe())

# podcast episodes
podcast_episodes_df.info()
print(podcast_episodes_df.describe())

#### 3. Check for nulls

In [None]:
# Count missing values per column in the music data.
print(music_tracks_df.isnull().sum())

# Count missing values per column in the podcast data.
print(podcast_episodes_df.isnull().sum())

# Finding: the null values are associated with `false` in the `offline` column,
# so they are treated as valid rather than as a data-quality problem.
# (Written as comments on purpose: a bare triple-quoted string at the end of a
# cell is an expression, and Jupyter echoes it back as the cell's output.)

#### 4. Identify Outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Uses music_tracks_df / podcast_episodes_df as loaded in section 1; the CSVs
# are not re-read here because nothing in between modifies those frames.


def plot_minutes_played_boxplot(streams_df, title, xlabel='Minutes Played'):
    """Show a boxplot of the `minutes_played` column of a streaming dataset.

    Parameters
    ----------
    streams_df : pandas.DataFrame
        Frame containing a `minutes_played` column.
    title : str
        Figure title.
    xlabel : str, optional
        Label for the x axis (the minutes-played axis).
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    # There is a single box, so a palette has nothing to map onto; passing
    # `palette` without `hue` is deprecated in seaborn 0.13. Taking the one
    # colour of a 1-colour 'Blues_r' palette keeps the same shade of blue.
    box_color = sns.color_palette('Blues_r', 1)[0]
    sns.boxplot(x=streams_df['minutes_played'], color=box_color, ax=ax)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel(xlabel)
    plt.show()


# boxplot for minutes_played in the music streaming dataset
plot_minutes_played_boxplot(music_tracks_df, 'Boxplot of Minutes Played (Music)')

# boxplot for minutes_played in the podcast streaming dataset
plot_minutes_played_boxplot(podcast_episodes_df, 'Boxplot of Minutes Played (Podcast)')


![music_boxplot](/Images/music_boxplot.png)
![podcast_boxplot](/Images/podcast_boxplot.png)  

#### Investigate Outliers

In [None]:
# Select (keep) the music plays whose minutes_played exceeds 10 and list the
# columns useful for judging whether they are genuine long plays.
long_music_plays = music_tracks_df['minutes_played'] > 10
music_outlier_columns = ['track_name', 'artist_name', 'minutes_played', 'reason_start']
outliers = music_tracks_df.loc[long_music_plays]
print(outliers[music_outlier_columns])

# Same idea for podcasts, with a threshold of 50 minutes.
# Note: `outliers` is rebound here, so after this cell it holds the podcast rows.
long_podcast_plays = podcast_episodes_df['minutes_played'] > 50
podcast_outlier_columns = ['show_name', 'minutes_played']
outliers = podcast_episodes_df.loc[long_podcast_plays]
print(outliers[podcast_outlier_columns])

### EDA with Visuals
#### a. Visualize top 20 most played music artists & podcast shows

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Use a font that can render Chinese characters in artist / show names.
# 'Microsoft YaHei' stays first, so Windows output is unchanged. The remaining
# entries are fallbacks for systems without it (presumably macOS / Linux CJK
# fonts; adjust to what is installed), ending with matplotlib's bundled
# default so Latin text always renders.
plt.rcParams['font.sans-serif'] = [
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'SimHei',
    'DejaVu Sans',
]


def plot_top_played(names, title, top_n=20):
    """Plot a horizontal bar chart of the most frequent values in `names`.

    Parameters
    ----------
    names : pandas.Series
        One entry per play (e.g. the artist or show name of each stream).
    title : str
        Figure title.
    top_n : int, optional
        How many of the most frequent values to show.

    Returns
    -------
    pandas.Series
        Play counts of the `top_n` most frequent values, most played first.
    """
    play_counts = names.value_counts().head(top_n)
    # Same explicit size and tight_layout for every chart so long labels fit.
    fig, ax = plt.subplots(figsize=(10, 8))
    # NOTE(review): seaborn >= 0.13 emits a FutureWarning for `palette` without
    # `hue`; the call is left as-is so it keeps working on older seaborn too.
    sns.barplot(x=play_counts.values, y=play_counts.index, palette='Blues_r', ax=ax)
    ax.set_title(title, fontsize=20)
    ax.set_xlabel('Number of Plays')
    ax.set_ylabel('')  # names on the y axis are self-explanatory
    fig.tight_layout()
    plt.show()
    return play_counts


# visualize top 20 most played artists
top_artists = plot_top_played(music_tracks_df['artist_name'], 'Top 20 Most Played Artists')

# visualize top 20 most played shows (podcast)
top_show = plot_top_played(podcast_episodes_df['show_name'], 'Top 20 Most Played Shows')

![top_20_most_played_artists](/Images/top_20_played_artists.png)
![top_20_most_played_podcasts](/Images/top_20_played_shows.png)

#### b. Visualize Listening Trends Over Time

In [None]:
def plot_minutes_over_time(streams_df, title):
    """Plot total minutes played per `month_year` as a line chart.

    Parameters
    ----------
    streams_df : pandas.DataFrame
        Frame containing `month_year` and `minutes_played` columns.
    title : str
        Figure title.

    Returns
    -------
    pandas.Series
        Total `minutes_played` indexed by `month_year`.
    """
    # Select the column BEFORE aggregating. `.groupby(...).sum()[col]` sums
    # every column first, which on text columns is wasted work at best and,
    # depending on the pandas version and column contents, can raise.
    minutes_by_month = streams_df.groupby('month_year')['minutes_played'].sum()
    # NOTE(review): groupby sorts by the `month_year` values; if that column is
    # text not formatted like 'YYYY-MM', the x axis will not be chronological.
    # TODO confirm the format produced by the cleaning step.
    ax = minutes_by_month.plot(figsize=(10, 8))
    ax.set_title(title, fontsize=20)
    ax.set_xlabel('')  # drop the automatic 'month_year' axis label
    ax.set_ylabel('Total Minutes Played')
    ax.grid(True)
    plt.show()
    return minutes_by_month


# visualize music listening trends over time
music_month_year_trend = plot_minutes_over_time(
    music_tracks_df, 'Total Minutes Played Over Time (Music)'
)

# visualize podcast listening trends over time
podcast_month_year_trend = plot_minutes_over_time(
    podcast_episodes_df, 'Total Minutes Played Over Time (Podcast)'
)

![music_over_time](/Images/Music_Played_Over_Time.png)
![podcast_over_time](/Images/Podcast_Played_Over_Time.png)

test test