# Exploratory Data Analysis

### EDA with Data Summary
#### 1. Import libraries and load cleaned data

In [1]:
# import libraries
import pandas as pd

# read both cleaned streaming-history CSV exports into DataFrames
# (music tracks first, podcast episodes second)
music_tracks_df, podcast_episodes_df = map(
    pd.read_csv,
    (
        './Cleaned_Data/Music_Streaming_History.csv',
        './Cleaned_Data/Podcast_Streaming_History.csv',
    ),
)

#### 2. Understand data structure and content

In [None]:
# preview the first few rows of each dataset
print(music_tracks_df.head())
print(podcast_episodes_df.head())


# check column types, null counts and summary stats of numeric columns
# NOTE: DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.

# music tracks
music_tracks_df.info()
print(music_tracks_df.describe())

# podcast episodes
podcast_episodes_df.info()
print(podcast_episodes_df.describe())

#### 3. Check for nulls

In [None]:
# count null values per column in the music data
print(music_tracks_df.isnull().sum())

# count null values per column in the podcast data
print(podcast_episodes_df.isnull().sum())

# Finding (written as comments rather than a bare string literal, which the
# notebook would echo back as the cell's output):
# The null values are associated with `false` in the `offline` column.
# Thus the null values are valid.

### EDA with Visuals
#### a. Visualize top 20 most played music artists & podcast shows

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Prefer Microsoft YaHei so Chinese characters render, but keep the previously
# configured sans-serif fonts as fallbacks so the charts still draw on machines
# where that font is not installed.
CJK_FONT = 'Microsoft YaHei'
rcParams['font.sans-serif'] = [CJK_FONT] + [
    font for font in rcParams['font.sans-serif'] if font != CJK_FONT
]


def plot_top_counts(counts, title, ylabel):
    """Draw a horizontal bar chart of play counts, one bar per label.

    Parameters
    ----------
    counts : pandas.Series
        Counts indexed by label, e.g. the result of `value_counts().head(20)`.
        Bars are drawn in the order the Series is given.
    title : str
        Chart title.
    ylabel : str
        Label for the categorical (vertical) axis.
    """
    fig, ax = plt.subplots()
    # Assign the labels to `hue` as well: passing `palette` without `hue` is
    # deprecated in seaborn 0.13. `dodge=False` keeps full-width bars on
    # older seaborn versions.
    sns.barplot(
        x=counts.values,
        y=counts.index,
        hue=counts.index,
        palette='Blues_r',
        dodge=False,
        ax=ax,
    )
    # The hue legend would only repeat the axis labels; drop it if one was drawn.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set(title=title, xlabel='Number of Plays', ylabel=ylabel)
    plt.show()


# visualize top 20 most played artists
top_artists = music_tracks_df['artist_name'].value_counts().head(20)
plot_top_counts(top_artists, 'Top 20 Most Played Artists', 'Artist')

# visualize top 20 most played shows (podcast)
top_show = podcast_episodes_df['show_name'].value_counts().head(20)
plot_top_counts(top_show, 'Top 20 Most Played Shows', 'Show')

![music_top_20_artist](/Images/Music_Top_20_Artists.png)
![podcast_top_20_shows](/Images/Podcast_Top_20_Shows.png)