In [1]:
import pandas as pd
import pprint
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', 150)

In [2]:
stream_final_df = pd.read_csv('data/spotify_streaming_cleaned.csv')

In [3]:
stream_final_df.groupby('mountain_hour').agg({
    'ms_played': 'sum', 
    'ts': 'count',
    'master_metadata_album_artist_name': 'first'
}).sort_values(by='ms_played', ascending=False)

Unnamed: 0_level_0,ms_played,ts,master_metadata_album_artist_name
mountain_hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14,714122377,7591,Frank Sinatra
15,671211453,7890,Hudson Thames
17,643587067,8721,Birdy
16,613690078,7530,Victorious Cast
13,587875991,6124,Dame Dolla
18,545862600,7639,Cher Lloyd
12,492109627,4968,Hudson Thames
19,448448725,6718,Saint Motel
21,446635059,7339,Nikolai Medtner
20,426287584,6023,Hudson Thames


In [4]:
len(stream_final_df["spotify_track_uri"].unique())

30706

In [5]:
track_uri_df = stream_final_df.groupby('spotify_track_uri').agg({
    'master_metadata_track_name' : 'first',
    'master_metadata_album_artist_name' : 'first'
})

In [6]:
track_uri_df.reset_index()

track_uri_df.to_csv("data/unique_songs.csv")

In [7]:
stream_final_df.loc[stream_final_df["ms_played"] < 30000]

Unnamed: 0.1,Unnamed: 0,ts,ms_played,conn_country,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,reason_start,reason_end,shuffle,skipped,timestamp,mountain_time,mountain_hour,year,month
0,0,2016-01-08T16:57:23Z,14257,US,Something New,Secret Weapons,Something New,spotify:track:3Tg7AXyFD0frtV3CAVjyZ3,fwdbtn,endplay,True,True,2016-01-08 16:57:23+00:00,2016-01-08 09:57:23-07:00,9,2016,1
2,2,2016-01-08T17:01:09Z,994,US,Survivors,Selena Gomez,Revival,spotify:track:6mEFvpqnzElavdz3gfP2XT,trackdone,endplay,True,True,2016-01-08 17:01:09+00:00,2016-01-08 10:01:09-07:00,10,2016,1
3,3,2016-01-08T17:01:11Z,2461,US,Without Me,Eminem,The Eminem Show,spotify:track:7lQ8MOhq6IN2w8EYcFNSUk,clickrow,fwdbtn,True,True,2016-01-08 17:01:11+00:00,2016-01-08 10:01:11-07:00,10,2016,1
4,4,2016-01-08T17:01:12Z,1044,US,Magic,San Cisco,Gracetown,spotify:track:2madhIZgRsvGBGyjSOnlMl,fwdbtn,fwdbtn,True,True,2016-01-08 17:01:12+00:00,2016-01-08 10:01:12-07:00,10,2016,1
6,6,2016-01-08T17:03:31Z,1834,US,Traum,CRO,Traum,spotify:track:7jIRQUJbgJSakbO87fZihr,fwdbtn,fwdbtn,True,True,2016-01-08 17:03:31+00:00,2016-01-08 10:03:31-07:00,10,2016,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111517,111626,2025-03-10T17:19:20Z,1834,US,Gangsta Kid,SHY FX,"Jungle Hits, Vol. 1",spotify:track:6bFSjdbVlm4FQeqUO6hwGe,playbtn,fwdbtn,False,True,2025-03-10 17:19:20+00:00,2025-03-10 11:19:20-06:00,11,2025,3
111521,111630,2025-03-10T17:27:08Z,7337,US,Jungle Buddy Bye,Johnny Osbourne,Ragga Jungle Anthems Vol. Two,spotify:track:6Hz7YEzHb3FrXpqadKYb3E,fwdbtn,fwdbtn,True,True,2025-03-10 17:27:08+00:00,2025-03-10 11:27:08-06:00,11,2025,3
111522,111631,2025-03-10T17:27:10Z,2530,US,It's Jazzy,Roni Size,V Classic,spotify:track:7wVqeFQqdQ8V6wNjbFKYIv,fwdbtn,fwdbtn,True,True,2025-03-10 17:27:10+00:00,2025-03-10 11:27:10-06:00,11,2025,3
111523,111632,2025-03-10T17:27:11Z,371,US,Under Mi Sensi (Jungle Spliff) X Project Remix,Barrington Levy,Ragga Jungle Anthems Vol. One,spotify:track:0rFGpzZJM4gveQPlusXtGX,fwdbtn,backbtn,True,True,2025-03-10 17:27:11+00:00,2025-03-10 11:27:11-06:00,11,2025,3


In [9]:
stream_final_df.dtypes

Unnamed: 0                            int64
ts                                   object
ms_played                             int64
conn_country                         object
master_metadata_track_name           object
master_metadata_album_artist_name    object
master_metadata_album_album_name     object
spotify_track_uri                    object
reason_start                         object
reason_end                           object
shuffle                                bool
skipped                                bool
timestamp                            object
mountain_time                        object
mountain_hour                         int64
year                                  int64
month                                 int64
dtype: object

In [14]:
# Convert mountain_time from object to datetime
stream_final_df['mountain_time'] = pd.to_datetime(stream_final_df['mountain_time'], errors='coerce')

# Check if any values couldn't be converted (will be NaT)
missing_dates = stream_final_df['mountain_time'].isna().sum()
print(f"Number of rows with invalid dates: {missing_dates}")

# If there are invalid dates, you might want to examine them
if missing_dates > 0:
    invalid_dates = stream_final_df.loc[stream_final_df['mountain_time'].isna(), 'mountain_time']
    print("Sample of invalid date strings:")
    print(invalid_dates.head())

# Once converted, create the day of week columns
stream_final_df['day_of_week'] = stream_final_df['mountain_time'].dt.dayofweek

day_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
stream_final_df['day_name'] = stream_final_df['day_of_week'].map(day_map)

Number of rows with invalid dates: 73370
Sample of invalid date strings:
792   NaT
793   NaT
794   NaT
795   NaT
796   NaT
Name: mountain_time, dtype: datetime64[ns, UTC-07:00]


In [15]:
stream_final_df

Unnamed: 0.1,Unnamed: 0,ts,ms_played,conn_country,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,reason_start,reason_end,shuffle,skipped,timestamp,mountain_time,mountain_hour,year,month,day_of_week,day_name
0,0,2016-01-08T16:57:23Z,14257,US,Something New,Secret Weapons,Something New,spotify:track:3Tg7AXyFD0frtV3CAVjyZ3,fwdbtn,endplay,True,True,2016-01-08 16:57:23+00:00,2016-01-08 09:57:23-07:00,9,2016,1,4.0,Friday
1,1,2016-01-08T17:01:07Z,229080,US,Same Old Love,Selena Gomez,Revival,spotify:track:1BZG99C7Co1r6QUC3zaS59,clickrow,trackdone,True,False,2016-01-08 17:01:07+00:00,2016-01-08 10:01:07-07:00,10,2016,1,4.0,Friday
2,2,2016-01-08T17:01:09Z,994,US,Survivors,Selena Gomez,Revival,spotify:track:6mEFvpqnzElavdz3gfP2XT,trackdone,endplay,True,True,2016-01-08 17:01:09+00:00,2016-01-08 10:01:09-07:00,10,2016,1,4.0,Friday
3,3,2016-01-08T17:01:11Z,2461,US,Without Me,Eminem,The Eminem Show,spotify:track:7lQ8MOhq6IN2w8EYcFNSUk,clickrow,fwdbtn,True,True,2016-01-08 17:01:11+00:00,2016-01-08 10:01:11-07:00,10,2016,1,4.0,Friday
4,4,2016-01-08T17:01:12Z,1044,US,Magic,San Cisco,Gracetown,spotify:track:2madhIZgRsvGBGyjSOnlMl,fwdbtn,fwdbtn,True,True,2016-01-08 17:01:12+00:00,2016-01-08 10:01:12-07:00,10,2016,1,4.0,Friday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111535,111644,2025-03-11T17:40:18Z,247640,US,The Burial,Leviticus,"Jungle Hits, Vol. 1",spotify:track:0BIAlQguOM4MSEDUjqOqkc,trackdone,trackdone,True,False,2025-03-11 17:40:18+00:00,NaT,11,2025,3,,
111536,111645,2025-03-11T17:45:03Z,283211,US,Here I Come (feat. Mega Banton),Barrington Levy,Ragga Jungle Anthems Vol. Two,spotify:track:2kIZcL8XslmgWISM8zxa93,trackdone,fwdbtn,True,True,2025-03-11 17:45:03+00:00,NaT,11,2025,3,,
111537,111646,2025-03-11T17:45:16Z,12933,US,The Angels Fell - 2015 Remaster,Dillinja,The Angels Fell / Ja Know Ya Big / Brutal Bass...,spotify:track:4PVzD2FbJuMi5TgKNEVURw,fwdbtn,endplay,True,True,2025-03-11 17:45:16+00:00,NaT,11,2025,3,,
111538,111647,2025-03-11T17:52:33Z,437152,US,So Long,Seba,Producer 06,spotify:track:6ChYON7kJVT2RNtKmQZaJL,playbtn,trackdone,False,False,2025-03-11 17:52:33+00:00,NaT,11,2025,3,,


In [None]:
# Group by day of week and hour, count plays
heatmap_data = stream_final_df.groupby(['day_name', 'mountain_hour']).size().reset_index(name='play_count')

# Pivot the data to create a matrix
pivot_data = heatmap_data.pivot(index='day_name', columns='mountain_hour', values='play_count')

# Reorder the days (optional, to get Monday at the top or Sunday at the top)
# For Monday at top:
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot_data = pivot_data.reindex(day_order)

# Create the heatmap
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 8))
sns.heatmap(pivot_data, cmap='Blues', annot=True, fmt='g')
plt.title('Spotify Listening Patterns by Day of Week and Hour')
plt.yticks(rotation=45)
plt.yticks(rotation=45, ha='right')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.show()

In [None]:
reasons_df = stream_final_df.groupby(["reason_end"]).agg({
    'ts': 'count'
    
    }
).reset_index().sort_values(by="ts")



In [None]:
reasons_df

In [None]:
stream_final_df["reason_start"].unique()

In [None]:
streamed_year_df.to_csv("tracks_by_time_and_streams.csv")

In [None]:
streamed_year_df.sort_values(by='ms_played', ascending=False)

In [None]:
import plotly.express as px

streamed_df_sorted = streamed_year_df.sort_values(by='ms_played', ascending=False)

def truncate_text(text, max_length=20):
    if isinstance(text, str) and len(text) > max_length:
        return text[:max_length] + '...'
    return text

# Apply truncation to track names
streamed_df_sorted['truncated_track_names'] = streamed_df_sorted['master_metadata_track_name'].apply(
    lambda x: truncate_text(x, max_length=20)
)

streamed_df_sorted['hours_played'] = streamed_df_sorted['ms_played'] / (1000 * 60 * 60)
streamed_df_sorted['avg_time_per_play'] = (streamed_df_sorted['ms_played'] / streamed_df_sorted['ts']) / (1000 * 60)

streamed_df_sorted.head(20)
new_fig = px.bar(
    streamed_df_sorted.head(20), 
    x='truncated_track_names', 
    y='hours_played',
    custom_data=['master_metadata_track_name', 'master_metadata_album_artist_name', 'hours_played', 'avg_time_per_play']
)
new_fig.update_traces(
    hovertemplate='<b>%{customdata[0]}</b><br>' +
                  'Artist: %{customdata[1]}<br>' +
                  'Total play time: %{y:.2f} hours<br>' +  
                  'Avg time per play: %{customdata[3]:.2f} minutes<extra></extra>'
)

new_fig.update_layout(
    yaxis_title="Hours played",
    xaxis=dict(
        title="Track Name",
        tickangle=45,
        tickfont=dict(size=10)
    ),
    margin=dict(b=100)
)


new_fig.show()


In [None]:
stream_final_df.groupby(["master_metadata_track_name"])[["ms_played"]].sum().sort_values(by="ms_played", ascending=False)


In [None]:
streamed_final_gb_df = streamed_final_df.groupby(['master_metadata_track_name', 'master_metadata_album_artist_name']).agg({   
    'ms_played': 'sum', 
    'ts': 'count',
    #'master_metadata_track_name': 'count'
})

streamed_final_gb_df = streamed_final_gb_df.reset_index()




In [None]:
streamed_final_gb_df = streamed_final_gb_df.sort_values(by="ms_played", ascending=False)

streamed_final_gb_df

# Questions

### 1. What consitutes a "play"? Use Machine learning to determine a good range. 
### 2. What times do listen to the most, and based on that information, when am I most likely to listen?
### 3. What are my clustering listening habits? What kinds of music do I like listening to, and when?
### 4. Using Spotify API data, what kinds of music would I like moving forward?
### 5. What effect does skipping have on my viewing habits? Or just any action?
