In [1]:
#Importing pandas library and the dataset
import pandas as pd
import numpy as np
df = pd.read_csv('Sigmoid_Spotify_Data_2.csv')
#New dataframe with duration of every track 
duration = pd.read_csv('duration.csv')

In [2]:
#Deleting unnecessary columns
duration = duration.drop(['Unnamed: 0', 'endTime', 'msPlayed', 'sigmaritan'], axis = 1)

In [3]:
#Removing the duplicates of songs from the same listener
duration = duration[~pd.DataFrame(np.sort(duration[['trackName', 'artistName']], axis=1)).duplicated(keep='first')]

In [5]:
#Adding a new column to make the work easier
duration['song'] = duration['artistName'] +' '+ duration['trackName']

In [6]:
#Creating a dictionary, keys - unique songs, values - their duration
duration_dict = pd.Series(duration.miliseconds.values,index=duration.song).to_dict()

In [7]:
#Removing some columns
df = df.drop(['Unnamed: 0', 'endTime'], axis = 1)

In [8]:
#Adding a new column to make the work easier
df['song'] = df['artistName'] +' '+ df['trackName']

In [9]:
df

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,song
0,alex_clefos,Shinedown,My Name (Wearing Me Out),83780,Shinedown My Name (Wearing Me Out)
1,alex_clefos,Disturbed,Deify,1440453,Disturbed Deify
2,alex_clefos,Disturbed,Guarded,1776,Disturbed Guarded
3,alex_clefos,Disturbed,Deify,1002243,Disturbed Deify
4,alex_clefos,Disturbed,Guarded,1646,Disturbed Guarded
...,...,...,...,...,...
351885,vladimir_stojoc,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow),212352,Lil Nas X INDUSTRY BABY (feat. Jack Harlow)
351886,vladimir_stojoc,bbno$,resume,164426,bbno$ resume
351887,vladimir_stojoc,YUNGBLUD,braindead!,163293,YUNGBLUD braindead!
351888,vladimir_stojoc,Rich Brian,Getcho Mans,211306,Rich Brian Getcho Mans


In [10]:
#Mapping through the dictionary the entire dataset
df["duration"] = df["song"].map(duration_dict)

In [11]:
#Removing the added column
df = df.drop(['song'], axis = 1)
df

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,duration
0,alex_clefos,Shinedown,My Name (Wearing Me Out),83780,216187.0
1,alex_clefos,Disturbed,Deify,1440453,256573.0
2,alex_clefos,Disturbed,Guarded,1776,200293.0
3,alex_clefos,Disturbed,Deify,1002243,256573.0
4,alex_clefos,Disturbed,Guarded,1646,200293.0
...,...,...,...,...,...
351885,vladimir_stojoc,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow),212352,212000.0
351886,vladimir_stojoc,bbno$,resume,164426,164427.0
351887,vladimir_stojoc,YUNGBLUD,braindead!,163293,163293.0
351888,vladimir_stojoc,Rich Brian,Getcho Mans,211306,211307.0


In [12]:
#Grouping by sigmaritans and tracks and selecting only the unique songs listened to the longest
df2 = df.groupby(['sigmaritan', 'artistName', 'trackName'])['duration'].transform(max) == df['duration']

In [13]:
df = df[df2]

In [14]:
df

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,duration
0,alex_clefos,Shinedown,My Name (Wearing Me Out),83780,216187.0
1,alex_clefos,Disturbed,Deify,1440453,256573.0
2,alex_clefos,Disturbed,Guarded,1776,200293.0
3,alex_clefos,Disturbed,Deify,1002243,256573.0
4,alex_clefos,Disturbed,Guarded,1646,200293.0
...,...,...,...,...,...
351885,vladimir_stojoc,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow),212352,212000.0
351886,vladimir_stojoc,bbno$,resume,164426,164427.0
351887,vladimir_stojoc,YUNGBLUD,braindead!,163293,163293.0
351888,vladimir_stojoc,Rich Brian,Getcho Mans,211306,211307.0


In [15]:
#Selecting only the tracks that were listened to almost entirely (>=0.9 of duration)
df = df.loc[df['msPlayed'] / df['duration'] >= 0.9]
df

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,duration
1,alex_clefos,Disturbed,Deify,1440453,256573.0
3,alex_clefos,Disturbed,Deify,1002243,256573.0
5,alex_clefos,Disturbed,Deify,664190,256573.0
7,alex_clefos,Disturbed,Indestructible,543770,278027.0
8,alex_clefos,Disturbed,Indestructible,564040,278027.0
...,...,...,...,...,...
351885,vladimir_stojoc,Lil Nas X,INDUSTRY BABY (feat. Jack Harlow),212352,212000.0
351886,vladimir_stojoc,bbno$,resume,164426,164427.0
351887,vladimir_stojoc,YUNGBLUD,braindead!,163293,163293.0
351888,vladimir_stojoc,Rich Brian,Getcho Mans,211306,211307.0


In [16]:
#Soring the dataframe in descending order by miliseconds
sorted_df = df.sort_values(['duration'], ascending=[False])

In [17]:
#function to transform miliseconds in minutes
def ms_to_min(x):
    return x/60000

In [18]:
#applying the function
sorted_df['minutes'] = sorted_df['duration'].apply(ms_to_min)

In [19]:
#Reseting index
sorted_df.reset_index(inplace = True, drop = True)

In [20]:
#Removing duplicates of songs by same listener
sorted_df = sorted_df[~pd.DataFrame(np.sort(sorted_df[['sigmaritan','trackName', 'artistName']], axis=1)).duplicated(keep='first')]

## These are  the longest tracks listened to

In [21]:
sorted_df.head(15)

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,duration,minutes
0,marius_sclearuc,Godspeed You! Black Emperor,Storm,1303115,1352413.0,22.540217
5,marius_sclearuc,Rush,2112: Overture / The Temples Of Syrinx / Disco...,1233666,1233667.0,20.561117
6,andreea_covalevschi,Buckethead,The Left Panel,1106050,1157700.0,19.295
7,marius_sclearuc,MONO,Requiem for Hell,1068453,1068453.0,17.80755
8,marius_sclearuc,We Lost The Sea,The Last Dive of David Shaw,967356,1016600.0,16.943333
10,marius_sclearuc,Yndi Halda,Dash And Blast,1447224,1011280.0,16.854667
11,andreea_covalevschi,Iced Earth,Dante's Inferno,986826,986827.0,16.447117
12,andreea_covalevschi,TOOL,7empest,900584,943529.0,15.725483
27,marius_sclearuc,The Seven Mile Journey,Through The Alter Ego Justifications,927653,927653.0,15.460883
28,marius_sclearuc,We Lost The Sea,Towers,903680,903680.0,15.061333


In [22]:
#Grouping by sigmaritan and taking the max values for miliseconds
x = sorted_df.groupby(['sigmaritan'])['duration'].transform(max) == sorted_df['duration']
longest_tracks = sorted_df[x]

In [23]:
#Reseting index
longest_tracks.reset_index(inplace = True, drop = True)

## These are the longest tracks listened by every sigmaritan

In [24]:
longest_tracks

Unnamed: 0,sigmaritan,artistName,trackName,msPlayed,duration,minutes
0,marius_sclearuc,Godspeed You! Black Emperor,Storm,1303115,1352413.0,22.540217
1,andreea_covalevschi,Buckethead,The Left Panel,1106050,1157700.0,19.295
2,elena_timbur,Franz Schubert,"6 Grandes Marches, Op. 40, D. 819: No. 5 in E-...",869909,870120.0,14.502
3,nicolae_gherman,Richard Wagner,Tannhäuser: Overture,853826,853827.0,14.23045
4,vasile_papaluta,Fearless Motivation,Finding Your Purpose,821033,821034.0,13.6839
5,irina_tiora,Pink Floyd,"Shine On You Crazy Diamond, Pts. 1-5 - 2011 Re...",735642,810293.0,13.504883
6,arina_coroliuc,Johann Strauss II,"An der schönen blauen Donau, Op. 314",707173,707173.0,11.786217
7,vladimir_stojoc,Oxxxymiron,КТО УБИЛ МАРКА?,567000,567001.0,9.450017
8,alex_clefos,Black Sabbath,God Is Dead?,532266,532267.0,8.871117
9,eduard_balamatiuc,Ed Sheeran,Give Me Love,526386,526387.0,8.773117


In [61]:
#Loading again the original dataframe
df = pd.read_csv('Sigmoid_Spotify_Data_2.csv')

In [62]:
#Dataframe with songs played at most 30 seconds
skipped = df.loc[df['msPlayed'] <= 30000]

In [63]:
#Grouping by sigmaritan
skipped_counts = skipped.groupby('sigmaritan').count()
nr_of_tracks = df.groupby('sigmaritan').count()

In [64]:
#Saving the number of skips
skipped_counts['percentage of skipping'] = skipped_counts['msPlayed'] / nr_of_tracks['msPlayed'] * 100

In [65]:
#Removing unnecessary columns
skipped_counts = skipped_counts.drop(['Unnamed: 0', 'endTime','artistName', 'trackName', 'msPlayed'], axis = 1)

In [66]:
#Sorting in descending order
sorted_skippes = skipped_counts.sort_values(['percentage of skipping'], ascending=[False])

## This dataframe represents the percentage of skips by sigmaritan

In [67]:
sorted_skippes

Unnamed: 0_level_0,percentage of skipping
sigmaritan,Unnamed: 1_level_1
nicolae_dubenco,71.928434
eduard_balamatiuc,51.299723
nicolae_gherman,46.708083
irina_tiora,45.45063
marius_purici,42.430761
marius_sclearuc,35.422297
andreea_covalevschi,34.241793
denis_smocvin,32.49497
alex_clefos,22.903701
arina_coroliuc,22.361131


In [68]:
#Saving results
sorted_skippes.to_csv('percentage_of_skipping.csv', index = True)
longest_tracks.to_csv('longest_tracks_by_sigmaritan.csv', index = True)
sorted_df.to_csv('15_longest.csv', index = True)