In [2]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(891)
from nltk.corpus import stopwords
import re
import scipy.sparse
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
# Fetch the NLTK resources this notebook relies on: the stopword list read via
# stopwords.words('english') and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): omw-1.4 is presumably needed by the installed NLTK's WordNet
# reader — confirm. As the last expression, its return value is the cell output.
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
# Load the raw Spotify dataset (comma-separated, which is the pandas default).
data = pd.read_csv('spotify_dataset.csv')

# Keep only the two columns used downstream: the track title (the text that is
# vectorised) and the popularity column (used as the classification target).
tracks = data[["track", "popularity"]]

In [4]:
# Text-normalisation resources used by the helper functions in this cell.
lemma = WordNetLemmatizer()              # WordNet lemmatizer
stemmer = SnowballStemmer('english')     # Snowball stemmer for English
stop = set(stopwords.words('english'))   # English stopwords as a set for fast lookup
#Stem and lemmatize a term
def lemmatize_stemming(term):
    """Return the stem of ``term`` after lemmatizing it as a verb.

    The term is first passed to the module-level ``lemma`` lemmatizer with
    ``pos='v'``; the lemma is then reduced by the module-level ``stemmer``.
    """
    return stemmer.stem(lemma.lemmatize(term, pos='v'))

def preprocess(text, deacc=False):
    """Tokenize ``text`` and return the cleaned tokens longer than 3 characters.

    Each token produced by ``gensim.utils.simple_preprocess`` is:
      1. stripped of any non-word characters,
      2. lowercased,
      3. dropped if it is in the module-level ``stop`` set,
      4. replaced by the placeholder ``"URL_"`` if it starts with ``"http"``,
      5. kept only if its length is greater than 3.

    Parameters
    ----------
    text : str or bytes
        Raw document text. Any other type (e.g. ``None`` or a NaN produced by
        a missing value) yields an empty list instead of raising.
    deacc : bool, default False
        Forwarded to ``simple_preprocess``. The default keeps accents, which
        matches the previous behaviour; pass ``True`` to strip them.

    Returns
    -------
    list of str
        The retained tokens, in document order.
    """
    # Missing values must not crash a Series.map over the whole column.
    if not isinstance(text, (str, bytes)):
        return []

    result = []
    for token in gensim.utils.simple_preprocess(text, deacc=deacc):
        # Raw string: "\W" in a plain literal is an invalid escape sequence.
        token = re.sub(r"\W", "", token).lower()

        # Stopword removal (delete this check to keep stopwords).
        if token in stop:
            continue

        # Collapse anything that looks like a URL scheme into one placeholder.
        if token.startswith("http"):
            token = "URL_"

        if len(token) > 3:
            result.append(token)
    return result
# These steps are applicable because the corpus contains special characters, punctuation, spaces, URLs and
# accented terms.

In [5]:
# Run the preprocessing function over every track title; each entry of the
# resulting Series is that title's list of tokens.
processed_docs = tracks.loc[:, 'track'].map(preprocess)

processed_docs.head()

0    [jealous, kind, fella]
1                [initials]
2           [melody, twist]
3             [bomba, sonó]
4            [uravu, solla]
Name: track, dtype: object

In [6]:
# Build the vocabulary from the tokenised documents.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Prune the vocabulary: drop tokens that occur in fewer than 5 documents
# (no_below=5), drop tokens that occur in more than 50% of documents
# (no_above=0.5), and keep at most the 100,000 most frequent remaining tokens
# (keep_n=100000).
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
# Convert every document to its bag-of-words form: a list of (token_id, count).
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
# Convert the bag-of-words corpus to a sparse term x document matrix.
# num_terms pins the number of rows to the vocabulary size instead of letting
# it be inferred from the largest token id that happens to occur in the corpus.
term_doc_matrix = gensim.matutils.corpus2csc(bow_corpus, num_terms=len(dictionary))
# Transpose so that rows are documents and columns are terms.
doc_term_matrix = term_doc_matrix.transpose()
print(doc_term_matrix.shape)

(41099, 2536)


In [7]:
# Dense document x term count table.
# - Cast to int32 while the matrix is still sparse so no dense float64 copy of
#   the full matrix is materialised first.
# - Column labels are looked up by token id so that column i is always named
#   after token id i, independent of the dictionary's iteration order.
df = pd.DataFrame(
    doc_term_matrix.astype('int32').toarray(),
    columns=[dictionary[token_id] for token_id in range(doc_term_matrix.shape[1])],
)

df.head()

Unnamed: 0,jealous,kind,melody,twist,bomba,beat,nota,note,samba,days,...,garage,spiegel,balamurali,chebika,claudio,binaural,sinus,graspop,asle,attention
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
from sklearn.preprocessing import StandardScaler
# Separate the dependent variable (popularity) from the independent variables
# (the bag-of-words counts). Selected by name rather than by column position.
y = tracks['popularity']
x = df
# Perform the split: 40% held out for evaluation, 60% used for training.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state=42)
print('Features before PCA: {}'.format(X_train.shape[1]))
# Fit the scaler on the training data only, then apply the same transform to
# the held-out data so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Reduce the bag-of-words features (count printed above) to 500 principal
# components. random_state makes the decomposition reproducible regardless of
# the global NumPy RNG state when a randomized SVD solver is selected.
pca = PCA(n_components = 500, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


Features before PCA: 2536


In [30]:
# Logistic-regression classifier trained on the PCA-reduced training features.
model = LogisticRegression(max_iter=500, solver="lbfgs")
model.fit(X=X_train_pca, y=y_train)

LogisticRegression(max_iter=500)

In [31]:
# Predict on the held-out PCA features and report accuracy as a whole percent.
y_pred = model.predict(X_test_pca)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy score: {}%".format(round(100 * score)))

Accuracy score: 71%
