In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Load the word list (the preview shows a `word` and a `type` column) and
# show its first rows.
all_words_df = pd.read_csv(
    filepath_or_buffer="/Users/iman/CPSC_Courses/CPSC393/CPSC393Final/Data/all_words.csv",
)
all_words_df.head(n=5)

Unnamed: 0,word,type
0,peeps,slang
1,compo,slang
2,hasbian,slang
3,mozzie,slang
4,soused,slang


In [13]:
# Load the 2010-2019 tracks table (lyrics plus popularity columns) and show
# its first rows.
years_2010_to_2019_df = pd.read_csv(
    filepath_or_buffer="/Users/iman/CPSC_Courses/CPSC393/CPSC393Final/Data/2010_to_2019.csv",
)
years_2010_to_2019_df.head(n=5)

Unnamed: 0,release_date,track_name,artist_name,album_name,lyrics,track_popularity,artist_popularity
0,2010,Atmosphere - 2010 Remastered Version,Joy Division,Substance,it s early morning the sun comes out las...,51.0,66
1,2010,My Best Days Are Ahead Of Me,Danny Gokey,My Best Days,blowing out the candles on another birthday...,38.0,57
2,2010,El Amor Que Perdimos,Prince Royce,Number 1's,letra de el amor que perdimos anoche so...,54.0,82
3,2010,Carry on Wayward Son,Kansas,The Essential Kansas,carry on my wayward son there ll be peace...,65.0,68
4,2010,Aves de Agua,Aaron Cruz Trío,Eco,99,24.0,16


In [14]:
def preprocess_input(sentence):
  """Turn one raw lyrics string into a list of tokens.

  The text is lowercased, passed through gensim's ``remove_stopwords`` and
  then tokenized with gensim's ``simple_preprocess``.

  Args:
    sentence: Raw lyrics text. A list is treated as already tokenized and is
      returned unchanged, so applying this function to an already processed
      column is a no-op instead of an error. Any other non-string value
      (for example ``None`` or a float NaN from a missing cell) yields an
      empty token list.

  Returns:
    list: The tokens extracted from ``sentence``.
  """
  # Already tokenized (e.g. the cell that applies this function was re-run).
  if isinstance(sentence, list):
    return sentence
  # Missing or non-text cell: there is nothing to tokenize.
  if not isinstance(sentence, str):
    return []
  lowered = sentence.lower()
  without_stopwords = remove_stopwords(lowered)
  return simple_preprocess(without_stopwords)

# Tokenize every lyric; the result overwrites the raw 'lyrics' column in place.
years_2010_to_2019_df['lyrics'] = years_2010_to_2019_df['lyrics'].apply(preprocess_input)
years_2010_to_2019_df.head(n=2)

Unnamed: 0,release_date,track_name,artist_name,album_name,lyrics,track_popularity,artist_popularity
0,2010,Atmosphere - 2010 Remastered Version,Joy Division,Substance,"[early, morning, sun, comes, night, shaking, p...",51.0,66
1,2010,My Best Days Are Ahead Of Me,Danny Gokey,My Best Days,"[blowing, candles, birthday, cake, old, look, ...",38.0,57


In [15]:
# Hold out 20% of the tracks for evaluation. The fixed random_state makes the
# split (and everything fitted on it) reproducible across kernel restarts.
# NOTE(review): track_popularity previews as float while artist_popularity is
# int — check the target for missing values before training.
x_train, x_test, y_train, y_test = train_test_split(
    years_2010_to_2019_df['lyrics'],
    years_2010_to_2019_df['track_popularity'],
    test_size=0.2,
    random_state=42,
)

In [24]:
# Preview the first few tokenized training lyrics.
x_train.head()

5285    [sé, parar, al, viento, mis, hélices, los, hur...
7135                                                   []
347     [want, lover, like, want, friend, stay, end, o...
1312    [jag, hade, drömmar, shit, vad, jag, hade, drö...
2275    [cannon, cannon, know, walk, racks, bad, bitch...
Name: lyrics, dtype: object

In [17]:
# Bag-of-words features. The lyrics are already token lists, so the
# "tokenizer" passes each document through unchanged and lowercasing is off.
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
# NOTE(review): .toarray() materialises dense (documents x vocabulary)
# matrices — confirm the dense form is really needed downstream.
X_train = cv.fit_transform(x_train).toarray()
X_test = cv.transform(x_test).toarray()

# Distinct training words in first-seen order. dict.fromkeys de-duplicates in
# a single pass while preserving insertion order, avoiding a linear list scan
# for every token.
unique_words = list(dict.fromkeys(word for sentence in x_train for word in sentence))
print('Number of unique words available: ', len(unique_words))
print('Shape of train data: ', X_train.shape)
print('Shape of test data: ', X_test.shape)

Number of unique words available:  77066
Shape of train data:  (5834, 77066)
Shape of test data:  (1459, 77066)


In [25]:
# Display the training lyrics Series (pandas abbreviates the repr of a long Series).
x_train

5285    [sé, parar, al, viento, mis, hélices, los, hur...
7135                                                   []
347     [want, lover, like, want, friend, stay, end, o...
1312    [jag, hade, drömmar, shit, vad, jag, hade, drö...
2275    [cannon, cannon, know, walk, racks, bad, bitch...
                              ...                        
809                                                    []
6855    [gjør, det, deg, gjør, det, meg, gjør, det, os...
303     [somebody, longing, hope, turns, ll, watch, li...
1860    [te, vayan, culpar, te, vayan, culpar, botelli...
4052    [feel, coming, static, won, open, hands, fall,...
Name: lyrics, Length: 5834, dtype: object

In [None]:
# Keep each word once. Sorting makes the order identical on every run, whereas
# a bare set() has no defined order.
unique_words = sorted(set(unique_words))
print(len(unique_words))
# Show only a sample: printing the whole vocabulary floods the notebook output.
print(unique_words[:20])

In [20]:
# Build the word -> integer index over every document (train and test together).
tokenizer_obj = Tokenizer()
# pd.concat replaces Series.append, which pandas deprecated in 1.4 and removed
# in 2.0.
total_reviews = pd.concat([x_train, x_test])
tokenizer_obj.fit_on_texts(total_reviews)

# Every sequence is padded out to the length of the longest document.
max_length = max(len(s) for s in total_reviews)

# One more than the number of indexed words, leaving index 0 free for the
# padding value written by pad_sequences.
vocab_size = len(tokenizer_obj.word_index) + 1

# Map each token list to its integer ids.
x_train_tokens = tokenizer_obj.texts_to_sequences(x_train)
x_test_tokens = tokenizer_obj.texts_to_sequences(x_test)

# Pad at the end ('post') so all rows share the same length.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_length, padding='post')
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_length, padding='post')

In [22]:
# Create Model
# Embedding -> single LSTM -> one-unit output that regresses track_popularity.
model = Sequential()

# NOTE(review): inputs are zero-padded at the end up to max_length, but the
# embedding does not mask index 0, so the LSTM also steps through the padding.
# Consider mask_zero=True and/or a smaller max_length — confirm before changing.
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(LSTM(32))
# NOTE(review): a ReLU output cannot go negative, but it also stops passing
# gradient if its pre-activation goes negative — consider a linear output.
model.add(Dense(1, activation='relu'))

model.summary()

# The target is continuous and trained with MSE, so track mean absolute error;
# classification accuracy is not meaningful for a regression target.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])
model.fit(x_train_pad, y_train, batch_size=32, epochs=10, validation_data=(x_test_pad, y_test))

2022-04-26 18:28:40.621193: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7095, 100)         8810300   
                                                                 
 lstm (LSTM)                 (None, 32)                17024     
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 8,827,357
Trainable params: 8,827,357
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 