<a href="https://colab.research.google.com/github/luckyswaminathan/MusicVibe/blob/main/gloVeNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
## imports
## GloVe credits:
## Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.
## Adapting the structure given by https://www.youtube.com/watch?v=e0WW5w13V64&t=68s&ab_channel=GregHogg to create
## an NLP model for my app Music Vibe (the data source is open source, so it requires no credit)
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [4]:
## location of the raw tweet/emotion CSV in the project's GitHub repository
url = 'https://raw.githubusercontent.com/luckyswaminathan/MusicVibe/main/tweet_emotions.csv'

## load the CSV into a DataFrame; later cells read its 'sentiment' and 'content' columns
msc_df = pd.read_csv(filepath_or_buffer=url)

def custencoder(df):
  """Replace emotion labels with integer class codes, modifying ``df`` in place.

  Codes: 0 for anger/hate/sadness/worry, 1 for boredom/empty/neutral and
  2 for enthusiasm/fun/happiness/love/relief/surprise. Values that are not
  one of these labels are left as they are.

  Args:
    df: pandas object supporting ``replace(..., inplace=True)``; the notebook
      passes the 'sentiment' column.

  Returns:
    None. The result is visible only through the mutated ``df``.
  """
  ## (label, code) pairs, applied one after another
  label_codes = (
      ("anger", 0),
      ("boredom", 1),
      ("empty", 1),
      ("neutral", 1),
      ("enthusiasm", 2),
      ("fun", 2),
      ("happiness", 2),
      ("hate", 0),
      ("love", 2),
      ("relief", 2),
      ("sadness", 0),
      ("surprise", 2),
      ("worry", 0),
  )

  for label, code in label_codes:
    df.replace(to_replace=label, value=code, inplace=True)

## custencoder works in place. Mutating the Series returned by msc_df['sentiment']
## only reaches the DataFrame when pandas hands back a view; with copy-on-write
## the frame would stay unencoded. Encode a standalone copy and assign it back
## explicitly so the result does not depend on that behaviour.
sentiment_codes = msc_df['sentiment'].copy()
custencoder(sentiment_codes)
msc_df['sentiment'] = sentiment_codes

## class balance after encoding
msc_df['sentiment'].value_counts()

2    15299
0    15057
1     9644
Name: sentiment, dtype: int64

In [6]:
## using GloVe rather than word2vec because it works better with a small dataset

## -nc (no-clobber) skips the 822M download when glove.6B.zip already exists,
## so re-running the notebook neither fetches it again nor leaves glove.6B.zip.1 copies
!wget -nc http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

--2023-06-09 00:18:01--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-06-09 00:20:43 (5.09 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [9]:
## -n never overwrites files that were already extracted, so a re-run does not
## stop at unzip's interactive "replace?" prompt
!unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [10]:
words = dict()
def add_to_dict(d, filename):
  """Load word vectors from a GloVe-style text file into ``d``.

  Every line is expected to look like ``<token> <v1> <v2> ...`` with single
  spaces between fields. The token becomes the key and the remaining fields
  become a 1-D float array. Lines without at least one value after the token
  (for example blank lines) are skipped.

  Args:
    d: dict that is updated in place (token -> np.ndarray of floats).
    filename: path of the vector file to read.

  Returns:
    None.

  Raises:
    ValueError: if a value field cannot be parsed as a float.
  """
  ## explicit UTF-8 so the result does not depend on the platform's default encoding
  with open(filename, 'r', encoding='utf-8') as f:
    ## iterate lazily instead of readlines(): the file is never held in memory as a whole
    for line in f:
      fields = line.rstrip('\n').split(' ')
      if len(fields) < 2:
        continue
      d[fields[0]] = np.array(fields[1:], dtype=float)

## fill the lookup table from the 50d GloVe file, then show how many tokens were loaded
add_to_dict(d=words, filename='glove.6B.50d.txt')
len(words)

400000

In [11]:
import nltk as nl
## download the WordNet corpus (the lemmatizer created in a later cell is WordNet-based)
## NOTE(review): some NLTK releases may additionally need nl.download('omw-1.4') -- confirm for the installed version
nl.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
from nltk.stem import WordNetLemmatizer

## tokenizer that keeps runs of word characters (\w+), which drops punctuation and whitespace
tokenizer = nl.RegexpTokenizer(pattern=r"\w+")

## LEMMATIZATION -- reducing the variants of a word to one base form
lemmatizer = WordNetLemmatizer()


## tokenizes and lemmatizes a message, keeping only the words that are in the GloVe word set
def tokenLem(s):
  """Return the lowercased, lemmatized tokens of ``s`` that are keys of ``words``.

  Args:
    s: text to split with the module-level ``tokenizer``.

  Returns:
    List of token strings in their original order; tokens whose lemma is not
    in the module-level ``words`` dict are dropped.
  """
  lemmas = (lemmatizer.lemmatize(token.lower()) for token in tokenizer.tokenize(s))
  return [lemma for lemma in lemmas if lemma in words]


In [13]:
def message_to_word_vectors(message, word_dict=words):
  """Turn a message into the array of vectors of its known tokens.

  Args:
    message: text that is tokenized and lemmatized with ``tokenLem``.
    word_dict: mapping from token to vector; tokens missing from it are
      ignored. Defaults to the module-level ``words``.

  Returns:
    Float array with one row per kept token. When no token is kept, the
    result is an empty array whose first dimension is 0.
  """
  known_vectors = [
      word_dict[token]
      for token in tokenLem(message)
      if token in word_dict
  ]
  return np.array(known_vectors, dtype=float)

  

In [14]:
## train / validation / test split

## hold out 30% of the rows ...
train_df, mix_df = train_test_split(msc_df, test_size=0.3, random_state=42)

## ... and cut the held-out part in half: one half for validation, one for testing
val_df, test_df = train_test_split(mix_df, test_size=0.5, random_state=42)

## number of rows in each partition
tuple(len(part) for part in (train_df, val_df, test_df))


(28000, 6000, 6000)

In [15]:
## splitting train,val, test into sentiment and content 


def df_to_SC(dff, embedding_dim=50):
  """Split a DataFrame into per-message word-vector sequences and labels.

  Args:
    dff: DataFrame with a 'content' column (messages handed to
      ``message_to_word_vectors``) and a 'sentiment' column whose values
      can be cast to int.
    embedding_dim: width of the all-zero placeholder vector used for messages
      without any known token. Defaults to 50, the value that used to be
      hard-coded here; pass another width when different vectors are loaded.

  Returns:
    Tuple ``(sequences, y)``: ``sequences`` is a list with one array per
    message (sequence lengths differ between messages), ``y`` is the integer
    label array taken from the 'sentiment' column.
  """
  ## sentiment for sentence
  y = dff['sentiment'].to_numpy().astype(int)

  ## content array
  all_word_vector_sequences = []

  for message in dff['content']:
    message_as_vector_seq = message_to_word_vectors(message)

    ## a message with no known token still needs one (all-zero) vector so that
    ## every sequence has at least one row
    if message_as_vector_seq.shape[0] == 0:
      message_as_vector_seq = np.zeros(shape=(1, embedding_dim))

    all_word_vector_sequences.append(message_as_vector_seq)

  return all_word_vector_sequences, y




In [16]:
## vector sequences (one per message) and integer labels of the training split
word_train, feel_train = df_to_SC(dff=train_df)

In [17]:
## making the df into a np array

from copy import deepcopy

def pad_X(X, desired_sequence_length=57):
  """Bring every vector sequence to the same length and stack them.

  Sequences shorter than ``desired_sequence_length`` are padded at the end
  with all-zero vectors. Longer sequences are cut to their first
  ``desired_sequence_length`` vectors; previously such a sequence made
  ``np.zeros`` fail on a negative dimension. The vector width is taken from
  each sequence itself, so it is no longer tied to 50. The input list and its
  arrays are not modified.

  Args:
    X: iterable of 2-D arrays shaped (sequence_length, vector_width).
    desired_sequence_length: number of vectors every sequence has afterwards.

  Returns:
    Float array of shape (len(X), desired_sequence_length, vector_width).
  """
  padded = []

  for x in X:
    ## slicing keeps at most the first desired_sequence_length vectors
    x = np.asarray(x, dtype=float)[:desired_sequence_length]
    missing = desired_sequence_length - x.shape[0]

    if missing > 0:
      pad = np.zeros(shape=(missing, x.shape[1]))
      x = np.concatenate([x, pad])

    padded.append(x)

  return np.array(padded).astype(float)

In [18]:
## pad the three splits to the same fixed sequence length
word_train = pad_X(word_train)

word_val, feel_val = df_to_SC(val_df)
word_val = pad_X(word_val)

word_test, feel_test = df_to_SC(test_df)
word_test = pad_X(word_test)

## a bare expression is only displayed when it is the last line of the cell,
## so the shapes are shown here instead of mid-cell
word_train.shape, word_val.shape, word_test.shape




In [19]:
## LSTM model

from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

model = Sequential([])

model.add(layers.Input(shape=(57, 50)))
model.add(layers.LSTM(64, return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(64, return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.LSTM(64, return_sequences=True))
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))


In [20]:
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint

cp = ModelCheckpoint('model/', save_best_only=True)

model.compile(optimizer=Adam(learning_rate=0.000001), 
              loss=BinaryCrossentropy(), 
              metrics=['accuracy', AUC(name='auc')])

In [21]:
frequencies = pd.value_counts(train_df['sentiment']) 
frequencies
## weighting loss function as datset imbalanced

weights = {0: frequencies.sum() / frequencies[0], 1: frequencies.sum() / frequencies[1], 2: frequencies.sum() / frequencies[2]}
weights

{0: 2.6465028355387523, 1: 4.163568773234201, 2: 2.6180458158017763}

In [None]:
model.fit(word_train, feel_train, validation_data=(word_val, feel_val), epochs=20, callbacks=[cp], class_weight=weights)

Epoch 1/20



Epoch 2/20



Epoch 3/20



Epoch 4/20



Epoch 5/20



Epoch 6/20

In [23]:
from tensorflow.keras.models import load_model

best_model = load_model('model/')

In [24]:
test_predictions = (best_model.predict(word_test) > 0.5).astype(int)

from sklearn.metrics import classification_report

print(classification_report(feel_test, test_predictions))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2218
           1       0.25      1.00      0.40      1493
           2       0.00      0.00      0.00      2289

    accuracy                           0.25      6000
   macro avg       0.08      0.33      0.13      6000
weighted avg       0.06      0.25      0.10      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
