## **Importing libraries and creating model score functions**

In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,recall_score,precision_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


## **Import functions from spotify_functions.py**

In [20]:
from spotify_functions import model_score,nn_model_score

## **Import dataset of clean lyrics and gender target variable**

In [21]:
df_predict = pd.read_csv('csv_to_predict.csv')

df_predict = df_predict.iloc[:,1:]

df_predict.drop('lyrics',axis=1,inplace=True)

df_predict.columns = ['gender','lyrics']

## **Tokenize and split data into train and test**

In [22]:
x = df_predict['lyrics']
y = list(df_predict['gender'])

from keras.preprocessing.text import Tokenizer

tk = Tokenizer(
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{"}~\t\n',
               lower=True,
               char_level=False,
               split=' ')
tk.fit_on_texts(x)


X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=42)

## **Understand our features**

In [23]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

tfidf_wm = tfidf_vectorizer.fit_transform(X_train)

tfidf_tokens = tfidf_vectorizer.get_feature_names()

df_countvect = pd.DataFrame(data=tfidf_wm.toarray(),columns = tfidf_tokens)

df_countvect



Unnamed: 0,aa,aaaaahhhhh,aaaahhhhh,aaah,aah,aaherra,aaliyah,aamu,aandt,aao,...,zoomin,zoot,zorro,zot,zotnyzor,zoyd,zulu,zuma,zumschloss,zyngarettes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Vectorizing with Count Vectorizer and TFIDF**

In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

c_vectorizer = CountVectorizer()
X_train_vect_c = c_vectorizer.fit_transform(X_train)
X_test_vect_c = c_vectorizer.transform(X_test)


t_vectorizer = TfidfVectorizer()
X_train_vect_t = t_vectorizer.fit_transform(X_train)
X_test_vect_t = t_vectorizer.transform(X_test)


## **Count vectorizer + Decision Tree**

In [25]:
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix,classification_report,recall_score,precision_score,accuracy_score

dt_count = DecisionTreeClassifier()
dt_count.fit(X_train_vect_c,y_train)


model_score(dt_count,X_test_vect_c,X_train_vect_c,y_test,y_train)

Train Accuracy: 1.0
Test Accuracy: 0.5536105032822757
              precision    recall  f1-score   support

           0       0.51      0.47      0.49       207
           1       0.59      0.62      0.60       250

    accuracy                           0.55       457
   macro avg       0.55      0.55      0.55       457
weighted avg       0.55      0.55      0.55       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,97,110
Actual Men,94,156


**Observations**: Severe case of overfitting when using the base decision tree algorithm.

## **Count vectorizer + Random Forest**

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

rf_count = RandomForestClassifier()
rf_count.fit(X_train_vect_c,y_train)

model_score(rf_count,X_test_vect_c,X_train_vect_c,y_test,y_train)

Train Accuracy: 1.0
Test Accuracy: 0.6367614879649891
              precision    recall  f1-score   support

           0       0.66      0.41      0.51       207
           1       0.63      0.82      0.71       250

    accuracy                           0.64       457
   macro avg       0.64      0.62      0.61       457
weighted avg       0.64      0.64      0.62       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,85,122
Actual Men,44,206


**Observations**: We are dealing with another severe case of overfititng when using the basic random forest algorithm.

## **TFIDF Vectorizer + Tuned Random Forest**

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut


rf_tfidf = RandomForestClassifier()
rf_tfidf.fit(X_train_vect_t,y_train)

cv = LeaveOneOut()


param_grid = { 
    'n_estimators': [2,3,8],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [1,5,10],
    'criterion' :['gini', 'entropy']
}
rfc=RandomForestClassifier(random_state=42)
rf_tuned = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=cv,verbose=0)
rf_tuned.fit(X_train_vect_t, y_train)

model_score(rf_tfidf,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Accuracy: 1.0
Test Accuracy: 0.6236323851203501
              precision    recall  f1-score   support

           0       0.62      0.43      0.51       207
           1       0.62      0.79      0.70       250

    accuracy                           0.62       457
   macro avg       0.62      0.61      0.60       457
weighted avg       0.62      0.62      0.61       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,88,119
Actual Men,53,197


## **TFIDF + Logistic Regression**

In [28]:
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train_vect_t, y_train)

model_score(logisticRegr,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Accuracy: 0.8358862144420132
Test Accuracy: 0.6083150984682714
              precision    recall  f1-score   support

           0       0.59      0.44      0.51       207
           1       0.62      0.74      0.68       250

    accuracy                           0.61       457
   macro avg       0.60      0.59      0.59       457
weighted avg       0.61      0.61      0.60       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,92,115
Actual Men,64,186


## **TFIDF + Logistic Regression Tuned**

In [29]:
grid_values = {'penalty': ['l2'], 'C': [0.5,1,1.2,1.5,2,5]}
lr_tuned = GridSearchCV(LogisticRegression(solver='lbfgs',max_iter=500), param_grid=grid_values,refit=True,verbose=1)
lr_tuned.fit(X_train_vect_t,y_train)

# We've increased the number of iterations because the solver wouldn't converge.

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [30]:
model_score(lr_tuned,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Accuracy: 0.8358862144420132
Test Accuracy: 0.6083150984682714
              precision    recall  f1-score   support

           0       0.59      0.44      0.51       207
           1       0.62      0.74      0.68       250

    accuracy                           0.61       457
   macro avg       0.60      0.59      0.59       457
weighted avg       0.61      0.61      0.60       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,92,115
Actual Men,64,186


## **TFIDF + SVM**

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

svm_tfidf = make_pipeline(StandardScaler(with_mean=False),SVC(gamma='auto'))
svm_tfidf.fit(X_train_vect_t,y_train)

model_score(svm_tfidf,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Accuracy: 0.7950401167031363
Test Accuracy: 0.5448577680525164
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       207
           1       0.55      1.00      0.71       250

    accuracy                           0.54       457
   macro avg       0.27      0.50      0.35       457
weighted avg       0.30      0.54      0.39       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,0,207
Actual Men,1,249


**Observations**: The SVM severely overpredicts men in its predictions, as shown by the confusion matrix.

## **TFIDF + SVM tuned**

In [32]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

svm_tuned = GridSearchCV(SVC(),param_grid,refit=True,verbose=1)
svm_tuned.fit(X_train_vect_t,y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [33]:
model_score(svm_tuned,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Accuracy: 0.8796498905908097
Test Accuracy: 0.6280087527352297
              precision    recall  f1-score   support

           0       0.60      0.53      0.56       207
           1       0.65      0.71      0.68       250

    accuracy                           0.63       457
   macro avg       0.62      0.62      0.62       457
weighted avg       0.63      0.63      0.63       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,110,97
Actual Men,73,177


## **Cross validation - 5 folds**

In [34]:
from sklearn.model_selection import cross_validate

models = [dt_count,rf_count,rf_tfidf,svm_tfidf,svm_tuned,rf_tuned]


score=['accuracy','f1','precision','recall']

acc_list = []
f1_list = []
prec_list = []
rec_list = []


vect = CountVectorizer()

x_vect = vect.fit_transform(x)

for model in models:
    res = cross_validate(model,x_vect,y,scoring = score,cv=5)
    acc = (res['test_accuracy'].mean())
    f1 = (res['test_f1'].mean())
    prec =(res['test_precision'].mean())
    rec =(res['test_recall'].mean())
    acc_list.append(acc)
    f1_list.append(f1)
    prec_list.append(prec)
    rec_list.append(rec)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [35]:
score=['accuracy','f1','precision','recall']


cross_val_df = pd.DataFrame({'Models': models,'Accuracy':acc_list,'F1':f1_list,'Precision':prec_list,'Recall':rec_list})

cross_val_df

Unnamed: 0,Models,Accuracy,F1,Precision,Recall
0,DecisionTreeClassifier(),0.547041,0.590875,0.579646,0.603228
1,"(DecisionTreeClassifier(max_features='sqrt', r...",0.603924,0.670275,0.609606,0.746221
2,"(DecisionTreeClassifier(max_features='sqrt', r...",0.609376,0.677567,0.613583,0.757261
3,"(StandardScaler(with_mean=False), SVC(gamma='a...",0.544309,0.704505,0.543813,1.0
4,"GridSearchCV(estimator=SVC(),\n pa...",0.53882,0.586962,0.565466,0.628369
5,"GridSearchCV(cv=LeaveOneOut(),\n e...",0.551399,0.64749,0.565353,0.764474


**Observation**: The following table helps us clear any misinterpretation regarding the randomization effect of our dataset splitting. As we can see, all our models seem to predict poorly when using cross validation, even though we had some models generating better results on the test set.

## **Creating a Stemmed Dataset**

In [36]:
df_stemmed = df_predict.copy()


from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_stemmed['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_stemmed['lyrics']] 


from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
df_stemmed['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df_stemmed['tokenized_text'] ]

In [37]:
df_stemmed['stemmed_tokens'] = df_stemmed['stemmed_tokens'].astype(str)
df_stemmed

Unnamed: 0,gender,lyrics,tokenized_text,stemmed_tokens
0,1,"['when', 'your', 'legs', ""don't"", 'work', 'lik...","[when, your, legs, don, work, like, they, used...","['when', 'your', 'leg', 'don', 'work', 'like',..."
1,1,"[""i'm"", 'gonna', 'pick', 'up', 'the', 'pieces'...","[gonna, pick, up, the, pieces, and, build, leg...","['gonna', 'pick', 'up', 'the', 'piec', 'and', ..."
2,1,"['white', 'lips,', 'pale', 'face', 'breathing'...","[white, lips, pale, face, breathing, in, the, ...","['white', 'lip', 'pale', 'face', 'breath', 'in..."
3,1,"['i', 'was', 'so', 'high', 'i', 'did', 'not', ...","[was, so, high, did, not, recognize, the, fire...","['wa', 'so', 'high', 'did', 'not', 'recogn', '..."
4,1,"['may', 'i', 'have', 'your', 'attention,', 'pl...","[may, have, your, attention, please, may, have...","['mai', 'have', 'your', 'attent', 'pleas', 'ma..."
...,...,...,...,...
1823,0,"['common', 'love', ""isn't"", 'for', 'us', 'we',...","[common, love, isn, for, us, we, created, some...","['common', 'love', 'isn', 'for', 'us', 'we', '..."
1824,0,"['i', ""didn't"", 'ask', 'for', 'a', 'free', 'ri...","[didn, ask, for, free, ride, only, asked, you,...","['didn', 'ask', 'for', 'free', 'ride', 'onli',..."
1825,0,"['day', 'to', 'night', 'to', 'morning,', 'keep...","[day, to, night, to, morning, keep, with, me, ...","['dai', 'to', 'night', 'to', 'morn', 'keep', '..."
1826,0,"['maybe', ""it's"", 'the', 'way', 'you', 'say', ...","[maybe, it, the, way, you, say, my, name, mayb...","['mayb', 'it', 'the', 'wai', 'you', 'sai', 'my..."


In [38]:
x_stem = df_stemmed['stemmed_tokens']
y_stem = list(df_stemmed['gender'])
X_train_stem,X_test_stem,y_train_stem,y_test_stem = train_test_split(x_stem,y_stem,random_state=42)

## **Stemmed Dataset + TFIDF + SVM**

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = TfidfVectorizer()
X_train_vect_s = vectorizer.fit_transform(X_train_stem)
X_test_vect_s = vectorizer.transform(X_test_stem)

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

svm_tuned_s = GridSearchCV(SVC(),param_grid,refit=True,verbose=1)
svm_tuned_s.fit(X_train_vect_s,y_train)

model_score(svm_tuned_s,X_test_vect_s,X_train_vect_s,y_test,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Train Accuracy: 0.8701677607585704
Test Accuracy: 0.6389496717724289
              precision    recall  f1-score   support

           0       0.61      0.55      0.58       207
           1       0.66      0.71      0.68       250

    accuracy                           0.64       457
   macro avg       0.63      0.63      0.63       457
weighted avg       0.64      0.64      0.64       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,114,93
Actual Men,72,178


**Observations**: The stemmed dataset doesn't seem to have an impact on the predictive power of our models.

## **TFIDF + Sequential Neural Network**

In [40]:
def nn_model_score(model,x_test,x_train,y_test,y_train):
    print('Train Results:')
    print(model.evaluate(x_train,np.asarray(y_train)))
    print('Test Results:')
    print(model.evaluate(x_test,np.asarray(y_test)))

In [41]:
import tensorflow as tf

model_1 = tf.keras.Sequential(tf.keras.layers.Dense(1))

model_1.compile(loss = tf.keras.losses.BinaryCrossentropy(),

                    optimizer = tf.keras.optimizers.SGD(),

                    metrics = ['accuracy'])


train_y_nump = np.asarray(y_train)


model_1.fit(X_train_vect_t,train_y_nump,epochs = 200,verbose=0)

2023-01-09 17:19:38.116009: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


<keras.callbacks.History at 0x17e058820>

In [42]:
nn_model_score(model_1,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Results:
[0.48423969745635986, 0.797957718372345]
Test Results:
[0.6581444144248962, 0.6105032563209534]


## **Neural Network - Model 2**

In [43]:
tf.random.set_seed(40)

model_2 = tf.keras.Sequential([

  tf.keras.layers.Dense(100), # add 100 dense neurons

  tf.keras.layers.Dense(10), # add another layer with 10 neurons

  tf.keras.layers.Dense(1)

])

model_2.compile(loss=tf.keras.losses.BinaryCrossentropy(),

                optimizer=tf.keras.optimizers.Adam(), 

                metrics=['accuracy'])

model_2.fit(X_train_vect_t,train_y_nump, epochs=100, verbose=0)



<keras.callbacks.History at 0x2e12dd880>

In [44]:
nn_model_score(model_2,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Results:
[0.033752623945474625, 0.9978117942810059]
Test Results:
[4.522022724151611, 0.5929977893829346]


## **Neural Network - Model 3**

In [45]:
model_3 = tf.keras.Sequential([

                               tf.keras.layers.Dense(10, activation = 'relu'),

                               tf.keras.layers.Dense(4, activation = 'relu'),

                               tf.keras.layers.Dense(1, activation = 'sigmoid')

])

model_3.compile( loss= tf.keras.losses.binary_crossentropy,

                optimizer = tf.keras.optimizers.Adam(lr = 0.01),

                metrics = ['accuracy'])

model_3.fit(X_train_vect_t, train_y_nump, epochs = 25, verbose = 0)

  super().__init__(name, **kwargs)


<keras.callbacks.History at 0x2e228a430>

In [46]:
nn_model_score(model_3,X_test_vect_t,X_train_vect_t,y_test,y_train)

Train Results:
[0.0996043011546135, 0.9759299755096436]
Test Results:
[1.6593981981277466, 0.6301969289779663]


## **Conclusion**: 
In conclusion, most of our models had severe cases of overfitting. When trying to tune the parameters to reduce the overfitting, the models weren't able to generalise well on the validation set. In fact, our results seem to indicate that the lyrics aren't a strong predictor to assess the gender of an artist. This could be the case for many reason, such as the fact that most songs often contains sounds rather than words. Furthermore, the lyrics of songs released in the more recent years often contain less lyrics, making it harder for our algorithm to generate a robust prediction. 

To further our analysis, we could try to analyse the emotions contained in the lyrics of a song, and create a predictive model in such a way that the prediction is based off the emotions of the lyrics.