## **Importing libraries and creating model score functions**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import _stop_words
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix,classification_report,recall_score,precision_score,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def model_score(model,x):
	preds = model.predict(x)
	cmat = confusion_matrix(y_test, preds)
	print(f'Accuracy: {accuracy_score(y_test, preds)}')
	print(classification_report(y_test, preds))
	return pd.DataFrame(cmat, columns=['Predicted ' + str(i) for i in ['Women','Men']],\
					index=['Actual ' + str(i) for i in ['Women','Men']])

## **Import dataset of clean lyrics and gender target variable**

In [12]:
df_predict = pd.read_csv('csv_to_predict.csv')

df_predict = df_predict.iloc[:,1:]

## **Split data into train and test**

In [13]:
x = list(df_predict['lyrics'])
y = list(df_predict['gender'])
X_train,X_test,y_train,y_test = train_test_split(x,y,random_state=42)

## **Count vectorizer with a decision tree**

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier

vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

dt_count = DecisionTreeClassifier()
dt_count.fit(X_train_vect,y_train)

In [5]:
model_score(dt_count,X_test_vect)

Accuracy: 0.5864332603938731
              precision    recall  f1-score   support

           0       0.55      0.50      0.52       207
           1       0.61      0.66      0.64       250

    accuracy                           0.59       457
   macro avg       0.58      0.58      0.58       457
weighted avg       0.58      0.59      0.58       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,103,104
Actual Men,85,165


## **Count vectorizer with random forest**

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

rf_count = RandomForestClassifier()
rf_count.fit(X_train_vect,y_train)

In [7]:
model_score(rf_count,X_test_vect)

Accuracy: 0.6214442013129103
              precision    recall  f1-score   support

           0       0.63      0.40      0.49       207
           1       0.62      0.80      0.70       250

    accuracy                           0.62       457
   macro avg       0.62      0.60      0.59       457
weighted avg       0.62      0.62      0.60       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,83,124
Actual Men,49,201


## **TFIDF Vectorizer with Random Forest**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

rf_tfidf = RandomForestClassifier()
rf_tfidf.fit(X_train_vect,y_train)

In [9]:
model_score(rf_tfidf,X_test_vect)

Accuracy: 0.6258205689277899
              precision    recall  f1-score   support

           0       0.64      0.39      0.49       207
           1       0.62      0.82      0.71       250

    accuracy                           0.63       457
   macro avg       0.63      0.61      0.60       457
weighted avg       0.63      0.63      0.61       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,81,126
Actual Men,45,205


## **TFIDF + SVM**

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

svm_tfidf = make_pipeline(StandardScaler(with_mean=False),SVC(gamma='auto'))
svm_tfidf.fit(X_train_vect,y_train)

In [11]:
model_score(svm_tfidf,X_test_vect)

Accuracy: 0.5448577680525164
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       207
           1       0.55      1.00      0.71       250

    accuracy                           0.54       457
   macro avg       0.27      0.50      0.35       457
weighted avg       0.30      0.54      0.39       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,0,207
Actual Men,1,249


In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

svm_tuned = GridSearchCV(SVC(),param_grid,refit=True,verbose=1)
svm_tuned.fit(X_train_vect,y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [17]:
model_score(svm_tuned,X_test_vect)

Accuracy: 0.6061269146608315
              precision    recall  f1-score   support

           0       0.58      0.49      0.53       207
           1       0.62      0.70      0.66       250

    accuracy                           0.61       457
   macro avg       0.60      0.60      0.60       457
weighted avg       0.60      0.61      0.60       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,101,106
Actual Men,74,176


In [None]:
param_grid = { 
    'n_estimators': [300,350,400],
    'max_features': ['auto','sqrt', 'log2'],
    'max_depth' : [None,1,2,4,5],
    'criterion' :['gini', 'entropy']
}
rfc=RandomForestClassifier(random_state=42)
rf_tuned = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,verbose=0)
rf_tuned.fit(X_train_vect, y_train)

In [22]:
model_score(rf_tuned,X_test_vect)

Accuracy: 0.6258205689277899
              precision    recall  f1-score   support

           0       0.65      0.38      0.48       207
           1       0.62      0.83      0.71       250

    accuracy                           0.63       457
   macro avg       0.63      0.60      0.59       457
weighted avg       0.63      0.63      0.60       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,79,128
Actual Men,43,207


In [23]:
rf_tuned.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'auto',
 'n_estimators': 300}

## **Cross validation - 5 folds**

In [None]:
from sklearn.model_selection import cross_validate

models = [dt_count,rf_count,rf_tfidf,svm_tfidf,svm_tuned,rf_tuned]

score=['accuracy','f1','precision','recall']

acc_list = []
f1_list = []
prec_list = []
rec_list = []


vect = CountVectorizer()

x_vect = vect.fit_transform(x)

for model in models:
    res = cross_validate(model,x_vect,y,scoring = score,cv=5)
    acc = (res['test_accuracy'].mean())
    f1 = (res['test_f1'].mean())
    prec =(res['test_precision'].mean())
    rec =(res['test_recall'].mean())
    acc_list.append(acc)
    f1_list.append(f1)
    prec_list.append(prec)
    rec_list.append(rec)



In [25]:
score=['accuracy','f1','precision','recall']


cross_val_df = pd.DataFrame({'Models': models,'Accuracy':acc_list,'F1':f1_list,'Precision':prec_list,'Recall':rec_list})

cross_val_df

Unnamed: 0,Models,Accuracy,F1,Precision,Recall
0,DecisionTreeClassifier(),0.560704,0.597979,0.593361,0.603198
1,"(DecisionTreeClassifier(max_features='sqrt', r...",0.603379,0.668501,0.610659,0.741176
2,"(DecisionTreeClassifier(max_features='sqrt', r...",0.591874,0.663966,0.600215,0.744186
3,"(StandardScaler(with_mean=False), SVC(gamma='a...",0.543762,0.704255,0.543514,1.0
4,"GridSearchCV(estimator=SVC(),\n pa...",0.539913,0.58371,0.566092,0.619278
5,"GridSearchCV(cv=5, estimator=RandomForestClass...",0.599528,0.68128,0.600859,0.790513


## **Creating a Stemmed Dataset**

In [26]:
df_stemmed = df_predict.copy()


from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_stemmed['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_stemmed['lyrics']] 


from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
df_stemmed['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df_stemmed['tokenized_text'] ]




In [27]:
df_stemmed

Unnamed: 0,lyrics,gender,tokenized_text,stemmed_tokens
0,when your legs don't work like they used to b...,1,"[when, your, legs, don, work, like, they, used...","[when, your, leg, don, work, like, thei, us, t..."
1,i'm gonna pick up the pieces and build a lego...,1,"[gonna, pick, up, the, pieces, and, build, leg...","[gonna, pick, up, the, piec, and, build, lego,..."
2,"white lips, pale face breathing in the snowfl...",1,"[white, lips, pale, face, breathing, in, the, ...","[white, lip, pale, face, breath, in, the, snow..."
3,i was so high i did not recognize the fire bur...,1,"[was, so, high, did, not, recognize, the, fire...","[wa, so, high, did, not, recogn, the, fire, bu..."
4,"may i have your attention, please? may i have...",1,"[may, have, your, attention, please, may, have...","[mai, have, your, attent, pleas, mai, have, yo..."
...,...,...,...,...
1823,common love isn't for us we created something ...,0,"[common, love, isn, for, us, we, created, some...","[common, love, isn, for, us, we, creat, someth..."
1824,i didn't ask for a free ride i only asked you...,0,"[didn, ask, for, free, ride, only, asked, you,...","[didn, ask, for, free, ride, onli, ask, you, t..."
1825,"day to night to morning, keep with me in the ...",0,"[day, to, night, to, morning, keep, with, me, ...","[dai, to, night, to, morn, keep, with, me, in,..."
1826,maybe it's the way you say my name maybe it's...,0,"[maybe, it, the, way, you, say, my, name, mayb...","[mayb, it, the, wai, you, sai, my, name, mayb,..."


In [28]:
x_stem = df_stemmed['stemmed_tokens']
y_stem = list(df_stemmed['gender'])
X_train_stem,X_test_stem,y_train_stem,y_test_stem = train_test_split(x_stem,y_stem,random_state=42)

In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

vectorizer = CountVectorizer(analyzer=lambda x: x)
X_train_vect_s = vectorizer.fit_transform(X_train_stem)
X_test_vect_s = vectorizer.transform(X_test_stem)

rf_count_s = RandomForestClassifier()
rf_count_s.fit(X_train_vect_s,y_train)

In [30]:
model_score(rf_count_s,X_test_vect_s)

Accuracy: 0.6367614879649891
              precision    recall  f1-score   support

           0       0.66      0.41      0.50       207
           1       0.63      0.83      0.71       250

    accuracy                           0.64       457
   macro avg       0.64      0.62      0.61       457
weighted avg       0.64      0.64      0.62       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,84,123
Actual Men,43,207


In [None]:
param_grid = { 
    'n_estimators': [150,175,200,225,250,300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [None,1,2,4,10],
    'criterion' :['gini', 'entropy'],
    'min_samples_leaf':[1,2,4,10]
}
rfc=RandomForestClassifier(random_state=42)
rf_tuned_s = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5,verbose=0)
rf_tuned_s.fit(X_train_vect_s,y_train)


In [33]:
model_score(rf_tuned_s,X_test_vect_s)

Accuracy: 0.649890590809628
              precision    recall  f1-score   support

           0       0.69      0.41      0.51       207
           1       0.63      0.85      0.73       250

    accuracy                           0.65       457
   macro avg       0.66      0.63      0.62       457
weighted avg       0.66      0.65      0.63       457



Unnamed: 0,Predicted Women,Predicted Men
Actual Women,84,123
Actual Men,37,213
