In [1]:
# Imports and one-time setup for the lyric genre-classification notebook.
# NOTE(review): numpy, sklearn (top-level) and plt are not referenced in the
# cells visible in this export — confirm they are needed before keeping them.
import numpy as np
import pandas as pd # for loading data
import sklearn 
import matplotlib.pyplot as plt
import re # for removing special characters
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance; the lyric-cleaning cell calls
# wordnet_lemmatizer.lemmatize(...) on every token.
wordnet_lemmatizer = WordNetLemmatizer()
import nltk
# Make sure the NLTK resources used below are installed locally:
# 'wordnet' for the lemmatizer, 'stopwords' for the English stopword list.
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords # For removing stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Render matplotlib figures inside the notebook output.
%matplotlib inline

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicktehrany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw song table and keep only the columns used for modelling;
# the 'Singer', 'Date' and 'Tags' columns are discarded straight away.
df = pd.read_csv("dataset.csv").drop(columns=['Singer', 'Date', 'Tags'])

## Cleaning Lyrics (Lemmatizing, etc.) (Only for the first 14,400 songs right now)

In [3]:
# Normalise the lyrics of (at most) the first 14,400 songs and write the
# cleaned text back into df['Lyrics'].
#
# Per song: replace every non-ASCII-letter character with a space, lowercase,
# split on whitespace, drop English stopwords, and lemmatise the remaining
# tokens as verbs. Tokens are re-joined with single spaces (no trailing space,
# which does not change whitespace-based tokenisation downstream).
max_songs = 14400
# Clamp to the frame length so a smaller dataset does not raise a KeyError.
n_songs = min(max_songs, len(df))

sw = stopwords.words("english")
# Build the lookup set once; the membership test runs for every token.
stop_words = set(sw)

corpus = []
for raw_lyrics in df['Lyrics'].iloc[:n_songs]:
    # str() keeps the original handling of non-string cells (e.g. NaN -> 'nan').
    tokens = re.sub('[^a-zA-Z]', ' ', str(raw_lyrics)).lower().split()
    lemmas = [
        wordnet_lemmatizer.lemmatize(token, pos="v")
        for token in tokens
        if token not in stop_words
    ]
    corpus.append(" ".join(lemmas))
lyrics = corpus

# One positional assignment instead of the chained df['Lyrics'][i] = ...,
# which triggers SettingWithCopyWarning and is not guaranteed to modify df.
df.iloc[:n_songs, df.columns.get_loc('Lyrics')] = corpus

## Partitioning the dataset into test and train sets

In [4]:
# Use the first 14,400 songs (the rows cleaned above) and hold out 15% of
# them for evaluation.
lyrics = df['Lyrics'].iloc[:14400]
genre = df['Genre'].iloc[:14400]
lyrics_train, lyrics_test, genre_train, genre_test = train_test_split(
    lyrics,
    genre,
    train_size=0.85,
    test_size=0.15,
    shuffle=True,
    # Fixed seed: without it every run shuffles differently, so the split and
    # all reported scores change between executions.
    random_state=42,
)

In [5]:
# Turn the cleaned lyrics into TF-IDF feature matrices.
# The vocabulary is learned from the training lyrics only (min_df=4 is the
# minimum document frequency for a term to be kept), and the same fitted
# vectorizer is then applied to the test lyrics.
vectorizer = TfidfVectorizer(min_df=4)

train_sparse = vectorizer.fit_transform(lyrics_train)
train_data_tfid = train_sparse.toarray()  # dense array of TF-IDF weights

test_sparse = vectorizer.transform(lyrics_test)
test_data = test_sparse.toarray()

In [6]:
# Genre classification with LogisticRegression (default hyper-parameters):
# fit on the training TF-IDF features, predict the held-out songs, and print
# the evaluation scores.
lgr_clf = LogisticRegression().fit(train_data_tfid, genre_train)
predictions = lgr_clf.predict(test_data)

# NOTE(review): the recorded output shows the three micro-averaged scores
# equal to the accuracy, so they add no extra information here — consider
# 'macro' or 'weighted' averaging if per-class behaviour matters.
micro_reports = (
    ('Precision score = {}', precision_score),
    ('Recall score = {}', recall_score),
    ('F1 score = {}', f1_score),
)

print('Evaluation using LogisticRegression Classifer')
print('Accuracy score = {}'.format(accuracy_score(genre_test, predictions)))
for template, metric in micro_reports:
    print(template.format(metric(genre_test, predictions, average='micro')))
print('---------------------------------------------------')



Evaluation using LogisticRegression Classifer
Accuracy score = 0.5162037037037037
Precision score = 0.5162037037037037
Recall score = 0.5162037037037037
F1 score = 0.5162037037037037
---------------------------------------------------
