# Lyrics Classification Challenge

In [24]:
import random

class Genre:
    """String labels for the genres used in this notebook.

    The members are plain ``str`` constants, so they compare equal to raw
    label strings.
    """

    HIPHOP = 'Hip Hop'
    ROCK = 'Rock'
    POP = 'Pop'
    # Label used for any genre other than the three above.
    UNKNOWN = 'unknown'
    # Misspelled original name, kept as an alias so existing references keep working.
    UNKOWN = UNKNOWN

class Lyric:
    """A song's lyric text paired with its raw genre label."""

    def __init__(self, words: str, genre: str) -> None:
        """Store the lyric text and the genre label exactly as given."""
        self.words = words
        self.genre = genre

    def __repr__(self) -> str:
        # Shows only the first five characters of the lyric text.
        return f'Lyric({self.words[0:5]}, {self.genre})'

    def get_genre(self) -> str:
        """Return the ``Genre`` constant that matches this lyric's label.

        Matching is exact (case- and whitespace-sensitive). Any label other
        than the Hip Hop, Rock or Pop constants maps to ``Genre.UNKOWN``.
        The return value is one of the ``Genre`` string constants, hence the
        ``str`` annotation.
        """
        # Compare against the Genre constants rather than repeating their literals.
        for known in (Genre.HIPHOP, Genre.ROCK, Genre.POP):
            if self.genre == known:
                return known
        return Genre.UNKOWN

# This lyric container was originally going to be used to give the model an even number of rock, pop, and hip hop songs;
# that's why the evenlyDistribute function is defined.
class LyricContainer:
    """A shuffled collection of lyric objects with bulk accessors."""

    def __init__(self, lyrics) -> None:
        """Copy ``lyrics`` into the container and shuffle the copy.

        Copying first leaves the caller's sequence in its original order and
        lets any iterable be passed in.
        """
        self.lyrics = list(lyrics)
        random.shuffle(self.lyrics)

    def get_words(self) -> list:
        """Return the lyric text of every item, in container order."""
        return [lyric.words for lyric in self.lyrics]

    def get_genre(self) -> list:
        """Return the normalized genre of every item, in container order."""
        return [lyric.get_genre() for lyric in self.lyrics]

    def evenlyDistribute(self) -> None:
        """Downsample to equally many Hip Hop, Pop and Rock lyrics, then reshuffle.

        Every genre is cut down to the size of the smallest of the three, so
        the result is balanced whichever genre is the rarest. Lyrics of any
        other genre are dropped.
        """
        groups = [
            [lyric for lyric in self.lyrics if lyric.get_genre() == genre]
            for genre in (Genre.HIPHOP, Genre.POP, Genre.ROCK)
        ]
        # Use the smallest class as the cap; capping at the Hip Hop count only
        # balances the data when Hip Hop happens to be the smallest class.
        keep = min(len(group) for group in groups)

        self.lyrics = [lyric for group in groups for lyric in group[:keep]]
        random.shuffle(self.lyrics)



## Load data


In [25]:
import pandas as pd

# Forward slash instead of 'data\data.csv': the backslash form relied on the
# invalid escape sequence '\d' and only resolved as a path on Windows.
df = pd.read_csv('data/data.csv')

# Assumes column 0 holds the lyric text and column 1 the genre label — TODO confirm against the CSV.
unfiltered_lyrics = [Lyric(row[0], row[1]) for row in df.values.tolist()]

# Lyrics whose label is not Hip Hop, Rock or Pop.
unk_lyrics = [lyric for lyric in unfiltered_lyrics if lyric.get_genre() == Genre.UNKOWN]

# Plain filter instead of a set difference: Lyric objects hash by identity, so
# the set removed no duplicates and left `lyrics` in an order that can change
# from run to run, which made the seeded train/test split non-reproducible.
lyrics = [lyric for lyric in unfiltered_lyrics if lyric.get_genre() != Genre.UNKOWN]



## Prep Data


In [26]:
# Per the original note: about 40000 lyrics go to training, 10000 are held out
# for the accuracy estimate, and the 5000 in unk_lyrics are the unknown-genre
# lyrics to predict.
# NOTE(review): these counts are not checked anywhere in the code — confirm against the data.
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled lyrics, using a fixed seed for the split.
training, test = train_test_split(lyrics, test_size=0.2, random_state=42)

# LyricContainer shuffles each split again when it is constructed.
train_cont = LyricContainer(training)
test_cont = LyricContainer(test)

In [27]:
# Lyric text (x) and normalized genre labels (y) for the training split.
train_x, train_y = train_cont.get_words(), train_cont.get_genre()

# The same pair for the held-out split.
test_x, test_y = test_cont.get_words(), test_cont.get_genre()


In [28]:
# Unknown-genre lyrics: their text, and their genre labels exactly as stored.
unk_x = [lyric.words for lyric in unk_lyrics]
unk_y = [lyric.genre for lyric in unk_lyrics]

## Bag of Words Vectorization

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the lyric text into bag-of-words count vectors for the model.

vectorizer = CountVectorizer()
# Fit the vocabulary on the training text only and vectorize it in one step.
train_x_vectors = vectorizer.fit_transform(train_x)

# Vectorize the held-out text with the vocabulary fitted above.
test_x_vectors = vectorizer.transform(test_x)

# train_x_vectors and train_y are the data the model is fitted on.

In [30]:
# Vectorize the unknown-genre lyrics with the already-fitted vectorizer.
unk_x_vectors = vectorizer.transform(unk_x)

# unk_x_vectors is the data the trained model will predict on.

## Classification

### Linear SVC


In [33]:
from sklearn.svm import LinearSVC

# Linear SVM with an L1 penalty and C=0.1.
# NOTE(review): dual=False is presumably required because scikit-learn only
# supports the L1 penalty in the primal formulation — confirm in the LinearSVC docs.
clf_svc = LinearSVC(penalty='l1', dual=False, C=0.1)
# Fit on the training count vectors and their genre labels.
clf_svc.fit(train_x_vectors, train_y)

LinearSVC(C=0.1, dual=False, penalty='l1')

## Evaluation

In [34]:
from sklearn.metrics import accuracy_score

# Predict the held-out split and compare against its true genre labels.
pred = clf_svc.predict(test_x_vectors)
estimated_accuracy = accuracy_score(test_y, pred)
# Write the single accuracy value to ea.csv with no index column and no header.
pd.Series(estimated_accuracy).to_csv('ea.csv', index=False, header=False)

## Predicting the Testing Set

In [35]:
# Predict a genre for each unknown-genre lyric.
pred = clf_svc.predict(unk_x_vectors)
# Series.to_csv returns None when given a path, so its result is not assigned
# back to `pred`; doing so would replace the predictions with None.
pd.Series(pred).to_csv('pred.csv', index=False, header=False)