In [12]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import genesis
nltk.download('genesis')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
genesis_ic = wn.ic(genesis, False, 0.0)

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import roc_auc_score


[nltk_data] Downloading package genesis to
[nltk_data]     /Users/thomaswierda/nltk_data...
[nltk_data]   Unzipping corpora/genesis.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thomaswierda/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thomaswierda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/thomaswierda/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [56]:
class KNN_NLC_Classifer():
    def __init__(self, k=1, distance_type = 'path'):
        self.k = k
        self.distance_type = distance_type

    # This function is used for training
    def fit(self, x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train

    # This function runs the K(1) nearest neighbour algorithm and
    # returns the label with closest match. 
    def predict(self, x_test):
        self.x_test = x_test
        y_predict = []

        for i in range(len(x_test)):
            max_sim = 0
            max_index = 0
            for j in range(self.x_train.shape[0]):
                temp = document_similarity(self,x_test[i], self.x_train[j])
                if temp > max_sim:
                    max_sim = temp
                    max_index = j
            y_predict.append(self.y_train[max_index])
        return y_predict

In [57]:
def convert_tag(self, tag):
        """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
        tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
        try:
            return tag_dict[tag[0]]
        except KeyError:
            return None


def doc_to_synsets(self, doc):
    """
        Returns a list of synsets in document.
        Tokenizes and tags the words in the document doc.
        Then finds the first synset for each word/tag combination.
        If a synset is not found for that combination it is skipped.

        Args:
            doc: string to be converted

        Returns:
            list of synsets
     """
    tokens = word_tokenize(doc+' ')
        
    l = []
    tags = nltk.pos_tag([tokens[0] + ' ']) if len(tokens) == 1 else nltk.pos_tag(tokens)
        
    for token, tag in zip(tokens, tags):
        syntag = self.convert_tag(tag[1])
        syns = wn.synsets(token, syntag)
        if (len(syns) > 0):
             l.append(syns[0])
    return l 

In [58]:
def similarity_score(self, s1, s2, distance_type = 'path'):
        """
          Calculate the normalized similarity score of s1 onto s2
          For each synset in s1, finds the synset in s2 with the largest similarity value.
          Sum of all of the largest similarity values and normalize this value by dividing it by the
          number of largest similarity values found.

          Args:
              s1, s2: list of synsets from doc_to_synsets

          Returns:
            normalized similarity score of s1 onto s2
        """
        s1_largest_scores = []

        for i, s1_synset in enumerate(s1, 0):
            max_score = 0
            for s2_synset in s2:
                if distance_type == 'path':
                      score = s1_synset.path_similarity(s2_synset, simulate_root = False)
                else:
                      score = s1_synset.wup_similarity(s2_synset)                  
                if score != None:
                    if score > max_score:
                          max_score = score
              
            if max_score != 0:
                s1_largest_scores.append(max_score)
          
        mean_score = np.mean(s1_largest_scores)
                 
        return mean_score  

In [59]:
def document_similarity(self,doc1, doc2):
    """Finds the symmetrical similarity between doc1 and doc2"""
    synsets1 = self.doc_to_synsets(doc1)
    synsets2 = self.doc_to_synsets(doc2)
          
    return (self.similarity_score(synsets1, synsets2) + self.similarity_score(synsets2, synsets1)) / 2

In [60]:

FILENAME = '/Users/thomaswierda/Documents/Study/Machine learning/dataset.csv'
df = pd.read_csv(FILENAME)
df = df.drop(['Singer','Tags'], axis=1)
genres = []

for i, value in df.iterrows():
    genre = df['Genre'][i]
    index = 0
    if genre == '[\'Pop\']': index = 0
    elif genre == '[\'Rock\']': index = 1
    elif genre == '[\'Hip-Hop/Rap\']': index = 2
    elif genre == '[\'Country\']': index = 3
    elif genre == '[\'R&B/Soul\']': index = 4
    elif genre == '[\'Metal\']': index = 5
    elif genre == '[\'Alternative/Indie\']': index = 6
    elif genre == '[\'Folk\']': index = 7
    genres.append(index)

dataset['output'] = genres
Num_Words = dataset.shape[0]

print(dataset.head())
print("\nSize of input file is ", dataset.shape)

        Singer                            Song                  Genre  \
0  $UICIDEBOY$                  40 Oz. & Sober  ['Alternative/Indie']   
1  $UICIDEBOY$           Drugs/Hoes/Money/Etc.        ['Hip-Hop/Rap']   
2  $UICIDEBOY$  I’ll Pay for It (If I Want It)        ['Hip-Hop/Rap']   
3  $UICIDEBOY$        Leave Your Things Behind        ['Hip-Hop/Rap']   
4  $UICIDEBOY$              Rag Round My Skull        ['Hip-Hop/Rap']   

                                              Lyrics                    Tags  \
0  \n\n\nKiller, killer, killer\nWhat the fuck yo...  ['Alternative', 'Rap']   
1  \n\n\nCounting sheep until I'm sound asleep\nI...                 ['Rap']   
2  \n\n\nFucking Ruby got a blunt with a flame at...                 ['Rap']   
3  \n\n\nAre you in or out?\nAre you in or out?\n...                 ['Rap']   
4  \n\n\nDrugs in my veins with two hoes on my la...                 ['Rap']   

   output  
0       6  
1       2  
2       2  
3       2  
4       2  

Size of

In [61]:
import re 
# 4. Train the Classifier
classifier = KNN_NLC_Classifer(k=1, distance_type='path')
lyrics = df['Lyrics']
genre = df['Genre']
lyrics_train, lyrics_test, genre_train, genre_test = train_test_split(lyrics, genre, train_size = 0.85, test_size = 0.15, shuffle=True)


classifier.fit(lyrics_train, genre_train)


y_pred_final = classifier.predict(lyrics_test)

output_df = pd.DataFrame(data = {'text': final_test_list, 'code': y_pred_final})
print(output_df)

KeyError: 0