In [1]:
def tokenize(lyric):
    lyric = lyric.lower() # converte para maiúsculas
    all_words = re.findall("[a-z0-9']+", lyric) # extrai as palavras
    return set(all_words) # set() remove palavras duplicadas

def count_words(training_set):
    """o conjunto em treinamento consiste de pares (lyric, genre)"""
    counts = defaultdict(lambda: [0,0,0,0])
    for lyric, genre in training_set:
        for word in tokenize(lyric):
            counts[word][genre] += 1
    return counts

def word_probabilities(counts, total_bossa, total_funk, total_gospel, total_sertanejo, k=0.5):
    """transforma o word_counts em uma lista de quinplas w, p(w|bossa), p(w|funk), p(w|gospel) e p(w|sert) """
    return [(w,
             (bossa + k) / (total_bossa + 2 * k),
             (funk + k) / (total_funk + 2 * k),
             (gospel + k) / (total_gospel + 2 * k),
             (sertanejo + k) / (total_sertanejo + 2 * k))
        for w, (bossa, funk, gospel, sertanejo) in counts.iteritems()]

import math

def genre_probability(word_probs, message):
    message_words = tokenize(message)
    log_prob_if_bossa = log_prob_if_funk = log_prob_if_sertanejo = log_prob_if_gospel = 0.0
    
    # iterate through each word in our vocabulary
    for word, prob_if_bossa, prob_if_funk, prob_if_sertanejo, prob_if_gospel in word_probs:
        # if *word* appears in the message,
        # add the log probability of seeing it
        if word in message_words:
            log_prob_if_bossa += math.log(prob_if_bossa)
            log_prob_if_funk += math.log(prob_if_funk)
            log_prob_if_sertanejo += math.log(prob_if_sertanejo)
            log_prob_if_gospel += math.log(prob_if_gospel)
        # if *word* doesn't appear in the message
        # add the log probability of _not_ seeing it
        # which is log(1 - probability of seeing it)
        else:
            log_prob_if_bossa += math.log(1.0 - prob_if_bossa)
            log_prob_if_funk += math.log(1.0 - prob_if_funk)
            log_prob_if_sertanejo += math.log(1.0 - prob_if_sertanejo)
            log_prob_if_gospel += math.log(1.0 - prob_if_gospel)
    prob_if_bossa = math.exp(log_prob_if_bossa)
    prob_if_funk = math.exp(log_prob_if_funk)
    prob_if_sertanejo = math.exp(log_prob_if_sertanejo)
    prob_if_gospel = math.exp(log_prob_if_gospel)
    prob_total = prob_if_bossa + prob_if_funk + prob_if_sertanejo + prob_if_gospel
    return [prob_if_bossa / prob_total, prob_if_funk / prob_total, prob_if_sertanejo / prob_total, prob_if_gospel / prob_total]

class NaiveBayesClassifier:
    
    # construtor da classe
    def __init__(self,genre='bossa', k=0.5):
        self.k = k
        self.genre = genre
        self.word_probs = []
        
    def train(self, training_set):
        
        # conta letras de cada genero
        num_bossa = len([is_bossa for lyric, is_bossa in training_set if is_bossa])
        num_funk = len([is_funk for lyric, is_funk in training_set if is_funk])
        num_gospel = len([is_gospel for lyric, is_gospel in training_set if is_gospel])
        num_sertanejo = len([is_sertanejo for lyric, is_sertanejo in training_set if is_sertanejo])
        
        # roda dados de treinamento pela nossa "pipeline"
        word_counts = count_words(training_set)
        self.word_probs = word_probabilities(word_counts, num_bossa, num_funk, num_gospel, num_sertanejo, self.k)
        
    def classify(self, lyric):
        return genre_probability(self.word_probs, lyric)
    


In [2]:
import glob, re
# modify the path with wherever you've put the files
path = r"H:\datasets\lyrics\*"
data = []
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
    lyric = ''
    genre = 0 if "bossa" in fn else 1 if "funk" in fn else 2 if "sertanejo" in fn else 3 if "gospel" in fn else -1
    is_spam = "ham" not in fn
    
    with open(fn,'r') as file:
        for line in file:
            if line.startswith("lyric"):
                continue
            elif line.startswith('"'):
                if not lyric:
                    continue
                else:
                    data.append((lyric, genre))
                    lyric = ''
            else:
                lyric += line.replace('"', '').replace("'", "").rstrip().lstrip() + ' '

In [3]:
#data

In [4]:
import random
from collections import defaultdict

def split_data(data,prob):
    """ divide os dados em fracoes [prob, 1 - prob] """
    result = [],[]
    for row in data: result[0 if random.random() < prob else 1].append(row)
    return result

In [5]:
random.seed(0)
train_data, test_data = split_data(data,0.75)

In [6]:
# criando e treinando um classificador
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [8]:
print classifier.classify('Não conte os teus maiores sonhos a ninguém Não mostre a sua ferida para quem não tem Remédio pra curá-la e forças para te erguer Não não não não')

[8.708203795945177e-06, 2.0101258992045952e-05, 0.009767214986067002, 0.990203975551145]
