In [38]:
from rap_db import *
from rap_clean import *
from rap_viz import line, verse_graph
from nltk.stem import *
from nltk import pos_tag
#import nltk
#nltk.download()
from textblob import TextBlob
import pandas as pd
import numpy as np
from copy import copy

In [2]:
# Load the two artists being compared. art_load is called with a list of artist
# names and the result is indexed by that same name.
# NOTE(review): art_load is presumably supplied by one of the star imports above — confirm.
doom = art_load(['Doom'])['Doom']
chief = art_load(['Chief Keef'])['Chief Keef']

In [3]:
# Number of unique verses per artist, printed in order: Doom, then Chief Keef
for artist in (doom, chief):
    print(len(artist.uniq_art_verses))

112
148


In [4]:
class line_data():
    """Stemmed words and numeric metrics derived from a single lyric line.

    Parameters
    ----------
    in_line_obj : line-like object
        Must expose ``words_as_strings``, ``uniq_words_as_strings``,
        ``vowel_sounds`` and ``word_objs``; every word object must expose
        ``text`` and ``matches``. The line must contain at least one word,
        otherwise the per-word averages raise ``ZeroDivisionError``.

    Attributes
    ----------
    line_obj : the wrapped line object
    all_stemmed_words : list of str
        Stemmed, stop-word-free words in line order (duplicates kept).
    unique_stemmed_words : set of str
        The distinct members of ``all_stemmed_words``.
    metrics : dict of str -> float
        Per-line numeric features, see ``gen_line_metrics``.
    """

    def __init__(self, in_line_obj):
        self.line_obj = in_line_obj
        self.gen_line_stem()
        self.gen_line_metrics()

    def gen_line_stem(self):
        """Stem every word of the line and drop English stop words.

        Both a frequency view (``all_stemmed_words``, usable for a multinomial
        model) and a presence/absence view (``unique_stemmed_words``, usable
        for a Bernoulli model) are stored.
        """
        stemmer = SnowballStemmer("english")
        # Build the stop-word set once per line instead of re-reading the
        # stop-word list for every single word; set membership is also O(1).
        stop_words = set(stopwords.words('english'))
        # Stem each word exactly once (lazily), then filter.
        stems = (stemmer.stem(w.lower()) for w in self.line_obj.words_as_strings)
        # Empty stems are discarded, as are stems that are stop words.
        self.all_stemmed_words = [s for s in stems if s and s not in stop_words]
        self.unique_stemmed_words = set(self.all_stemmed_words)

    def gen_line_metrics(self):
        """Compute the numeric features of the line into ``self.metrics``.

        The metrics work on the raw (unstemmed) words.
        TODO: consider stemming here as well if model accuracy is poor.

        Vowel sounds are compared at three granularities: the full vowel
        string, its first two characters and its first character only
        (the ``ex``/``nr``/``brd`` prefixes — presumably exact/near/broad).
        """
        # Vowel strings for every word occurrence in the line
        ex_vwls = self.line_obj.vowel_sounds
        nr_vwls = [v[:2] for v in ex_vwls]
        brd_vwls = [v[:1] for v in ex_vwls]

        # Vowel strings counted once per distinct (case-insensitive) word.
        # Taken from the existing word objects so they are not rebuilt.
        seen = set()
        ex_vwls_uniqs = []
        for w in self.line_obj.word_objs:
            key = w.text.lower()
            if key not in seen:
                seen.add(key)
                # The vowel string is the second item of every match; a word
                # without any matches simply contributes no vowel sounds.
                ex_vwls_uniqs.extend(m[1] for m in w.matches)
        nr_vwls_uniqs = [v[:2] for v in ex_vwls_uniqs]
        brd_vwls_uniqs = [v[:1] for v in ex_vwls_uniqs]

        # These are used a lot
        wrds = self.line_obj.words_as_strings
        unq_wrds = self.line_obj.uniq_words_as_strings
        wrd_cnt = len(wrds)
        unq_wrd_cnt = len(unq_wrds)
        blobs = TextBlob(" ".join(wrds)).sentiment

        # Distinct part-of-speech tags (second item of every tagged pair)
        pos_tags = {pair[1] for pair in pos_tag(wrds)}
        unq_pos_tags = {pair[1] for pair in pos_tag(unq_wrds)}

        # NOTE: key order matters, it becomes the column order downstream.
        self.metrics = {
            # word based metrics
            'avg_wrd_len': sum(map(len, wrds)) / wrd_cnt,
            'avg_unq_wrd_len': sum(map(len, unq_wrds)) / unq_wrd_cnt,
            'unq_wrds_rat': unq_wrd_cnt / wrd_cnt,

            # vowel based metrics
            # average vowel sounds per word
            'avg_wrd_vwls': len(ex_vwls) / wrd_cnt,
            # average vowel sounds per unique word
            'avg_unq_wrd_vwls': len(ex_vwls_uniqs) / unq_wrd_cnt,
            # average unique vowel sounds per word
            'avg_wrd_brd_unq_vwls': len(set(brd_vwls)) / wrd_cnt,
            'avg_wrd_nr_unq_vwls': len(set(nr_vwls)) / wrd_cnt,
            'avg_wrd_ex_unq_vwls': len(set(ex_vwls)) / wrd_cnt,
            # average unique vowel sounds per unique word
            'avg_unq_wrd_brd_unq_vwls': len(set(brd_vwls_uniqs)) / unq_wrd_cnt,
            'avg_unq_wrd_nr_unq_vwls': len(set(nr_vwls_uniqs)) / unq_wrd_cnt,
            'avg_unq_wrd_ex_unq_vwls': len(set(ex_vwls_uniqs)) / unq_wrd_cnt,

            # specialized metrics
            'pol': blobs.polarity,
            'subj': blobs.subjectivity,
            'uniq_pos_rat': len(pos_tags) / wrd_cnt,
            'uniq_pos_unq_wrd_rat': len(unq_pos_tags) / unq_wrd_cnt,
        }

In [5]:
def art_to_verse_graph(art_obj, inp_pop=False, inp_exc_line=True, inp_opto_type='near'):
    """Build one matched verse_graph for every unique verse of an artist.

    Walks ``art_obj.songs`` and each song's ``uniq_art_verses`` in order,
    wraps every verse in a ``verse_graph`` (tagged with the artist and song
    name) and runs ``opto_matches`` on it with ``record=False``.

    Parameters
    ----------
    art_obj : object exposing ``songs`` and ``name``
    inp_pop, inp_exc_line, inp_opto_type :
        Forwarded to ``opto_matches`` as ``pop``, ``exc_line`` and
        ``opto_type`` respectively.

    Returns
    -------
    list of verse_graph, in song-then-verse order.
    """
    opto_kwargs = dict(pop=inp_pop, exc_line=inp_exc_line,
                       opto_type=inp_opto_type, record=False)
    song_verse_pairs = ((song, verse)
                        for song in art_obj.songs
                        for verse in song.uniq_art_verses)
    graphs = []
    for song, verse in song_verse_pairs:
        graph = verse_graph(verse, art_obj.name, song.name)
        graph.opto_matches(**opto_kwargs)
        graphs.append(graph)
    return graphs

In [6]:
def verse_graph_to_lines(verse_graph_obj):
    """Wrap every non-empty line of a verse graph in a ``line_data`` object.

    Lines whose ``word_objs`` is empty are skipped. The result keeps the
    order of ``verse_graph_obj.ver_as_lines``.
    """
    return [line_data(verse_line)
            for verse_line in verse_graph_obj.ver_as_lines
            if verse_line.word_objs]

In [31]:
# Line counts per artist: flatten every verse graph into its line_data objects.
# NOTE(review): the two artists are run with different opto settings — confirm this is intentional.
all_doom_lines = []
for verse_g in art_to_verse_graph(doom, inp_pop=2, inp_exc_line=False, inp_opto_type='exact'):
    all_doom_lines.extend(verse_graph_to_lines(verse_g))
print(len(all_doom_lines))

all_chief_lines = []
for verse_g in art_to_verse_graph(chief, inp_pop=False, inp_exc_line=True, inp_opto_type='near'):
    all_chief_lines.extend(verse_graph_to_lines(verse_g))
len(all_chief_lines)

2873


2012

In [43]:
def create_line_data(lines, art_name):
    """Build the metric and linguistic DataFrames for one artist's lines.

    Parameters
    ----------
    lines : sequence of line_data-like objects
        Each must expose ``metrics`` (dict), ``all_stemmed_words`` and
        ``unique_stemmed_words``. All lines are expected to share the metric
        keys of the first line, which define the metric column order.
    art_name : str
        Label written to the ``artist`` column of every row.

    Returns
    -------
    (metric_df, linguistic_df) : tuple of pandas.DataFrame
        ``metric_df`` has one column per metric followed by ``artist``.
        ``linguistic_df`` has the columns ``artist``, ``text`` (list of
        stemmed words) and ``unique_text`` (set of stemmed words).
        For an empty ``lines`` both frames are empty; the metric frame then
        only carries the ``artist`` column because no metric names are known.
    """
    linguistic_cols = ['artist', 'text', 'unique_text']
    if not lines:
        return pd.DataFrame(columns=['artist']), pd.DataFrame(columns=linguistic_cols)

    metric_cols = list(lines[0].metrics.keys()) + ['artist']

    # Collect plain records and build each frame once: growing a DataFrame row
    # by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    # dict(...) makes a copy, so the line's own metrics dict is not modified.
    metric_rows = [dict(l.metrics, artist=art_name) for l in lines]
    linguistic_rows = [{'artist': art_name,
                        'text': l.all_stemmed_words,
                        'unique_text': l.unique_stemmed_words}
                       for l in lines]

    metric_df = pd.DataFrame(metric_rows, columns=metric_cols)
    linguistic_df = pd.DataFrame(linguistic_rows, columns=linguistic_cols)
    return metric_df, linguistic_df

In [46]:
# Build the metric frame (me) and the stemmed-text frame (li) from Doom's lines, labelled 'MF Doom'
me, li = create_line_data(all_doom_lines, 'MF Doom')

In [47]:
# Preview the first rows of the per-line metric frame
me.head()

Unnamed: 0,avg_wrd_len,avg_unq_wrd_len,unq_wrds_rat,avg_wrd_vwls,avg_unq_wrd_vwls,avg_wrd_brd_unq_vwls,avg_wrd_nr_unq_vwls,avg_wrd_ex_unq_vwls,avg_unq_wrd_brd_unq_vwls,avg_unq_wrd_nr_unq_vwls,avg_unq_wrd_ex_unq_vwls,pol,subj,uniq_pos_rat,uniq_pos_unq_wrd_rat,artist
0,3.625,3.625,1.0,1.375,1.375,0.5,0.75,0.875,0.5,0.75,0.875,-0.05,0.05,0.875,0.625,MF Doom
1,3.083333,3.083333,1.0,1.083333,1.083333,0.333333,0.5,0.5,0.333333,0.5,0.5,0.35,0.65,0.916667,0.75,MF Doom
2,3.875,4.285714,0.875,1.25,1.285714,0.5,0.875,0.875,0.571429,1.0,1.0,0.0,0.0,0.5,0.714286,MF Doom
3,3.4,3.4,1.0,1.2,1.2,0.4,0.6,0.7,0.4,0.6,0.7,0.0,0.0,0.8,0.8,MF Doom
4,4.111111,4.111111,1.0,1.222222,1.222222,0.333333,0.777778,0.888889,0.333333,0.777778,0.888889,1.0,0.3,0.666667,0.666667,MF Doom


In [48]:
# Preview the first rows of the stemmed-text frame
li.head()

Unnamed: 0,artist,text,unique_text
0,MF Doom,"[excus, mister, got, sister]","{sister, mister, excus, got}"
1,MF Doom,"[kiss, true, got, blister]","{kiss, got, true, blister}"
2,MF Doom,"[movi, plot, twist, like, twistler]","{twistler, like, twist, movi, plot}"
3,MF Doom,"[need, meat, burn, id, go, sizzler]","{need, sizzler, burn, id, meat, go}"
4,MF Doom,"[get, paid, like, biker, best, crank]","{paid, biker, like, best, crank, get}"


What you did
both items below could still use more work
built stemmer
built full metrics method

What you need to do next

Notes

Long term
Train models using two different training methodologies
1. text bag of words (simply look at the words in the text and classify using naive Bayes, random forest, or SVM)
2. make a row for every line based on the whiteboarded linguistic measures (definitely use SVM, maybe random forest, maybe KNN)