# HHWeb

A network graph of rap artists' associations by shared phrases. These shared phrases could come about as a result of paying homage, intertextual allusion, or simple plagiarism.  Outside the realm of modern intellectual property, this is a common and well-accepted practice in blues music, a genre where rap music has deep roots.

I listen to a lot of rap music, and I've noticed this popping up.  It seemed like a fun way to jump into NLP. Really, something about juxtaposing AAVE and technical/academic language just tickles me.

This project will show the directionality of borrowed phrases, inferred by release date, and cluster artists around influential artists with the most sampled/borrowed phrases.

The length a phrase needs in order to be significant will be proportional to the number of times it appears.  This means that a short phrase must be highly unique to count as a link (to ensure it is not simply a common part of speech), while longer phrases can be shared, as the probability of their _not_ being attributable is much lower.

In [23]:
import glob
import random
import re
import socket
import time
import urllib
import urllib.request
from collections import defaultdict
from urllib.error import HTTPError

import networkx as nx
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Tor stuff, this might not all be needed
# TODO Clean it up later
import requests
import socks as pysocks
import stem
from stem import Signal
from stem.control import Controller
from stem.util import term

In [2]:
# This can only be done once, moving to its own frame for debugging
# Module-level stem Controller opened on port 9051; new_ident() authenticates
# against this handle and sends its signals through it.
controller = Controller.from_port(port = 9051)

In [13]:
def new_ident():
    """Ask for a new identity and route this process's sockets through the proxy.

    Authenticates against the module-level ``controller``, sends it the
    NEWNYM signal, makes the SOCKS5 proxy on 127.0.0.1:9050 the default for
    PySocks, and replaces ``socket.socket`` with the PySocks socket class.
    The address reported by icanhazip.com is printed, then the function
    sleeps for ten seconds before returning.
    """
    controller.authenticate()
    controller.signal(Signal.NEWNYM)

    # Replacing socket.socket is process-wide: everything that opens a socket
    # after this point goes through the proxy configured here.
    proxy_host, proxy_port = "127.0.0.1", 9050
    pysocks.setdefaultproxy(pysocks.PROXY_TYPE_SOCKS5, proxy_host, proxy_port, True)
    socket.socket = pysocks.socksocket

    reported_ip = requests.get('http://icanhazip.com').text
    print('Connecting from {}'.format(reported_ip))
    time.sleep(10)


# Smoke test: switch identity once so the reported address can be checked.
for i in range(1):
    new_ident()

Connecting from 185.117.215.9



In [12]:
# Drilling down through the directory structure with the scraper
def pull_lyrics(start=398, batch_step=1):
    """Scrape every sub page linked from the OHHLA index page.

    Switches identity via ``new_ident()``, downloads
    ``http://ohhla.com/all.html``, collects the links found under
    ``#leftmain table pre a`` and hands them to ``lyric_text`` in batches.

    Args:
        start: Index of the first sub page to process. Lets an interrupted
            run be resumed without editing the function; the default keeps
            the previously hard-coded resume point.
        batch_step: Number of sub pages handed to ``lyric_text`` per call.
    """
    new_ident()

    # Close the index response as soon as it has been parsed.
    with urllib.request.urlopen("http://ohhla.com/all.html") as u:
        soup = BeautifulSoup(u, 'html.parser')

    hrefs = (link.get('href') for link in soup.select('#leftmain table pre a'))
    sub_pages = ['http://ohhla.com/' + rel for rel in hrefs if rel]
    total = len(sub_pages)
    print(total, 'sub pages found.')

    for i in range(start, total, batch_step):
        lyric_text(sub_pages[i:i + batch_step])
        # i already advances by batch_step, so the number of pages finished
        # is i + batch_step (capped for a short final batch).
        print('{} OF {} COMPLETED'.format(min(i + batch_step, total), total))

# This didn't solve the issue, but will be useful once we get Tor working
# Break into a seperate function here to avoid time outs
def lyric_text(passed_pages):
    """Download the lyrics linked from each page in ``passed_pages``.

    For every distinct page URL, the links selected by ``body table tr a``
    (skipping the first five) are followed. Each linked page is searched for
    ``.txt`` links, which are passed to ``attempt_lyrics_save``. A link is
    first resolved relative to the artist URL; if that raises ``TypeError``
    or ``HTTPError`` it is retried relative to the site root.

    Args:
        passed_pages: Iterable of page URLs. Duplicates are processed once.

    Errors are reported with ``print`` and never propagate to the caller.
    """
    site_root = 'http://ohhla.com/'
    txt_link = re.compile(r"\.txt$")  # compiled once, reused for every page

    def save_txt_links(page_url, base_url):
        # Fetch page_url and save every .txt it links to, resolving each
        # link against base_url.
        page = urllib.request.urlopen(page_url)
        page_soup = BeautifulSoup(page, 'html.parser')
        for ly_txt in page_soup.findAll(href=txt_link):
            attempt_lyrics_save(ly_txt, base_url)

    # A set cancels out the duplication for artist pages
    for artist in set(passed_pages):
        try:
            sub_u = urllib.request.urlopen(artist)
            sub_soup = BeautifulSoup(sub_u, 'html.parser')
            lyrics_pages = sub_soup.select('body table tr a')
            for lyrics_page in lyrics_pages[5:]:
                href = lyrics_page.get('href')
                if not href:
                    # An anchor without a target has nothing to follow.
                    # Skipping here keeps the error messages below from
                    # referring to an unset or stale link.
                    continue

                ly_lnk = artist + href
                try:
                    save_txt_links(ly_lnk, ly_lnk)

                # Some of the artists are organized in an artist page, so
                # retry the same link relative to the site root.
                except (TypeError, HTTPError) as e:
                    print('Nonstandard page exception', e, artist, )
                    # This keeps pulling the same page multiple times, I think
                    # there are recursive links on these pages.  To optimize,
                    # keep track of which nonstandard pages have already been
                    # hit and skip them. They are causing a BIG slowdown.
                    ly_lnk = site_root + href
                    try:
                        save_txt_links(ly_lnk, site_root)
                    except (TypeError, HTTPError) as fallback_err:
                        print(ly_lnk + ' is not valid.  Skipping it.', fallback_err)

                except Exception as e:
                    print('A non TypeError has occured. ', e)

        except Exception as e:
            print('Artist: {}\n raised {}'.format(artist, e))
        # Randomised pause between artists
        time.sleep(1+(random.random()*2))


def attempt_lyrics_save(page_in, ly_lnk_in, final_run=False):
    """Fetch one lyrics page and write its ``<pre>`` content under ``lyrics/``.

    Args:
        page_in: Link element whose ``href`` names the lyrics file.
        ly_lnk_in: URL prefix that the ``href`` is appended to.
        final_run: When true, report the outcome and, if the page came back
            blank, switch identity and try again.

    The output file is named after the last path component of the ``href``.
    Nothing is written when the extracted text is empty.
    """
    page = page_in.get('href')
    response = urllib.request.urlopen(ly_lnk_in + page)
    pre_blocks = BeautifulSoup(response, 'html.parser').select('body div pre')
    # Drops the first and last seven characters of the stringified result,
    # which is meant to strip the surrounding pre tags (I know it's ugly).
    lyrics_cln = str(pre_blocks)[7:-7]
    filename = page.rsplit('/', 1)[-1]

    if not lyrics_cln:
        if final_run==True:
            # We're being denied now
            print('We are getting a blank lyrics page', ly_lnk_in+page)
            new_ident()
            attempt_lyrics_save(page_in, ly_lnk_in, final_run=True)
        return

    with open('lyrics/' + filename, 'w+') as out:
        out.write(lyrics_cln)
    if final_run==True:
        print('After identity switching, the lyrics length is {}'.format(len(lyrics_cln)))
# Run the scrape: switch identity, fetch the index, then walk every sub page.
pull_lyrics()



Connecting from 46.166.148.176

1077 sub pages found.
Artist: http://ohhla.com/anonymous/alkapone.html
 raised HTTP Error 404: Not Found
112 OF 1077 COMPLETED
113 OF 1077 COMPLETED
114 OF 1077 COMPLETED
115 OF 1077 COMPLETED
116 OF 1077 COMPLETED
117 OF 1077 COMPLETED
118 OF 1077 COMPLETED
119 OF 1077 COMPLETED
120 OF 1077 COMPLETED
121 OF 1077 COMPLETED
122 OF 1077 COMPLETED
123 OF 1077 COMPLETED
124 OF 1077 COMPLETED
125 OF 1077 COMPLETED
126 OF 1077 COMPLETED
127 OF 1077 COMPLETED
128 OF 1077 COMPLETED
129 OF 1077 COMPLETED
130 OF 1077 COMPLETED
131 OF 1077 COMPLETED
132 OF 1077 COMPLETED
133 OF 1077 COMPLETED
134 OF 1077 COMPLETED
135 OF 1077 COMPLETED
136 OF 1077 COMPLETED
137 OF 1077 COMPLETED
138 OF 1077 COMPLETED
139 OF 1077 COMPLETED
140 OF 1077 COMPLETED
141 OF 1077 COMPLETED
142 OF 1077 COMPLETED
143 OF 1077 COMPLETED
144 OF 1077 COMPLETED
145 OF 1077 COMPLETED
146 OF 1077 COMPLETED
147 OF 1077 COMPLETED
148 OF 1077 COMPLETED
149 OF 1077 COMPLETED
150 OF 1077 COMPLETED
151 O

KeyboardInterrupt: 

In [41]:
# TODO talk to Christopher about borrowing this https://github.com/cing/rapwords/blob/master/RapWordsTalk.ipynb
# It's MIT licensed, but it would be nice to reach out
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text content.

    Feed it HTML with ``feed()`` (and ``close()`` to flush any buffered
    text), then read the concatenated text with ``get_data()``.
    """

    def __init__(self):
        # Let the base class initialise its own state instead of copying its
        # internals by hand; convert_charrefs=True makes character references
        # such as "&amp;" reach handle_data() already decoded.
        super().__init__(convert_charrefs=True)
        # Kept for backward compatibility; nothing in this class reads it.
        self.strict = False
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

# Header patterns of a saved lyrics file. Raw strings keep "\s" a regex escape
# rather than an invalid string escape, and compiling once avoids re-parsing
# the patterns for every file.
_ARTIST_RE = re.compile(r'Artist:\s*(.*)\s*\n')
_SONG_RE = re.compile(r'Song:\s*(.*)\s*\n')
_LYRICS_RE = re.compile(r'Typed by:\s*(.*)\s*\n([\s\S]*)')
_PUNCTUATION_RE = re.compile(r'[!-/:-@[-`{-~]')
_NOT_ALNUM_OR_SPACE_RE = re.compile(r'[^a-zA-Z0-9\s]')


def parse_lyrics_text(raw_text):
    """Extract ``(artist, song, lyrics)`` from the text of one lyrics file.

    The artist and song come from the ``Artist:`` and ``Song:`` header lines;
    the lyrics are everything after the ``Typed by:`` line. Newlines and ASCII
    punctuation in the lyrics become spaces, and every field is then reduced
    to letters, digits and whitespace.

    Returns:
        A 3-tuple of cleaned strings, or ``None`` if any of the three headers
        is missing.
    """
    artist = _ARTIST_RE.search(raw_text)
    song = _SONG_RE.search(raw_text)
    lyrics = _LYRICS_RE.search(raw_text)
    if artist is None or song is None or lyrics is None:
        return None

    # group(1) of the lyrics match is the transcriber; group(2) is the body
    body = lyrics.group(2).replace('\n', ' ')
    body = _PUNCTUATION_RE.sub(' ', body)  # These tokens should be converted to spaces
    return (
        _NOT_ALNUM_OR_SPACE_RE.sub('', artist.group(1)),
        _NOT_ALNUM_OR_SPACE_RE.sub('', song.group(1)),
        _NOT_ALNUM_OR_SPACE_RE.sub('', body),
    )


# Column name -> list of values, one entry per successfully parsed file
df_data = defaultdict(list)
for filename in glob.iglob('lyrics/*.txt', recursive=True):
    with open(filename, 'r') as f:
        parsed = parse_lyrics_text(f.read())

    # Files missing any of the expected headers are left out of the table
    if parsed is None:
        continue

    artist, song, lyrics = parsed
    df_data["filename"].append(filename)
    df_data["artist"].append(artist)
    df_data["song"].append(song)
    df_data["lyrics"].append(lyrics)

In [42]:
# Preview the parsed songs as a DataFrame; the columns come from the df_data keys.
display(pd.DataFrame(df_data))

Unnamed: 0,artist,filename,lyrics,song
0,Big Sean f Kendrick Lamar Royce Da 59,lyrics/100.bsn.txt,Intro Chorus As I look up to the sky Thought...,100
1,Aesop Rock,lyrics/1000.rck.txt,I wear shoes to bed Paint teeth on my lips Tre...,1000 OClock
2,Apathy,lyrics/1000gram.apy.txt,Three two one contact Chorus x4 R...,1000 Grams
3,CyHi The Prynce f Big Sean Chris Brown,lyrics/100_bott.cyh.txt,Chorus We re in the mother fucking building ...,100 Bottles
4,9th Prince,lyrics/100_deg.9th.txt,Intro 9th Prince Yo yeah yeah I don t eve...,100 Degrees
5,Ace Hood,lyrics/100_for.ace.txt,I m sick of these niggas I m tired of these n...,100 Foreva
6,Boogie Down Productions,lyrics/100_guns.bdp.txt,One two three four KRS One s...,100 Guns
7,Big Sean f Rick Ross Pusha T Clipse,lyrics/100_keys.bsn.txt,Ay ay Chorus Big Sean I m from a big cit...,100 Keys
8,BirdmanBaby f DJ Khaled Dre Lil Wayne Rick Ros...,lyrics/100_mill.brd.txt,DJ Khaled Dis for the hood dhis for the ghe...,100 Million
9,Above the Law f PEE GEE,lyrics/100spoke.atl.txt,Intro Cold 187Um Man I m at the light man...,100 Spokes


In [None]:
# This is just for testing, before I pull in the corpus
big_ego = "Artist: Dr. Dre f/ Hitman Album: The Chronic 2001 Song: Big Ego's Typed by: OHHLA Webmaster DJ Flash [Dr. Dre] I got mo' class than most of em, ran wit the best of em Forgave the less of em, and blazed at the rest of em What can I say? Cal-i-for-ni-A Where niggaz die everyday over some shit they say Disconnected from the streets forever As long as I got a baretta, nigga, I'm down for whateva I roll wit my shit off safety - for niggaz that been hatin me lately and the bitches that wanna break me If Cali blew up, I'd be in the Aftermath Bumpin gangsta rap shit, down to blast for cash Cause from Eazy-E, to D.O.C., to D.P.G. started from that S.O.B., D.R.E. Like Dub-C I'm rich rollin, pistol holdin Pockets swoll nigga, that's how I'm rollin Put the flame to the killer nigga Worldwide homicide mob figure and a builder, for real I'm hittin switches, makin bitches eat bitches See me grab my dick everytime I pose for pictures I own acres, floor seats watchin The Lakers I'm cool with eses who got AK's in cases Dedicated to all of those with big ego's Never fakin, we get the dough and live legal Haters hate this, we sip the Mo' and yank the heezos 1 - Niggaz play this in they Rovers Jeeps and Regals 2 - Bitches play this in they Benzes Jeeps and Geos {repeat 2X} [Hitman] I bust a Mr. Toughy, slash a Smoothy Doobie Crash and flex on Tuesday's, harassin hoes at movies Passin by with uzis - and who you aimin at? 
That shady bitch and that bitch nigga that was claimin that Rat-ta-tat-tat {*automatic gunfire and screaming*} {*more screaming as tires peel out*} I don't sympathize for wack hoes and wimpy guys You got to recognize Hitman is a enterprise Cali pride, born to ride and South Centralized The Henny got me energized - smoke the guys tryin to focus on mines - poke they eyes out I'm L.A.'s loc'est - hope they don't have to find out the hard way like snitch niggaz in the pen that get hit when the guards look the other way We hittin HARD, Hitman and Dre You playin games, I suggest you know the rules We puttin guns to fools, make you run yo' jewels Take yo' honey and cruise to the snootiest snooze, Cabos Pop coochie til the nut oozes, you shouldn't fuck wit crews that's sick, Aftermath cause we rule shit I'm Big Hit, don't confuse me wit no other by the flow motherfucker Dedicated to all of those with big ego's Never fakin, we get the dough and live legal Haters hate this, we sip the Mo' and yank the heezos 1 - Niggaz play this in they Rovers Jeeps and Regals 2 - Bitches play this in they Benzes Jeeps and Geos {repeat 2X}"
tribe = "[Hook: Q-Tip] Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Well, I'm gone (Go on then!) [Verse 1: Q-Tip] Can I kick it? To all the people who can Quest like A Tribe does Before this, did you really know what live was? Comprehend to the track, for it's why cuz Gettin measures on the tip of the vibers Rock and roll to the beat of the funk fuzz Wipe your feet really good on the rhythm rug If you feel the urge to freak, do the jitterbug Come and spread your arms if you really need a hug Afrocentric living is a big shrug A life filled with fun that's what I love A lower plateau is what we're above If you diss us, we won't even think of Will Nipper the doggy give a big shove? This rhythm really fits like a snug glove Like a box of positives it's a plus, love As the Tribe flies high like a dove (Can I kick it?) [Hook: Phife Dawg] Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Can I kick it? (Yes, you can!) Well, I'm gone (Go on then!) [Verse 2: Phife Dawg] Can I kick it? To my Tribe that flows in layers Right now, Phife is a poem sayer At times, I'm a studio conveyor Mr. Dinkins, would you please be my mayor? You'll be doing us a really big favor Boy this track really has a lot of flavor When it comes to rhythms, Quest is your savior Follow us for the funky behavior Make a note on the rhythm we gave ya Feel free, drop your pants, check your ha-ir Do you like the garments that we wear? I instruct you to be the obeyer A rhythm recipe that you'll savor Doesn't matter if you're minor or major Yes, the Tribe of the game we're a player As you inhale like a breath of fresh air (Can I kick it?)"
sage = "Can I kick it? (yes you can) [x3] Well I'm gone (go on then) Can I kick it, to all my people who get wicked like Sage does before this did you know what my real name was Paul Francis acting like he's on the same drugs Never even felt the authects of a strange buzz You never ever catch me holding a beer mug Your talking shit like as if you was a real thug if that's true lick a shot BUCK feel the slug that's what you get for totin guns like you were Elmer Fudd I'm selling tapes for three bones wanna catch a dub? this shit is dope kid it makes you wanna cut the rug Illuminati's got every part of my body bugged the micro chip is in your wrist now give it a tug be nice to females, give a bitch a hug Triple X styles comin cleaner than your tub you better tell your girl about it because she's a scrub A big brow never had a nip in the bud droppin me her seven digits while i'm in the club talkin bout I look I need a back rub son she's a natural disaster like a flash flood i ain't playin dawg you better go test her blood until your positive she's negative don't make no love with or without a glove, you know what i'm speaking of the cub scouts try and jump into the briney shrubs behind the bush turn a back push into a shove what you thinkin tryin bring the underground above? AOI make you cry like a dove,for that shit,for that shit "
denance = "[Intro] Last year I was Dr-Drib- dribble down the court Dr-Drib- dribble down the court This year I'm kicking it I'ma kick it for like a motherfucking soccer ball [Verse 1] I'm crazy, I lost my mind I can't find it But that's OK, cause being normal's not a fucking option Cause if it was, then rap wouldn't be my main focus I'd have a 9-to-5, a wife that'll hang my clothes up I'd have a couple kids, a house to call my home, but Something crazy happened, rap became my home, yup! Every since the evidence became so relevant That I was meant to set mics on fire, I've been hesitant But that's over and I'm killing what the hell has sent If you have an issue 'bout who I say I'm better than You can try to write a song, diss me if you ever can But the only thing you got on me is this Eminem (chka chka) It's getting old, we don't share no pens So stop all these dumb accusations and comparisons We ain't nothing alike, we just white So what's the problem between us, that's causing this fight? [Hook] Can I kick it? 
(Yes, you can!)(x3) Now let me show the whole world that I ain't playing around (x2) [Verse 2] I need a U-Haul to carry this weight I bury the hate, inside of a very big crate Too scary to stay You better be very afraid I carry a cape, I'm Superman, American made You a fairy with a glare and it's gay You compare yourself to the best when you barely can slay I bring urgent care when I rap, don't you get carried away \"Son, sit down, get a job\", something your parents will say And when I eat MC's, that's really only an errand to me You ain't even half decent, boy/girl, you're half retarded You're like a turtle next to me, I'm an Aston Martin These kids are hopin' to cash out with the rappin' art when They realize 20 years down the road, they haven't started A career, then its clear that you in fact, are garbage So, please sit down or walk yourself inside of coffin Let the pros handle the hustle while you stand there stalking Hating on every move we make, hoping we don't reach stardom [Hook] Can I kick it? (Yes, you can!)(x3) Now let me show the whole world that I ain't playing around (x2)"
# Split the sample lyrics on whitespace into word lists, the form ngram() takes.
tribe = tribe.split()
sage = sage.split()
denance = denance.split()

# There's probably a library out there that does this, but where's the fun in that?
def ngram(n, lst):
    """Return every run of ``n`` consecutive items of ``lst`` as a tuple.

    Args:
        n: Number of words in each cluster.
        lst: Lyrics split into a list of words.

    Returns:
        A list of ``n``-tuples in order of appearance. The list is empty when
        ``n`` is less than 1 or greater than ``len(lst)``.
    """
    # Zip the list against copies of itself shifted by 0..n-1 positions. zip
    # stops at the shortest (most shifted) copy, so only complete windows are
    # produced and no padding marker is needed that could clash with a word.
    shifted = [lst[offset:] for offset in range(n)]
    return list(zip(*shifted))

In [None]:
# For each song, build a dict keyed by ngram size for the sizes in range(*rng)
def dict_ngram(lyrics, rng=None):
    """Map each size in ``range(*rng)`` to ``ngram(size, lyrics)``.

    Args:
        lyrics: Word list passed through to ``ngram``.
        rng: Argument tuple for ``range`` (for example ``(2, 8)``). It is
            unpacked as given, so it must be supplied.

    Returns:
        A dict from size to the value ``ngram`` returns for that size.
    """
    return {size: ngram(size, lyrics) for size in range(*rng)}
    
# print(dict_ngram(tribe, rng=(3, 8) ))
# print(dict_ngram(sage, rng=(3, 8) ))
# Per-size ngram tables for the two sample songs, keyed by the values of range(2, 8)
tribe_dict = dict_ngram(tribe, rng=(2, 8) )
sage_dict = dict_ngram(sage, rng=(2, 8) )

# TODO: If we are doing this with ngrams of multiple lengths,
# we don't want shorter ones that are a subset of the longer ones.
# Find a way to keep only the longest ngram
# def longest_ngram (dict_x, dict_y, rng)
#     rng = reversed(rng)
#     for i in rng:
#         print(i, set(tribe_dict[i]).intersection(sage_dict[i]))
    


In [None]:
# Testing out the graphing library
G = nx.Graph()
           
edges = []
for i in range (2,8):
    n = (set(tribe_dict[i]).intersection(sage_dict[i]))
    for edg in n:
        print('Edge for {}gram'.format(i), edg)
        edges.append(' '.join(edg))
        
print('All edges \n', edges)
for edg in edges:
#     G.add_edge('tribe', 'sage')
    G.add_edge('tribe', 'sage', lyric=' '.join(edg))
    
import matplotlib.pyplot as plt    
%matplotlib inline
nx.draw(G)