# Profanity Extraction Notebook
Extract counts of profanity per decade and the first mention of each offensive word. The output of this notebook will be used for Tableau visualizations.

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [3]:
# Input root: default directory prefix for reading vocabulary JSON files.
# Paths are built by plain string concatenation, so the trailing slash matters.
root_in = "../../data/conditioned/corpus_vocabs/"
# Output root: default directory prefix for the CSV files written by this
# notebook. The trailing slash matters here too.
root_out = "../../viz/data/"

In [4]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """Create the directory portion of ``path`` if it does not exist yet.

    Only ``os.path.dirname(path)`` is created, so a directory path must end
    with a separator (e.g. ``"out/data/"``) for its last component to be made.

    Parameters
    ----------
    path : str
        A file path, or a directory path ending with a path separator.

    Raises
    ------
    OSError
        If the directory cannot be created and does not already exist.
    """
    d = os.path.dirname(path)
    if not d:
        # A bare filename has no directory component; os.makedirs("") would raise.
        return
    try:
        os.makedirs(d)
    except OSError:
        # An already-existing directory (possibly created by someone else in
        # the meantime) is fine; anything else is a genuine failure.
        if not os.path.isdir(d):
            raise

In [5]:
# make sure key directories exist
# (both roots end in "/", so the full directory path is what gets created)
assureDirExists(root_in)
assureDirExists(root_out)

## Load Profanity Words
**These are the words with a more directly offensive meaning that are used 5 or more times throughout the corpus.**

In [6]:
# load the offensive-word dataframe pulled from the master
# (this notebook reads its `word` column below)
offensivedf = pd.read_csv("../../data/conditioned/noun_n-gram_offensive.csv")  

In [7]:
# sanity check: (rows, columns) of the offensive-word table
offensivedf.shape

(20, 2)

In [8]:
# preview up to the first 20 rows of the offensive-word table
offensivedf.head(20)

Unnamed: 0,count,word
0,143,shit
1,134,bitch
2,128,ass
3,127,nigga
4,50,niggas
5,40,dick
6,35,ho
7,30,booty
8,28,pussy
9,24,bone


In [9]:
# array of the offensive words, used below for iteration and membership tests
offensives = offensivedf['word'].values
len(offensives)

20

In [10]:
# spot-check that a known word is present in the list
# (parenthesised print works on both Python 2 and Python 3)
print("{}".format('butt' in offensives))

True


## Load Vocab

In [11]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    """Return a new list in which text elements have non-ASCII characters dropped.

    Non-text elements (numbers, None, ...) are passed through unchanged.
    Works on both Python 2 and Python 3:

    * Python 2: ``unicode`` elements are encoded to ASCII byte ``str``
      (the original behaviour); plain ``str`` elements are left as they are.
    * Python 3: ``str`` elements are stripped of non-ASCII characters and
      returned as ``str`` (not ``bytes``), so they remain usable as text.

    Parameters
    ----------
    lst : iterable
        Any iterable of values (a list, dict keys/values view, ...).

    Returns
    -------
    list
        The converted elements, in iteration order.
    """
    try:
        text_type = unicode  # Python 2: only `unicode` objects need converting
        decode_back = False
    except NameError:
        text_type = str  # Python 3: `unicode` is gone, `str` is the text type
        decode_back = True

    cleaned = []
    for x in lst:
        if isinstance(x, text_type):
            # 'ignore' silently drops characters that have no ASCII encoding
            x = x.encode('ascii', 'ignore')
            if decode_back:
                x = x.decode('ascii')
        cleaned.append(x)
    return cleaned

In [12]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON object from disk and return it as a two-column DataFrame.

    Parameters
    ----------
    json_name : str
        File name, appended directly to ``root_in``.
    key_col_label : str
        Column name for the JSON object's keys.
    val_col_label : str
        Column name for the JSON object's values.
    root_in : str
        Directory prefix; concatenated as-is, so it must end with a separator.

    Returns
    -------
    pandas.DataFrame
        One row per key/value pair; both columns are passed through
        ``listAsAscii``.
    """
    json_path = root_in + json_name
    with open(json_path, 'r') as handle:
        mapping = json.load(handle)

    columns = {}
    columns[key_col_label] = listAsAscii(mapping.keys())
    columns[val_col_label] = listAsAscii(mapping.values())
    return pd.DataFrame(data=columns)

In [13]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    """Write ``df`` to ``root_out + csv_name`` as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    csv_name : str
        File name, appended directly to ``root_out``.
    root_out : str
        Directory prefix; concatenated as-is, so it must end with a separator.
    index : bool
        Forwarded to ``DataFrame.to_csv``; the index is omitted by default.
    """
    destination = root_out + csv_name
    df.to_csv(destination, index=index)

In [14]:
# decade labels; a decade's position in this list is the index of its slot in
# each word's per-decade count list
decades = [1970,1980,1990,2000,2010]

In [15]:
# set up a structure for computing profanity:
# maps each offensive word to a list of counts, one slot per entry in `decades`
ncomp = {}

# initialize ncomp to hold all words with 0 value for each decade
# (sized from `decades` so the two can never drift apart; each word gets its
# own fresh list, so updating one word never affects another)
for x in offensives:
    ncomp[x] = [0] * len(decades)

In [16]:
# get the count for each word

def populateDecadeWords(comp,decade,json_name):
    """Fill one decade's slot in ``comp`` from that decade's JSON file.

    Parameters
    ----------
    comp : dict
        Maps word -> list of per-decade values; updated in place.
    decade : int
        Must be an entry of the module-level ``decades`` list; its position
        there selects the slot that is written.
    json_name : str
        File name inside ``../../data/conditioned/decades/<decade>/``.

    Returns
    -------
    dict
        The same ``comp`` object that was passed in.

    Raises
    ------
    ValueError
        If ``decade`` is not in ``decades``.
    """
    # position of this decade inside each word's value list
    slot = decades.index(decade)

    # each decade keeps its files in its own sub-directory
    decade_dir = "../../data/conditioned/decades/"+str(decade)+"/"

    with open(decade_dir + json_name, 'r') as handle:
        entries = json.load(handle)

    # each entry is read as entry[0] -> word, entry[1] -> value
    # (presumably [word, count] pairs -- confirm against the file producer);
    # words that are not already keys of comp are ignored
    for entry in entries:
        word = entry[0]
        if word in comp:
            comp[word][slot] = entry[1]

    return comp

In [17]:
# ncomp for nouns: fill every decade's slot from that decade's reduced noun
# n-gram file (populateDecadeWords updates ncomp in place and returns it)
for d in decades:
    ncomp = populateDecadeWords(ncomp,d,'noun_n-gram_reduced.json')

In [18]:
#verify ncomp: show one word and its per-decade counts
# (next(iter(...)) and print(...) work on both Python 2 and Python 3, whereas
# indexing keys() and the print statement are Python 2 only)
first_word = next(iter(ncomp))
print(first_word)
print(ncomp[first_word])

ass
[0, 1, 32, 68, 27]


In [19]:
# populate a column full of a given decade's values from a comp
def compCol(comp,decade):
    """Return the values stored for ``decade`` across every word in ``comp``.

    Parameters
    ----------
    comp : dict
        Maps word -> list of per-decade values.
    decade : int
        Must be an entry of the module-level ``decades`` list.

    Returns
    -------
    list
        One value per word, in the dict's iteration order, so it lines up
        with ``comp.keys()`` as long as ``comp`` is not modified in between.

    Raises
    ------
    ValueError
        If ``decade`` is not in ``decades``.
    """
    didx = decades.index(decade)
    # .values() exists on both Python 2 and 3 (iteritems() is Python 2 only)
    return [counts[didx] for counts in comp.values()]

# function to convert comp to a dataframe (it does not save anything)
def compToDataframe(comp):
    """Build a DataFrame with a ``word`` column plus one column per decade.

    Parameters
    ----------
    comp : dict
        Maps word -> list of per-decade values.

    Returns
    -------
    pandas.DataFrame
        Column ``word`` holds the keys of ``comp`` (passed through
        ``listAsAscii``); each decade column is named ``str(decade)`` and is
        produced by ``compCol``.
    """
    columns = {'word': listAsAscii(comp.keys())}
    columns.update((str(decade), compCol(comp, decade)) for decade in decades)
    return pd.DataFrame(data=columns)

In [20]:
# per-decade counts of every offensive word, as a dataframe
ncompdf = compToDataframe(ncomp)

In [21]:
# preview the per-decade counts
ncompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,1,32,68,27,ass
1,0,0,37,58,32,nigga
2,1,0,10,15,4,booty
3,0,0,1,11,1,pimpin
4,0,0,6,14,2,pimp


### Save Profanity Dataframe

In [22]:
# write ncompdf as CSV under root_out (the index is not written)
dataframeToCsv(ncompdf,'noun_decade_offensives_reduced.csv')

## When was the first mention of each offensive word?

In [24]:
# the word list whose first mentions are searched for below
offensives

array(['shit', 'bitch', 'ass', 'nigga', 'niggas', 'dick', 'ho', 'booty',
       'pussy', 'bone', 'hoe', 'pimp', 'motherfucker', 'niggaz', 'butt',
       'pimpin', 'bullshit', 'cock', 'titty', 'whore'], dtype=object)

In [25]:
# load the latest master lyrics dataframe (the "use-this" master);
# the cells below read its `noun_vector` and `song_key` columns
lyricsdf = pd.read_csv("../../data/conditioned/master-lyricsdf-word_syn_hype_vectors.csv")  

In [27]:
# peek at one row to see the available columns
lyricsdf.head(1)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector,noun_syn_hype_vector,adj_syn_hype_vector
0,0,1,1970,https://en.wikipedia.org/wiki/Bridge_over_Trou...,Bridge over Troubled Water,Simon and Garfunkel,When you're weary. Feeling small. When tears a...,1970,1970-1,http://lyrics.wikia.com/Simon_And_Garfunkel:Br...,When you're weary. Feeling small. When tears a...,time bridge water,rough troubled,bridge time water,rough troubled,bridge time water,rough troubled


In [40]:
# Find the first mention of each offensive word: scan the songs in dataframe
# row order and remember the song_key of the first row whose noun_vector
# contains the word.
first_offensive = {}   # word -> song_key of the first row that mentions it
found_offensive = []   # words, in the order they were first encountered

for r in lyricsdf.iterrows():
    # every word already has a winner, so the remaining rows cannot change anything
    if len(found_offensive) == len(offensives):
        break
    nouns = r[1].noun_vector
    # test for NaN (a missing noun_vector comes back from pandas as a float)
    if not isinstance(nouns,float):
        # a set gives constant-time membership tests for the inner loop
        vs = set(nouns.split())
        for o in offensives:
            # pick a winner: only the first song seen for a word is kept
            if o not in first_offensive and o in vs:
                first_offensive[o] = r[1].song_key
                found_offensive.append(o)

In [34]:
# number of words we were looking for
len(offensives)

20

In [38]:
# number of words actually found; should equal len(offensives) if none is missing
len(found_offensive)

20

In [44]:
# one entry per found word, so this should match len(found_offensive)
len(first_offensive)

20

In [41]:
# word -> song_key of the first song that mentions it
first_offensive

{'ass': '1983-25',
 'bitch': '1977-23',
 'bone': '1977-36',
 'booty': '1977-41',
 'bullshit': '1973-92',
 'butt': '1987-59',
 'cock': '1999-39',
 'dick': '1971-89',
 'ho': '1992-2',
 'hoe': '1993-53',
 'motherfucker': '1993-11',
 'nigga': '1993-2',
 'niggas': '1993-11',
 'niggaz': '1992-67',
 'pimp': '1993-81',
 'pimpin': '1996-64',
 'pussy': '1975-78',
 'shit': '1970-3',
 'titty': '1999-75',
 'whore': '1994-82'}

In [42]:
# Keep only the songs that hold a first mention of some offensive word.
# .copy() makes this an independent frame, so adding a column to it later does
# not trigger SettingWithCopyWarning; list() hands isin() a plain list rather
# than a Python 3 dict view.
firstoffensivedf = lyricsdf[lyricsdf['song_key'].isin(list(first_offensive.values()))].copy()

In [43]:
# one row per distinct first-mention song; this is smaller than the number of
# words whenever several words share the same first song
firstoffensivedf.shape

(19, 17)

In [45]:
# preview the first-mention songs (up to 20 rows)
firstoffensivedf.head(20)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector,noun_syn_hype_vector,adj_syn_hype_vector
2,2,3,1970,https://en.wikipedia.org/wiki/American_Woman_(...,American Woman,The Guess Who,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",1970,1970-3,http://lyrics.wikia.com/The_Guess_Who:American...,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",woman mess mind mama thing time growin light y...,american important old coloured leave,crap light ma mess mind thing time woman,american colored important old,crap light ma mess mind thing time woman,american colored important old
188,188,89,1971,https://en.wikipedia.org/wiki/Theme_from_Shaft,Theme from Shaft,Isaac Hayes,Who's the black private dick. That's a sex mac...,1970,1971-89,http://lyrics.wikia.com/Isaac_Hayes:Theme_From...,Who's the black private dick. That's a sex mac...,dick right cat shaft mother man,black private damn bad complicated,cat dick man mother right shaft,bad black complicated damn private,cat dick man mother right shaft,bad black complicated damn private
391,391,92,1973,https://en.wikipedia.org/wiki/Money_(Pink_Floy...,Money,Pink Floyd,"Money, get away. Get a good job with more pay ...",1970,1973-92,http://lyrics.wikia.com/Pink_Floyd:Money,"Money, get away. Get a good job with more pay ...",job pay hand stack bullshit,good okay right,bullshit hand occupation stack wage,all_right good right,bullshit hand occupation stack wage,all_right good right
577,577,78,1975,https://en.wikipedia.org/wiki/Killer_Queen,Killer Queen,Queen,She keeps her Mot et Chandon. In her pretty ca...,1970,1975-78,http://lyrics.wikia.com/Queen:Killer_Queen,She keeps her Mot et Chandon. In her pretty ca...,cabinet remedy appetite drop hat pussy cat,pretty built-in insatiable willing playful,appetite cabinet cat cunt drop hat redress,built-in insatiate playful pretty willing,appetite cabinet cat cunt drop hat redress,built-in insatiate playful pretty willing
722,722,23,1977,https://en.wikipedia.org/wiki/Rich_Girl_(Hall_...,Rich Girl,Hall and Oates,You're a rich girl and you've gone too far. 'C...,1970,1977-23,http://lyrics.wikia.com/Hall_%26_Oates:Rich_Girl,You're a rich girl and you've gone too far. 'C...,girl man money rain pain bitch,rich old dry easy feel,bitch girl man money pain rain,dry easy old rich,bitch girl man money pain rain,dry easy old rich
735,735,36,1977,https://en.wikipedia.org/wiki/Blinded_by_the_L...,Blinded by the Light,Manfred Mann's Earth Band,Blinded by the light. Wrapped up like a deuce....,1970,1977-36,http://lyrics.wikia.com/Manfred_Mann%27s_Earth...,Blinded by the light. Wrapped up like a deuce....,diplomat pump sonny song break outside preache...,teenage adolescent strong funky safe east funn...,bone chaperon cub dance diplomat girl interrup...,adolescent amusing east fetid frozen new safe ...,bone chaperon cub dance diplomat girl interrup...,adolescent amusing east fetid frozen new safe ...
740,740,41,1977,https://en.wikipedia.org/wiki/Dazz,Dazz,Brick,Everybody go on and dance if you want to. Musi...,1970,1977-41,http://lyrics.wikia.com/Brick:Dazz,Everybody go on and dance if you want to. Musi...,dancing booty,funky,dancing loot,fetid,dancing loot,fetid
1324,1324,25,1983,https://en.wikipedia.org/wiki/Little_Red_Corvette,Little Red Corvette,Prince,I guess I should've known by the way you parke...,1980,1983-25,http://lyrics.wikia.com/Prince:Little_Red_Corv...,I guess I should've known by the way you parke...,cause horse honey red ill picture verge try lo...,dumb pocket little obscene tame red ow right,ailment attempt body brink buttocks cause hone...,dense obscene red right small tame,ailment attempt body brink buttocks cause hone...,dense obscene red right small tame
1758,1758,59,1987,https://en.wikipedia.org/wiki/Bad_(Michael_Jac...,Bad,Michael Jackson,"Na! Your butt is mine, gonna tell you right (N...",1980,1987-59,http://lyrics.wikia.com/Michael_Jackson:Bad,"Na! Your butt is mine, gonna tell you right (N...",butt face daylight count stuff mouth sh word l...,na broad bad wrong long cheap better,baby battle butt count day eye face lock man m...,bad better cheap incorrect long wide,baby battle butt count day eye face lock man m...,bad better cheap incorrect long wide
2201,2201,2,1992,https://en.wikipedia.org/wiki/Baby_Got_Back,Baby Got Back,Sir Mix-a-Lot,. I like big butts and I cannot lie. You other...,1990,1992-2,http://lyrics.wikia.com/Sir_Mix-A-Lot:Baby_Got...,. I like big butts and I cannot lie. You other...,butt girl waist cause groupie sweat turbo vett...,big itty-bitty average wet tired flat black he...,bean bimbo brother butt cause double function ...,average bitty black difficult flat healthy in-...,bean bimbo brother butt cause double function ...,average bitty black difficult flat healthy in-...


In [46]:
# flip around for adding to dataframe:
# song_key -> list of the words whose first mention is in that song
song_key_offensive = {}

# .items() works on both Python 2 and 3 (iteritems() is Python 2 only);
# setdefault creates the list the first time a song_key is seen
for o,sk in first_offensive.items():
    song_key_offensive.setdefault(sk, []).append(o)
    

In [47]:
# song_key -> list of words first mentioned in that song
song_key_offensive

{'1970-3': ['shit'],
 '1971-89': ['dick'],
 '1973-92': ['bullshit'],
 '1975-78': ['pussy'],
 '1977-23': ['bitch'],
 '1977-36': ['bone'],
 '1977-41': ['booty'],
 '1983-25': ['ass'],
 '1987-59': ['butt'],
 '1992-2': ['ho'],
 '1992-67': ['niggaz'],
 '1993-11': ['niggas', 'motherfucker'],
 '1993-2': ['nigga'],
 '1993-53': ['hoe'],
 '1993-81': ['pimp'],
 '1994-82': ['whore'],
 '1996-64': ['pimpin'],
 '1999-39': ['cock'],
 '1999-75': ['titty']}

In [48]:
# song_key -> the song's first-mention words joined into one space-separated string
song_key_ostr = {}

# NOTE: the loop variable must not be called `os`; that would rebind the
# imported `os` module in the notebook namespace and break assureDirExists on
# any later call. .items() works on both Python 2 and 3.
for sk, words in song_key_offensive.items():
    song_key_ostr[sk] = " ".join(words)

In [49]:
# song_key -> space-separated string of words first mentioned in that song
song_key_ostr

{'1970-3': 'shit',
 '1971-89': 'dick',
 '1973-92': 'bullshit',
 '1975-78': 'pussy',
 '1977-23': 'bitch',
 '1977-36': 'bone',
 '1977-41': 'booty',
 '1983-25': 'ass',
 '1987-59': 'butt',
 '1992-2': 'ho',
 '1992-67': 'niggaz',
 '1993-11': 'niggas motherfucker',
 '1993-2': 'nigga',
 '1993-53': 'hoe',
 '1993-81': 'pimp',
 '1994-82': 'whore',
 '1996-64': 'pimpin',
 '1999-39': 'cock',
 '1999-75': 'titty'}

In [50]:
def offensiveStrLookup(song_key):
    """Return the first-mention word string recorded for ``song_key``.

    Looks the key up in the module-level ``song_key_ostr`` mapping and returns
    ``np.nan`` when the song has no entry.
    """
    return song_key_ostr.get(song_key, np.nan)

In [51]:
# add the offensives column
# Work on an explicit copy: firstoffensivedf was produced by filtering
# lyricsdf, and assigning a column into such a slice raises
# SettingWithCopyWarning. Re-running this cell is safe (the column is simply
# recomputed).
firstoffensivedf = firstoffensivedf.copy()
firstoffensivedf['first_offensive_words'] = firstoffensivedf['song_key'].apply(offensiveStrLookup)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


In [53]:
# preview again, now including the first_offensive_words column
firstoffensivedf.head(20)

Unnamed: 0,index,position,year,title.href,title,artist,lyrics,decade,song_key,lyrics_url,lyrics_abstract,noun_vector,adj_vector,noun_syn_vector,adj_syn_vector,noun_syn_hype_vector,adj_syn_hype_vector,first_offensive_words
2,2,3,1970,https://en.wikipedia.org/wiki/American_Woman_(...,American Woman,The Guess Who,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",1970,1970-3,http://lyrics.wikia.com/The_Guess_Who:American...,"Mmm, da da da. Mmm, mmm, da da da. Mmm, mmm, d...",woman mess mind mama thing time growin light y...,american important old coloured leave,crap light ma mess mind thing time woman,american colored important old,crap light ma mess mind thing time woman,american colored important old,shit
188,188,89,1971,https://en.wikipedia.org/wiki/Theme_from_Shaft,Theme from Shaft,Isaac Hayes,Who's the black private dick. That's a sex mac...,1970,1971-89,http://lyrics.wikia.com/Isaac_Hayes:Theme_From...,Who's the black private dick. That's a sex mac...,dick right cat shaft mother man,black private damn bad complicated,cat dick man mother right shaft,bad black complicated damn private,cat dick man mother right shaft,bad black complicated damn private,dick
391,391,92,1973,https://en.wikipedia.org/wiki/Money_(Pink_Floy...,Money,Pink Floyd,"Money, get away. Get a good job with more pay ...",1970,1973-92,http://lyrics.wikia.com/Pink_Floyd:Money,"Money, get away. Get a good job with more pay ...",job pay hand stack bullshit,good okay right,bullshit hand occupation stack wage,all_right good right,bullshit hand occupation stack wage,all_right good right,bullshit
577,577,78,1975,https://en.wikipedia.org/wiki/Killer_Queen,Killer Queen,Queen,She keeps her Mot et Chandon. In her pretty ca...,1970,1975-78,http://lyrics.wikia.com/Queen:Killer_Queen,She keeps her Mot et Chandon. In her pretty ca...,cabinet remedy appetite drop hat pussy cat,pretty built-in insatiable willing playful,appetite cabinet cat cunt drop hat redress,built-in insatiate playful pretty willing,appetite cabinet cat cunt drop hat redress,built-in insatiate playful pretty willing,pussy
722,722,23,1977,https://en.wikipedia.org/wiki/Rich_Girl_(Hall_...,Rich Girl,Hall and Oates,You're a rich girl and you've gone too far. 'C...,1970,1977-23,http://lyrics.wikia.com/Hall_%26_Oates:Rich_Girl,You're a rich girl and you've gone too far. 'C...,girl man money rain pain bitch,rich old dry easy feel,bitch girl man money pain rain,dry easy old rich,bitch girl man money pain rain,dry easy old rich,bitch
735,735,36,1977,https://en.wikipedia.org/wiki/Blinded_by_the_L...,Blinded by the Light,Manfred Mann's Earth Band,Blinded by the light. Wrapped up like a deuce....,1970,1977-36,http://lyrics.wikia.com/Manfred_Mann%27s_Earth...,Blinded by the light. Wrapped up like a deuce....,diplomat pump sonny song break outside preache...,teenage adolescent strong funky safe east funn...,bone chaperon cub dance diplomat girl interrup...,adolescent amusing east fetid frozen new safe ...,bone chaperon cub dance diplomat girl interrup...,adolescent amusing east fetid frozen new safe ...,bone
740,740,41,1977,https://en.wikipedia.org/wiki/Dazz,Dazz,Brick,Everybody go on and dance if you want to. Musi...,1970,1977-41,http://lyrics.wikia.com/Brick:Dazz,Everybody go on and dance if you want to. Musi...,dancing booty,funky,dancing loot,fetid,dancing loot,fetid,booty
1324,1324,25,1983,https://en.wikipedia.org/wiki/Little_Red_Corvette,Little Red Corvette,Prince,I guess I should've known by the way you parke...,1980,1983-25,http://lyrics.wikia.com/Prince:Little_Red_Corv...,I guess I should've known by the way you parke...,cause horse honey red ill picture verge try lo...,dumb pocket little obscene tame red ow right,ailment attempt body brink buttocks cause hone...,dense obscene red right small tame,ailment attempt body brink buttocks cause hone...,dense obscene red right small tame,ass
1758,1758,59,1987,https://en.wikipedia.org/wiki/Bad_(Michael_Jac...,Bad,Michael Jackson,"Na! Your butt is mine, gonna tell you right (N...",1980,1987-59,http://lyrics.wikia.com/Michael_Jackson:Bad,"Na! Your butt is mine, gonna tell you right (N...",butt face daylight count stuff mouth sh word l...,na broad bad wrong long cheap better,baby battle butt count day eye face lock man m...,bad better cheap incorrect long wide,baby battle butt count day eye face lock man m...,bad better cheap incorrect long wide,butt
2201,2201,2,1992,https://en.wikipedia.org/wiki/Baby_Got_Back,Baby Got Back,Sir Mix-a-Lot,. I like big butts and I cannot lie. You other...,1990,1992-2,http://lyrics.wikia.com/Sir_Mix-A-Lot:Baby_Got...,. I like big butts and I cannot lie. You other...,butt girl waist cause groupie sweat turbo vett...,big itty-bitty average wet tired flat black he...,bean bimbo brother butt cause double function ...,average bitty black difficult flat healthy in-...,bean bimbo brother butt cause double function ...,average bitty black difficult flat healthy in-...,ho


In [54]:
# save the first-mention dataframe to CSV under root_out (the index is not written)
dataframeToCsv(firstoffensivedf,'noun_first_offensive_mentions.csv')