# Profanity Extraction Notebook
Extract counts of profanity per decade. The output of this notebook will be used for Tableau visualizations.

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
## MLJ: Additional Extras
import os
import time
import itertools
import json
import pickle

In [32]:
# root in: default path prefix that jsonDictToDataframe prepends to a JSON file name
root_in = "../../data/conditioned/corpus_vocabs/"
# root out: default path prefix that dataframeToCsv prepends to a CSV file name
root_out = "../../viz/data/"

In [4]:
# adapted from https://justgagan.wordpress.com/2010/09/22/python-create-path-or-directories-if-not-exist/
def assureDirExists(path):
    """Create the directory portion of ``path`` if it does not already exist.

    Only ``os.path.dirname(path)`` is created, so a directory path must end
    with a separator (e.g. ``"out/data/"``) for its last component to be made.

    Parameters
    ----------
    path : str
        A file path, or a directory path with a trailing separator.

    Raises
    ------
    OSError
        If the directory cannot be created for a reason other than it
        already existing.
    """
    d = os.path.dirname(path)
    # A bare file name has no directory part; os.makedirs('') would raise.
    if not d:
        return
    if not os.path.exists(d):
        try:
            os.makedirs(d)
        except OSError:
            # Something else may have created the directory between the
            # existence check and makedirs; only re-raise if it is missing.
            if not os.path.isdir(d):
                raise

In [5]:
# make sure key directories exist
# (both roots end in "/", so the dirname taken inside assureDirExists is the directory itself)
assureDirExists(root_in)
assureDirExists(root_out)

## Load Profanity Words
**These are words with more direct offensive meaning used 5 or more times throughout the corpus**

In [8]:
# load the offensive dataframe pulled from the master
# (the frame displayed further down has a 'count' and a 'word' column)
offensivedf = pd.read_csv("../../data/conditioned/noun_n-gram_offensive.csv")  

In [9]:
# (rows, columns) of the loaded frame
offensivedf.shape

(20, 2)

In [11]:
# show up to the first 20 rows of the loaded frame
offensivedf.head(20)

Unnamed: 0,count,word
0,143,shit
1,134,bitch
2,128,ass
3,127,nigga
4,50,niggas
5,40,dick
6,35,ho
7,30,booty
8,28,pussy
9,24,bone


In [13]:
# pull the 'word' column out as an array; these become the keys of ncomp later on
offensives = offensivedf['word'].values
# number of offensive words loaded
len(offensives)

20

In [14]:
# spot check: is 'butt' among the loaded offensive words? (Python 2 print statement)
print "{}".format(('butt' in offensives))

True


## Load Vocab

In [6]:
# function to ensure elements in list are ascii
def listAsAscii(lst):
    """Return a new list in which every ``unicode`` element is ASCII-encoded.

    ``unicode`` items are encoded with ``'ascii'`` and the ``'ignore'`` error
    handler, so characters that cannot be represented are dropped. Items of
    any other type are passed through unchanged. The input list is not
    modified.
    """
    converted = []
    for item in lst:
        if isinstance(item, unicode):
            item = item.encode('ascii', 'ignore')
        converted.append(item)
    return converted

In [7]:
# function for loading dictionary json to columnar dataframe
def jsonDictToDataframe(json_name, key_col_label="key", val_col_label="value", root_in=root_in):
    """Load a JSON object from disk and return it as a two-column DataFrame.

    The file at ``root_in + json_name`` is parsed with ``json.load``. Its keys
    go into the column named ``key_col_label`` and its values into the column
    named ``val_col_label``; both lists are passed through ``listAsAscii``.
    """
    json_path = root_in + json_name
    with open(json_path, 'r') as handle:
        loaded = json.load(handle)

    # Keys first, then values, so the value column wins if both labels match.
    columns = {}
    columns[key_col_label] = listAsAscii(loaded.keys())
    columns[val_col_label] = listAsAscii(loaded.values())
    return pd.DataFrame(data=columns)

In [29]:
# function for saving dataframe to csv
def dataframeToCsv(df, csv_name, root_out=root_out, index=False):
    """Write ``df`` as CSV to the path ``root_out + csv_name``.

    ``index`` is forwarded to ``DataFrame.to_csv``; with the default of
    ``False`` the row index is not written.
    """
    destination = root_out + csv_name
    df.to_csv(destination, index=index)

In [22]:
# decades covered; a decade's position in this list is its slot in each word's count list
decades = [1970,1980,1990,2000,2010]

In [17]:
# set up a structure for computing profanity:
# maps each offensive word to a list of counts, one slot per entry in `decades`
ncomp = {}

# initialize ncomp to hold all words with 0 value for each decade
# (sized from `decades` so the slot count cannot drift from the decade list)
for x in offensives:
    ncomp[x] = [0] * len(decades)

In [20]:
# get the count for each word

def populateDecadeWords(comp, decade, json_name,
                        decades_root="../../data/conditioned/decades/",
                        decade_list=None):
    """Fill one decade's slot in ``comp`` from that decade's JSON file.

    The file ``decades_root + str(decade) + "/" + json_name`` is loaded and
    iterated. For each element ``x``, ``x[0]`` is treated as the word and
    ``x[1]`` as its value. Words that are not already keys of ``comp`` are
    ignored.

    Parameters
    ----------
    comp : dict
        Maps word -> list of per-decade values. Mutated in place.
    decade : int
        Decade to populate; must be present in ``decade_list``.
    json_name : str
        File name inside the decade's directory.
    decades_root : str, optional
        Path prefix holding one sub-directory per decade. Must end with a
        path separator. Defaults to the location used by this notebook.
    decade_list : list, optional
        Ordered decades defining the slot index. Defaults to the
        module-level ``decades``.

    Returns
    -------
    dict
        The same ``comp`` object that was passed in.

    Raises
    ------
    ValueError
        If ``decade`` is not in ``decade_list``.
    """
    if decade_list is None:
        decade_list = decades

    # slot index for this decade
    didx = decade_list.index(decade)

    # directory holding this decade's files
    drootin = decades_root + str(decade) + "/"

    # read to json
    with open(drootin + json_name, 'r') as fp:
        j = json.load(fp)

    # set the decade value for every entry whose word is tracked in comp
    for x in j:
        if x[0] in comp:
            comp[x[0]][didx] = x[1]

    return comp

In [23]:
# ncomp for nouns
# populateDecadeWords mutates and returns the dict it is given, so ncomp stays the same object
for d in decades:
    ncomp = populateDecadeWords(ncomp,d,'noun_n-gram_reduced.json')

In [24]:
#verify ncomp
# spot check: print one key and its per-decade list (Python 2 print statements)
print ncomp.keys()[0]
print ncomp[ncomp.keys()[0]]

ass
[0, 1, 32, 68, 27]


In [25]:
# populate a column full of a given decades values from a comp
def compCol(comp,decade):
    """Return the values stored for ``decade`` across every entry of ``comp``.

    The decade's position in the module-level ``decades`` list selects which
    slot is read from each entry's list. Values come back in the dict's
    iteration order.
    """
    slot = decades.index(decade)
    return [counts[slot] for _, counts in comp.iteritems()]

# function to convert comp to a dataframe (saving is done separately)
def compToDataframe(comp):
    """Build a DataFrame with a 'word' column plus one column per decade.

    The 'word' column holds the keys of ``comp`` (passed through
    ``listAsAscii``). Each decade in ``decades`` contributes a column named
    ``str(decade)`` filled by ``compCol``.
    """
    columns = {'word': listAsAscii(comp.keys())}
    columns.update((str(decade), compCol(comp, decade)) for decade in decades)
    return pd.DataFrame(data=columns)

In [26]:
# one row per word in ncomp, one column per decade
ncompdf = compToDataframe(ncomp)

In [27]:
# preview the first rows of the per-decade frame
ncompdf.head()

Unnamed: 0,1970,1980,1990,2000,2010,word
0,0,1,32,68,27,ass
1,0,0,37,58,32,nigga
2,1,0,10,15,4,booty
3,0,0,1,11,1,pimpin
4,0,0,6,14,2,pimp


### Save Profanity DataFrame

In [30]:
# ncompdf: write the per-decade counts under root_out (dataframeToCsv omits the index by default)
dataframeToCsv(ncompdf,'noun_decade_offensives_reduced.csv')