In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
import squarify
from wordcloud import WordCloud
from spellchecker import SpellChecker
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
%matplotlib inline

custom_style = {'axes.titlesize':'15',
                'axes.titlecolor':'white',
                'xtick.color': 'black',
                'ytick.color': 'black'}
sns.set(style= 'darkgrid', rc=custom_style)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/livernteo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sqf = pd.read_csv('2019_sqf_race_demenour.csv')
sqf.head()

Unnamed: 0,stop_id,suspect_race_description,demeanor_of_person_stopped
0,2019-1,BLACK,UNDERSTANDING
1,2019-2,BLACK,NORMAL
2,2019-3,WHITE,CALM
3,2019-4,BLACK,CONFUSED
4,2019-6,BLACK,COORPERATIVE


In [3]:
# checking for empty cells 
sqf.describe()

Unnamed: 0,stop_id,suspect_race_description,demeanor_of_person_stopped
count,13272.0,13184,13218
unique,12313.0,6,1887
top,2020000000000.0,BLACK,CALM
freq,960.0,7854,4037


In [4]:
# removing the whole row if either race or demeanor is empty (small % anyway)
sqf.dropna(how='any',inplace=True)

In [5]:
# splitting the demeanor to string
sqf['split'] = sqf['demeanor_of_person_stopped'].apply(lambda x : x.split())

In [6]:
# order alphabetically
sqf.sort_values(by='demeanor_of_person_stopped',inplace=True)

In [7]:
# correcting 2 obvious spelling mistakes (had unique symbol)
sqf['demeanor_of_person_stopped'].loc[2790] = 'UNCOOPERATIVE'
sqf['demeanor_of_person_stopped'].loc[5875] = 'UPSET, COOPERATIVE'

In [8]:
# find and replace instances of NON with NON-
sqf['demeanor_of_person_stopped'] = sqf['demeanor_of_person_stopped'].apply(
    lambda x: x.replace("NON ", "NON-"))

In [9]:
sqf['split'] = sqf['demeanor_of_person_stopped'].apply(lambda x : re.findall('[a-zA-Z\-]+',x))
sqf

Unnamed: 0,stop_id,suspect_race_description,demeanor_of_person_stopped,split
6864,2019-7520,BLACK,"""AFRAID""",[AFRAID]
11572,2019-12676,WHITE HISPANIC,"""CALM""",[CALM]
1882,2019-2014,WHITE HISPANIC,- ANGRY AND SLIGHTLY UNCOOPERATIVE -,"[-, ANGRY, AND, SLIGHTLY, UNCOOPERATIVE, -]"
2790,2019-3035,BLACK,UNCOOPERATIVE,[UNCOOPERATIVE]
5875,2019-6423,BLACK,"UPSET, COOPERATIVE","[UPSET, COOPERATIVE]"
...,...,...,...,...
7575,2019-8318,BLACK,YELLING,[YELLING]
8829,2019-9738,BLACK,YELLING,[YELLING]
1161,2019-1209,BLACK,YELLING AND LAUGHING,"[YELLING, AND, LAUGHING]"
1160,2019-1208,BLACK,YELLING AND LAUGHING,"[YELLING, AND, LAUGHING]"


In [10]:
# create spellchecker object
spell = SpellChecker()

In [11]:
# create a function that does the spell check 

def spellcheck(split_array): 
    return_array = []
    for word in split_array:
        return_array.append(spell.correction(word).upper())
    return return_array

In [None]:
# spell check the split array and return array of correct spelling 
sqf['spellcheck_demeanor'] = sqf['split'].apply(lambda x: spellcheck(x))
sqf

In [None]:
# function to count number of words 
def word_count(total_words_array):
    counts = dict()
    
    for word in total_words_array:
        if word in counts: 
            counts[word] += 1
        else: 
            counts[word] = 1
    return counts 

In [None]:
# creating a list of stop words
stop_words = stopwords.words('english')
stop_words = [x.upper() for x in stop_words]

In [None]:
# function that will accept a series and produce a dataframe of word counts for the series
def race_word_count(spellcheck_demeanor_race):
    ''' accept a series and produce a dataframe of word counts for the series '''
    words_array = []
    
    for row in spellcheck_demeanor_race: 
        for r in row:
            if r not in stop_words: 
                words_array.append(r)
                
    df_word_count = pd.DataFrame.from_dict(word_count(words_array),
                                       orient='Index',columns=['Count'])

    return df_word_count.sort_values('Count',ascending=False)
    

In [None]:
# word count for suspects by race 
black_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'BLACK']['spellcheck_demeanor'])
white_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'WHITE']['spellcheck_demeanor'])
black_hispanic_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'BLACK HISPANIC']['spellcheck_demeanor'])
white_hispanic_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'WHITE HISPANIC']['spellcheck_demeanor'])
asian_pacific_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'ASIAN / PACIFIC ISLANDER']['spellcheck_demeanor'])
native_alaskan_word_count = race_word_count(
    sqf[sqf['suspect_race_description'] == 'AMERICAN INDIAN/ALASKAN N']['spellcheck_demeanor'])

In [None]:
# renaming the count columns; individual race datasetes 
white_word_count.rename(columns={'Count':'white_count'},inplace=True)
black_word_count.rename(columns={'Count':'black_count'},inplace=True)
white_hispanic_word_count.rename(columns={'Count':'white_hispanic_count'},inplace=True)
black_hispanic_word_count.rename(columns={'Count':'black_hispanic_count'},inplace=True)
asian_pacific_word_count.rename(columns={'Count':'asian_pacific_count'},inplace=True)
native_alaskan_word_count.rename(columns={'Count':'native_alaskan_count'},inplace=True)


In [None]:
# complete dataframe of all records
df_total_count = pd.concat([white_word_count, black_word_count, white_hispanic_word_count,
                           black_hispanic_word_count, asian_pacific_word_count, 
                           native_alaskan_word_count], axis=1)
df_total_count.head(10)
# df_total_count.to_csv('2019_word_cloud.csv')

In [None]:
df_tree = pd.DataFrame(sqf.groupby(['suspect_race_description']).count()['stop_id'])
df_tree.reset_index(inplace=True)
df_tree.columns = ['suspect_race_description','stop_count']

In [None]:
fig, ax = plt.subplots(1, figsize = (12,12))
squarify.plot(sizes=df_tree['stop_count'],
              label=df_tree['suspect_race_description'],
             alpha=0.8)

In [None]:
def string_words(spellcheck_demeanor_race):
    '''function accepts a series and returns a string of words'''
    string_words = ""

    for row in series: 
        for r in row:
            if r not in stop_words: 
                  string_words = string_words + " " + r
    
    return string_words

In [None]:
import random
def black_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "hsl(0, 1%, 30%)"

In [None]:
series = sqf[sqf['suspect_race_description'] == 'WHITE']['spellcheck_demeanor']
series

In [None]:
# checking the top 50 words for each race and removing the ones that aren't an emotion
white_word_count.head(55)

# words that should be removed "extremely", "apparently", "highly", "profusely"
white_string_words = string_words(sqf[sqf['suspect_race_description'] == 'WHITE']['spellcheck_demeanor'])
white_string_words = white_string_words.replace("EXTREMELY", " ")
white_string_words = white_string_words.replace("APPARENTLY", " ")
white_string_words = white_string_words.replace("HIGHLY", " ")
white_string_words = white_string_words.replace("PROFUSELY", " ")
white_string_words
wordcloud = WordCloud(width = 720, height = 1200, font_path='Arial',
                      background_color=None, max_words = 25,
                      mode='RGBA',
                      prefer_horizontal=0.9,
                      collocations=False, color_func= black_color_func).generate(white_string_words)
wordcloud.to_file("white_cloud.png")
plt.figure(figsize = (15, 11), facecolor = None) 
plt.imshow(wordcloud) 
plt.grid(False)
plt.title('White Word Cloud')
plt.show()

In [None]:
# checking the top 50 words for each race and removing the ones that aren't an emotion
black_word_count.head(55)

# words that should be removed "apparently", "highly", "stop", "person"
black_string_words = string_words(sqf[sqf['suspect_race_description'] == 'WHITE']['spellcheck_demeanor'])
black_string_words = black_string_words.replace("STOP", " ")
black_string_words = black_string_words.replace("APPARENTLY", " ")
black_string_words = black_string_words.replace("HIGHLY", " ")
black_string_words = black_string_words.replace("PERSON", " ")
black_string_words
wordcloud = WordCloud(width =1500, height = 900, font_path='Arial',
                      background_color=None, max_words = 25,
                      mode='RGBA',
                      prefer_horizontal=0.85,
                      collocations=False, color_func= black_color_func).generate(black_string_words)
wordcloud.to_file("black_cloud.png")
plt.figure(figsize = (15, 11), facecolor = None) 
plt.imshow(wordcloud) 
plt.grid(False)
plt.title('White Word Cloud')
plt.show()

In [None]:
output = string_words(series)

wordcloud = WordCloud(width =800, height = 800, 
                      background_color='white', max_words = 50,
                      collocations=False).generate(output)
plt.figure(figsize = (10, 10), facecolor = None) 
plt.imshow(wordcloud) 
plt.grid(False)
plt.show()

In [None]:
white_word_count.head(55)

In [None]:
series = sqf[sqf['suspect_race_description'] == 'WHITE']['spellcheck_demeanor']
output = string_words(series)

wordcloud = WordCloud(width =800, height = 800, 
                      background_color='white', max_words = 50,
                      collocations=False).generate(output)
plt.figure(figsize = (10, 10), facecolor = None) 
plt.imshow(wordcloud) 
plt.grid(False)
plt.show()