In [1]:
# google sheets api setup - copied from incidentsv4.py

from googleapiclient import discovery
import oauth2client, os, httplib2
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage

# IDs for google spreadsheets of each year
SPREADSHEET_IDS = {2013: '1ArisyAjhUE1eeuA490-rPPI1nfft2cJIyDpaeOBqyj8',
                   2014: '1699_rxlNIK3KSNzqpoczw0ehiwTp4IKEaEP_dfWo6vM',
                   2015: '1HoG8jdioarEbxVI_IbuqRwQFCFqbUxzCHc6T2SymRUY',
                   2016: '19wsyttAqa4jbPnqmxQWbu79rwzp3eq_EHbzzsRiomTU'}
                   
# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/sheets.googleapis.com-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/spreadsheets.readonly'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'mapping police violence'

def _get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir,
                                   'sheets.googleapis.com-python-quickstart.json')

    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else: # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials

def _setup():
    credentials = _get_credentials()
    http = credentials.authorize(httplib2.Http())
    discoveryUrl = ('https://sheets.googleapis.com/$discovery/rest?version=v4')
    service = discovery.build('sheets', 'v4', http=http, discoveryServiceUrl=discoveryUrl)
    return service

In [2]:
# other imports
import pandas as pd
import numpy as np
import datetime
from collections import OrderedDict

In [3]:
# import 2013 spreadsheet

service = _setup()
spreadsheetId = SPREADSHEET_IDS[2013]
rangeName = 'clean'
result = service.spreadsheets().values().get(spreadsheetId=spreadsheetId, range=rangeName, majorDimension = 'COLUMNS').execute()
values = result.get('values', [])

dict2013 = OrderedDict()

for column in values:
    dict2013[column[0]] = pd.Series(column[1:])
    
df2013 = pd.DataFrame(dict2013)

# import 2014 spreadsheet

spreadsheetId = SPREADSHEET_IDS[2014]
rangeName = 'clean'
result = service.spreadsheets().values().get(spreadsheetId=spreadsheetId, range=rangeName, majorDimension = 'COLUMNS').execute()
values = result.get('values', [])

dict2014 = OrderedDict()

for column in values:
    dict2014[column[0]] = pd.Series(column[1:])

df2014 = pd.DataFrame(dict2014)

# import 2015 spreadsheet

spreadsheetId = SPREADSHEET_IDS[2015]
rangeName = 'combined'
result = service.spreadsheets().values().get(spreadsheetId=spreadsheetId, range=rangeName, majorDimension = 'COLUMNS').execute()
values = result.get('values', [])

dict2015 = OrderedDict()

for column in values:
    if column[0] == 'solr_filter':
        continue
    dict2015[column[0]] = pd.Series(column[1:])

df2015 = pd.DataFrame(dict2015)

# import 2016 spreadsheet

spreadsheetId = SPREADSHEET_IDS[2016]
rangeName = 'combined'
result = service.spreadsheets().values().get(spreadsheetId=spreadsheetId, range=rangeName, majorDimension = 'COLUMNS').execute()
values = result.get('values', [])

dict2016 = OrderedDict()

for column in values:
    dict2016[column[0]] = pd.Series(column[1:])
    
df2016 = pd.DataFrame(dict2016)

In [4]:
allnames = pd.concat([df2013,df2014,df2015,df2016])
allnames['number'] = pd.to_numeric(allnames['number'])
allnames.set_index('number', inplace=True)
allnames.replace(u'', np.nan, inplace=True)

In [5]:
number = []
name = []
listed_names = []
age = []
date = []
city = []
state = []
was_unarmed = []
in_mpv = []
in_guardian = []
in_wapo = []

for i in sorted(list(set(allnames.index.values))):
    number.append(i)
    
    # if only one source, use that source
    if isinstance(allnames['source'][i], basestring):
        name += [allnames['name'][i]]
        listed_names += [{allnames['source'][i] : allnames['name'][i]}]
        age += [allnames['age'][i]]
        date += [allnames['date'][i]]
        city += [allnames['city'][i]]
        state += [allnames['state'][i]]
        was_unarmed += [{allnames['source'][i] : allnames['armed'][i]}]
        in_mpv += [allnames['source'][i] == u'MPV']
        in_guardian += [allnames['source'][i] == u'Guardian']
        in_wapo += [allnames['source'][i] == u'WaPo']
    
    # if multiple sources, use the date and name used in the query,
    # list alternate names in other_names
    # use age and city listed by MPV; if not listed in MPV, use Guardian; if not in Guardian, use WaPo
    # list all categorizations of was_unarmed
    
    else:
        storycount = [j for j in list(allnames['story_count'][i]) if not pd.isnull(j)][0]
        # storycount is not NaN for the row used in the query
        usedforquery = allnames[allnames['story_count']==storycount].loc[i]
            
        sources = list(allnames['source'][i])
        
        name += [usedforquery['name']]
        
        namedictionary = {}
        for s in sources:
            namedictionary[s] = allnames[allnames['source'] == s].loc[i]['name']
            
        listed_names += [namedictionary]
        
        date += [usedforquery['date']]
        
        if u'MPV' in sources:
            sourceuse = u'MPV'
        elif u'Guardian' in sources:
            sourceuse = u'Guardian'
        elif u'WaPo' in sources:
            sourceuse = u'WaPo'
        else:
            raise ValueError('source is not MPV, nor Guardian, nor WaPo')
        
        age += [allnames[allnames['source'] == sourceuse].loc[i]['age']]
        city += [allnames[allnames['source'] == sourceuse].loc[i]['city']]
        state += [allnames[allnames['source'] == sourceuse].loc[i]['state']]
        
        armeddictionary = {}
        for s in sources:
            armeddictionary[s] = allnames[allnames['source'] == s].loc[i]['armed']
            
        was_unarmed += [armeddictionary]
        
        in_mpv += [u'MPV' in sources]
        in_guardian += [u'Guardian' in sources]
        in_wapo += [u'WaPo' in sources]
    
compileddict = OrderedDict([
                            ('number', number),
                            ('name', name),
                            ('listed_name_variants', listed_names),
                            ('age', age),
                            ('date', date),
                            ('city', city),
                            ('state', state),
                            ('armed?', was_unarmed),
                            ('in_mpv', in_mpv),
                            ('in_guardian', in_guardian),
                            ('in_wapo', in_wapo)
                            ])

compiled = pd.DataFrame(compileddict)
compiled.set_index('number', inplace=True)
compiled['date'] = pd.to_datetime(compiled['date'])
compiled.loc[compiled[compiled['age'] == 'Unknown'].index.values, 'age'] = np.nan
compiled['age'] = pd.to_numeric(compiled['age'])
compiled['race'] = 'Black'
compiled.sort_values('date', inplace = True)

In [6]:
compiled

Unnamed: 0_level_0,name,listed_name_variants,age,date,city,state,armed?,in_mpv,in_guardian,in_wapo,race
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
57,Xavier Tyrell Johnson,{u'MPV': u'Xavier Tyrell Johnson'},31,2013-01-04,Westwood Lakes,FL,{u'MPV': u'Unarmed'},True,False,False,Black
41,Quintine Barksdale,{u'MPV': u'Quintine Barksdale'},43,2013-01-13,Phoenix,AZ,{u'MPV': u'Unarmed'},True,False,False,Black
49,Steven Askew,{u'MPV': u'Steven Askew'},24,2013-01-17,Memphis,TN,{u'MPV': u'Unclear'},True,False,False,Black
20,Donovan Thomas,{u'MPV': u'Donovan Thomas'},22,2013-01-19,St. Louis,MO,{u'MPV': u'Unarmed'},True,False,False,Black
4,Anthony Dwayne Harris,{u'MPV': u'Anthony Dwayne Harris'},38,2013-01-28,Jacksonville,FL,{u'MPV': u'Unarmed'},True,False,False,Black
33,Kayla Moore,{u'MPV': u'Kayla Moore'},41,2013-02-13,Berkeley,CA,{u'MPV': u'Unarmed'},True,False,False,Black
10,Charles A. Baker Jr.,{u'MPV': u'Charles A. Baker Jr.'},30,2013-02-16,Jamestown,NY,{u'MPV': u'Unarmed'},True,False,False,Black
12,Clinton Roebexar Allen,{u'MPV': u'Clinton Roebexar Allen'},25,2013-03-10,Dallas,TX,{u'MPV': u'Unarmed'},True,False,False,Black
46,Russell Lydell Smith,{u'MPV': u'Russell Lydell Smith'},51,2013-03-22,Seattle,WA,{u'MPV': u'Unarmed'},True,False,False,Black
29,Jermaine C. Coleman Jr.,{u'MPV': u'Jermaine C. Coleman Jr.'},20,2013-04-04,Dayton,OH,{u'MPV': u'Unclear'},True,False,False,Black


## Add metadata
   * gender
   * cause of death 
   * description of incident (available for MPV)
   * symptoms of mental illness (available for MPV and WaPo)
   * responsible law enforcement agency (available for MPV and Guardian)

In [7]:
# read in original lists of people

# mpv
mpv = pd.read_excel(r'sourcedata/MPVDatasetDownload2015.xlsx')
mpv.rename(inplace=True, columns={'Date of injury resulting in death (month/day/year)' : 'datetime', 
                                  "Victim's race" : 'race',
                                  "Location of death (city)" : 'city',
                                  "Location of death (state)" : 'state'})
mpv['datetime'] = pd.to_datetime(mpv['datetime'])

# guardian
guardian2015 = pd.read_csv(r'sourcedata/list-2015guardian-retrieved20160708.csv')
guardian2016 = pd.read_csv(r'sourcedata/list-2016guardian-retrieved20160708.csv')
guardian = pd.concat([guardian2015, guardian2016])
guardian.reset_index(drop=True, inplace=True)
# guardian stores months as words - convert to numerals and add year/month/day to a combined datetime column
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

for m in guardian.index:
    guardian.set_value(m, 'month', months.index(guardian['month'][m]) + 1)
    
guardian['datetime'] = guardian['year'].map(str) +'-'+ guardian['month'].map(str) + '-' + guardian['day'].map(str) 
guardian['datetime'] = pd.to_datetime(guardian['datetime'])

guardian.rename(inplace=True, columns = {'raceethnicity':'race'})

# wapo
wapo = pd.read_csv(r'sourcedata/list-wapo-retrieved20160708.csv')
wapo.rename(inplace = True, columns = {'date' : 'datetime'})
wapo['datetime'] = pd.to_datetime(wapo['datetime'])
wapo['race'].replace(['A', 'B', 'H', 'N', 'O', 'W'], ['Asian', 'Black', 'Hispanic', 'Unknown', 'Other', 'White'], inplace = True)
# race codings don't seem to be explicity documented by wapo anywhere
# but this is a best-guess based on the codings used here: https://www.washingtonpost.com/graphics/national/police-shootings-2016/

In [8]:
# add in more metadata if available: gender, cause of death, description, mental illness, responsible agency

for i in compiled.index.values:
    
    if compiled['in_mpv'][i]:
        name = compiled['listed_name_variants'][i][u'MPV']
        state = compiled['state'][i]
        
        screen1 = mpv[mpv["Victim's name"] == name]
        screen2 = screen1[screen1["state"] == state]
        
        if len(screen2) > 1:
            raise ValueError('ambiguous names found!')
        
        compiled.loc[i, 'gender'] = screen2["Victim's gender"].iloc[0]
        compiled.loc[i, 'cause_of_death'] = screen2["Cause of death"].iloc[0]
        compiled.loc[i, 'story'] = screen2["A brief description of the circumstances surrounding the death"].iloc[0]
        compiled.loc[i, 'mental_illness?'] = screen2["Symptoms of mental illness?"].iloc[0]
        compiled.loc[i, 'responsible_agency'] = screen2["Agency responsible for death"].iloc[0]
    
    elif compiled['in_guardian'][i]:
        name = compiled['listed_name_variants'][i][u'Guardian']
        state = compiled['state'][i]
        
        screen1 = guardian[guardian['name'] == name]
        screen2 = screen1[screen1['state'] == state]
        
        if len(screen2) > 1:
            raise ValueError('ambiguous names found!')
        
        compiled.loc[i, 'gender'] = screen2['gender'].iloc[0]
        compiled.loc[i, 'cause_of_death'] = screen2['classification'].iloc[0]
        compiled.loc[i, 'responsible_agency'] = screen2['lawenforcementagency'].iloc[0]
        
        if compiled['in_wapo'][i]:
            name = compiled['listed_name_variants'][i][u'WaPo']
            waposcreen1 = wapo[wapo['name'] == name]
            waposcreen2 = waposcreen1[waposcreen1['state'] == state]
            
            if len(waposcreen2) > 1:
                raise ValueError('ambiguous names found!')
                
            compiled.loc[i, 'mental_illness?'] = waposcreen2['signs_of_mental_illness'].iloc[0]
    
    elif compiled['in_wapo'][i]:
        name = compiled['listed_name_variants'][i][u'WaPo']
        state = compiled['state'][i]
        
        screen1 = wapo[wapo['name'] == name]
        screen2 = screen1[screen1['state'] == state]
        
        if len(screen2) > 1:
            raise ValueError('ambiguous names found!')
        
        genderdict = {'M':'Male', 'F':'Female'}
        compiled.loc[i, 'gender'] = genderdict[screen2['gender'].iloc[0]]
        compiled.loc[i, 'cause_of_death'] = screen2['manner_of_death'].iloc[0]
        compiled.loc[i, 'mental_illness?'] = screen2['signs_of_mental_illness'].iloc[0]

  result = lib.scalar_compare(x, y, op)


In [9]:
# big cypress isn't an actual city - it's a national park
# http://www.sun-sentinel.com/local/broward/fl-pedestrian-fatal-20151031-story.html
# seems to indicate yvens seide is from immokalee, florida
compiled.set_value(309, 'city', 'Immokalee')

# bellflower is misspelled as "Bellfower"
compiled.set_value(54, 'city', 'Bellflower')

# liberty city is really a neighborhood of miami
compiled.set_value(175, 'city', 'Miami')

# vermont square is really a neighborhood of los angeles
compiled.set_value(40, 'city', 'Los Angeles')

# these location corrections only appear in the MPV set -- so update the MPV dataset
mpv.set_value(186, 'Location of death (city)', 'Immokalee')
mpv.set_value(2545, 'Location of death (city)', 'Bellflower')
mpv.set_value(2270, 'Location of death (city)', 'Miami')
mpv.set_value(2784, 'Location of death (city)', 'Los Angeles')

compiled.loc[[309, 54, 175, 40]]

Unnamed: 0_level_0,name,listed_name_variants,age,date,city,state,armed?,in_mpv,in_guardian,in_wapo,race,gender,cause_of_death,story,mental_illness?,responsible_agency
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
309,Yvens Seide,{u'MPV': u'Yvens Seide'},33,2015-10-31,Immokalee,FL,{u'MPV': u'no'},True,False,False,Black,Male,Vehicle,"Officer Gary Paul Evelyn, 51, of Big Cypress s...",,Seminole Police Department
54,William Alfred Harvey III,{u'MPV': u'William Alfred Harvey III'},57,2013-10-27,Bellflower,CA,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Two female deputies were called to the scene a...,Yes,Los Angeles County Sheriff’s Department
175,Willie James Sams,{u'MPV': u'Willie James Sams'},21,2014-02-05,Miami,FL,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Taser,Sams was tasered by police during a domestic d...,No,Miami-Dade Police Department
40,Patrick Othro Sullivan,{u'MPV': u'Patrick Othro Sullivan'},74,2013-08-16,Los Angeles,CA,{u'MPV': u'Unclear'},True,False,False,Black,Male,Gunshot,"As reported to the Los Angeles Times, Patrick ...",Unknown,Not reported


   * population of the town/metropolitan area where the death occurred

In [10]:
# read manually-compiled population data
populations = pd.read_csv(r'sourcedata/populations.csv')
populations.set_index('state', inplace=True)
populations.sort_index(inplace=True)
populations['population'] = pd.to_numeric(populations['population'])

In [11]:
pops = []
for n in compiled.index.values:
    city = compiled['city'][n]
    state = compiled['state'][n]
    pops.append(populations[populations['city'] == city].loc[state]['population'])
compiled['population'] = pd.Series(pops, index = compiled.index)
compiled = compiled[['name',
                     'listed_name_variants',
                     'age',
                     'date',
                     'city',
                     'state',
                     'population',
                     'armed?',
                     'in_mpv',
                     'in_guardian',
                     'in_wapo',
                     'race',
                     'gender',
                     'cause_of_death',
                     'story',
                     'mental_illness?',
                     'responsible_agency']]

In [12]:
compiled

Unnamed: 0_level_0,name,listed_name_variants,age,date,city,state,population,armed?,in_mpv,in_guardian,in_wapo,race,gender,cause_of_death,story,mental_illness?,responsible_agency
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
57,Xavier Tyrell Johnson,{u'MPV': u'Xavier Tyrell Johnson'},31,2013-01-04,Westwood Lakes,FL,11838,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yolanda Thomas and Xavier Tyrell Johnson had j...,No,Miami-Dade Police Department
41,Quintine Barksdale,{u'MPV': u'Quintine Barksdale'},43,2013-01-13,Phoenix,AZ,1445632,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Barksdale was a neighbor to an off-duty state ...,No,Arizona Department of Transportation
49,Steven Askew,{u'MPV': u'Steven Askew'},24,2013-01-17,Memphis,TN,646889,{u'MPV': u'Unclear'},True,False,False,Black,Male,Gunshot,Shot to death after falling asleep in his car ...,No,Memphis Police Department
20,Donovan Thomas,{u'MPV': u'Donovan Thomas'},22,2013-01-19,St. Louis,MO,319294,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,An off-duty St Louis police officer shot and k...,No,St. Louis Metropolitan Police Department
4,Anthony Dwayne Harris,{u'MPV': u'Anthony Dwayne Harris'},38,2013-01-28,Jacksonville,FL,821784,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Police responded to a domestic dispute call in...,No,Jacksonville Sheriff's Office
33,Kayla Moore,{u'MPV': u'Kayla Moore'},41,2013-02-13,Berkeley,CA,112580,{u'MPV': u'Unarmed'},True,False,False,Black,Female,Asphyxiation,Just before midnight on the night of Moore’s d...,Yes,Berkeley Police Department
10,Charles A. Baker Jr.,{u'MPV': u'Charles A. Baker Jr.'},30,2013-02-16,Jamestown,NY,31146,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Taser,Following a traffic stop Baker was being proce...,No,Jamestown Police Department
12,Clinton Roebexar Allen,{u'MPV': u'Clinton Roebexar Allen'},25,2013-03-10,Dallas,TX,1197816,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Police were called to the apartment because wo...,Drug or alcohol use,Dallas Police Department
46,Russell Lydell Smith,{u'MPV': u'Russell Lydell Smith'},51,2013-03-22,Seattle,WA,608660,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Officers serving a warrant on Smith backed him...,Unknown,Bellevue Police Departmen
29,Jermaine C. Coleman Jr.,{u'MPV': u'Jermaine C. Coleman Jr.'},20,2013-04-04,Dayton,OH,141527,{u'MPV': u'Unclear'},True,False,False,Black,Male,Gunshot,Coleman was shot to death and two associates w...,No,Federal Bureau of Investigation


   * in the last 8 weeks:
        - the total number of black people killed by police in that metropolitan area
        - the total number of white people killed by the police in that metropolitan area
        - the total number of people killed by the police in that metropolitan area

In [13]:
# function for getting these numbers

def last8weekcounts(dataset, date, location = None):
    '''retrieves, for the specified dataset, date, and location:
    # of black people killed by police in previous 8 weeks
    # of white people killed by police in previous 8 weeks
    # of people killed by police in previous 8 weeks.
    'location' must be a [city, state] list; if location = None, function returns counts for the entire united states
    '''
    
    bracket = date - datetime.timedelta(56) 
    
    # can't get 8 week data if dataset doesn't include the full 8 weeks
    if bracket + datetime.timedelta(1) < min(dataset['datetime']) or date > max(dataset['datetime']):
        return None, None, None
    
    # extract data from desired timeslice and location
    last8weeks = dataset[dataset['datetime'] <= date]
    last8weeks = last8weeks[last8weeks['datetime'] > bracket]
    
    if location != None:
        city = location[0]
        state = location[1]
        last8weeks = last8weeks[last8weeks['city'] == city]
        last8weeks = last8weeks[last8weeks['state'] == state]
    
    # count
    num = len(last8weeks)
    numblack = len(last8weeks[last8weeks['race'] == 'Black'])
    numwhite = len(last8weeks[last8weeks['race'] == 'White'])
    
    return num, numblack, numwhite

In [14]:
black8weekcounts = []
white8weekcounts = []
total8weekcounts = []

black8weekcountseverywhere = []
white8weekcountseverywhere = []
total8weekcountseverywhere = []

# takes a few minutes to run
for i in compiled.index.values:
    date = compiled['date'][i]
    city = compiled['city'][i]
    state = compiled['state'][i]
    
    # counts in the same city
    
    mpvcounts = last8weekcounts(mpv, date, location = [city, state])
    guardiancounts = last8weekcounts(guardian, date, location = [city, state])
    wapocounts = last8weekcounts(wapo, date, location = [city, state])
    
    blackcounts = {u'MPV':mpvcounts[1], 
                   u'Guardian':guardiancounts[1], 
                   u'WaPo':wapocounts[1]}
    whitecounts = {u'MPV':mpvcounts[2],
                   u'Guardian':guardiancounts[2],
                   u'WaPo':wapocounts[2]}
    totalcounts = {u'MPV':mpvcounts[0],
                   u'Guardian':guardiancounts[0],
                   u'WaPo':wapocounts[0]}
    
    black8weekcounts.append(blackcounts)
    white8weekcounts.append(whitecounts)
    total8weekcounts.append(totalcounts)
    
    # counts everywhere
    
    mpvcountseverywhere = last8weekcounts(mpv, date)
    guardiancountseverywhere = last8weekcounts(guardian, date)
    wapocountseverywhere = last8weekcounts(wapo, date)
    
    blackcountseverywhere = {u'MPV':mpvcountseverywhere[1],
                             u'Guardian':guardiancountseverywhere[1],
                             u'WaPo':wapocountseverywhere[1]}
    whitecountseverywhere = {u'MPV':mpvcountseverywhere[2],
                             u'Guardian':guardiancountseverywhere[2],
                             u'WaPo':wapocountseverywhere[2]}
    totalcountseverywhere = {u'MPV':mpvcountseverywhere[0],
                             u'Guardian':guardiancountseverywhere[0],
                             u'WaPo':wapocountseverywhere[0]}
    
    black8weekcountseverywhere.append(blackcountseverywhere)
    white8weekcountseverywhere.append(whitecountseverywhere)
    total8weekcountseverywhere.append(totalcountseverywhere)
    
compiled['num_black_victims_in_last_8_weeks_in_same_city'] = black8weekcounts
compiled['num_white_victims_in_last_8_weeks_in_same_city'] = white8weekcounts
compiled['num_victims_in_last_8_weeks_in_same_city'] = total8weekcounts

compiled['num_black_victims_in_last_8_weeks_everywhere'] = black8weekcountseverywhere
compiled['num_white_victims_in_last_8_weeks_everywhere'] = white8weekcountseverywhere
compiled['num_victims_in_last_8_weeks_everywhere'] = total8weekcountseverywhere

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [15]:
compiled

Unnamed: 0_level_0,name,listed_name_variants,age,date,city,state,population,armed?,in_mpv,in_guardian,...,cause_of_death,story,mental_illness?,responsible_agency,num_black_victims_in_last_8_weeks_in_same_city,num_white_victims_in_last_8_weeks_in_same_city,num_victims_in_last_8_weeks_in_same_city,num_black_victims_in_last_8_weeks_everywhere,num_white_victims_in_last_8_weeks_everywhere,num_victims_in_last_8_weeks_everywhere
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57,Xavier Tyrell Johnson,{u'MPV': u'Xavier Tyrell Johnson'},31,2013-01-04,Westwood Lakes,FL,11838,{u'MPV': u'Unarmed'},True,False,...,Gunshot,Yolanda Thomas and Xavier Tyrell Johnson had j...,No,Miami-Dade Police Department,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
41,Quintine Barksdale,{u'MPV': u'Quintine Barksdale'},43,2013-01-13,Phoenix,AZ,1445632,{u'MPV': u'Unarmed'},True,False,...,Gunshot,Barksdale was a neighbor to an off-duty state ...,No,Arizona Department of Transportation,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
49,Steven Askew,{u'MPV': u'Steven Askew'},24,2013-01-17,Memphis,TN,646889,{u'MPV': u'Unclear'},True,False,...,Gunshot,Shot to death after falling asleep in his car ...,No,Memphis Police Department,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
20,Donovan Thomas,{u'MPV': u'Donovan Thomas'},22,2013-01-19,St. Louis,MO,319294,{u'MPV': u'Unarmed'},True,False,...,Gunshot,An off-duty St Louis police officer shot and k...,No,St. Louis Metropolitan Police Department,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
4,Anthony Dwayne Harris,{u'MPV': u'Anthony Dwayne Harris'},38,2013-01-28,Jacksonville,FL,821784,{u'MPV': u'Unarmed'},True,False,...,Gunshot,Police responded to a domestic dispute call in...,No,Jacksonville Sheriff's Office,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
33,Kayla Moore,{u'MPV': u'Kayla Moore'},41,2013-02-13,Berkeley,CA,112580,{u'MPV': u'Unarmed'},True,False,...,Asphyxiation,Just before midnight on the night of Moore’s d...,Yes,Berkeley Police Department,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
10,Charles A. Baker Jr.,{u'MPV': u'Charles A. Baker Jr.'},30,2013-02-16,Jamestown,NY,31146,{u'MPV': u'Unarmed'},True,False,...,Taser,Following a traffic stop Baker was being proce...,No,Jamestown Police Department,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}"
12,Clinton Roebexar Allen,{u'MPV': u'Clinton Roebexar Allen'},25,2013-03-10,Dallas,TX,1197816,{u'MPV': u'Unarmed'},True,False,...,Gunshot,Police were called to the apartment because wo...,Drug or alcohol use,Dallas Police Department,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 2, u'Guardian': None, u'WaPo': None}","{u'MPV': 36, u'Guardian': None, u'WaPo': None}","{u'MPV': 57, u'Guardian': None, u'WaPo': None}","{u'MPV': 144, u'Guardian': None, u'WaPo': None}"
46,Russell Lydell Smith,{u'MPV': u'Russell Lydell Smith'},51,2013-03-22,Seattle,WA,608660,{u'MPV': u'Unarmed'},True,False,...,Gunshot,Officers serving a warrant on Smith backed him...,Unknown,Bellevue Police Departmen,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 0, u'Guardian': None, u'WaPo': None}","{u'MPV': 3, u'Guardian': None, u'WaPo': None}","{u'MPV': 34, u'Guardian': None, u'WaPo': None}","{u'MPV': 49, u'Guardian': None, u'WaPo': None}","{u'MPV': 131, u'Guardian': None, u'WaPo': None}"
29,Jermaine C. Coleman Jr.,{u'MPV': u'Jermaine C. Coleman Jr.'},20,2013-04-04,Dayton,OH,141527,{u'MPV': u'Unclear'},True,False,...,Gunshot,Coleman was shot to death and two associates w...,No,Federal Bureau of Investigation,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 2, u'Guardian': None, u'WaPo': None}","{u'MPV': 33, u'Guardian': None, u'WaPo': None}","{u'MPV': 44, u'Guardian': None, u'WaPo': None}","{u'MPV': 129, u'Guardian': None, u'WaPo': None}"



   * Number of articles about the person in the daterange (d-5, d+14)
   * Number of unique publishers that produced articles
   * Number of bitly clicks
   * Number of facebook shares

In [16]:
# read story data
stories = {}
stories[2013] = pd.read_csv(r'data/2013/mpv-controversy-stories.csv', index_col=0, encoding = 'utf-8')
stories[2014] = pd.read_csv(r'data/2014/mpv-controversy-stories.csv', index_col=0, encoding = 'utf-8')
stories[2015] = pd.read_csv(r'data/2015/mpv-controversy-stories.csv', index_col=0, encoding = 'utf-8')
stories[2016] = pd.read_csv(r'data/2016/mpv-controversy-stories.csv', index_col=0, encoding = 'utf-8')

In [17]:
# get coverage info for each person

num_articles = []
num_publishers = []
total_bitly_clicks = []
total_facebook_shares = []

for i in compiled.index.values:
    person = compiled['name'][i]
    year = compiled['date'][i].year
    
    if person in stories[year].index:
    
        personstories = stories[year].loc[person]
        
        if person in personstories.index: # person has more than one story, and personstories is a dataframe
            num_articles.append(len(personstories))
            num_publishers.append(len(set(personstories['media_id'])))
            total_bitly_clicks.append(sum(personstories['bitly_click_count']))
            total_facebook_shares.append(sum(personstories['facebook_share_count']))
            
        else: # person has exactly one story, and personstories is a series
            num_articles.append(1)
            num_publishers.append(1)
            total_bitly_clicks.append(personstories['bitly_click_count'])
            total_facebook_shares.append(personstories['facebook_share_count'])
    
    else: # no stories about person
        num_articles.append(0)
        num_publishers.append(0)
        total_bitly_clicks.append(0)
        total_facebook_shares.append(0)
    
compiled['num_articles'] = num_articles
compiled['num_publishers'] = num_publishers
compiled['total_bitly_clicks'] = total_bitly_clicks
compiled['total_facebook_shares'] = total_facebook_shares

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
compiled

Unnamed: 0_level_0,name,listed_name_variants,age,date,city,state,population,armed?,in_mpv,in_guardian,...,num_black_victims_in_last_8_weeks_in_same_city,num_white_victims_in_last_8_weeks_in_same_city,num_victims_in_last_8_weeks_in_same_city,num_black_victims_in_last_8_weeks_everywhere,num_white_victims_in_last_8_weeks_everywhere,num_victims_in_last_8_weeks_everywhere,num_articles,num_publishers,total_bitly_clicks,total_facebook_shares
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57,Xavier Tyrell Johnson,{u'MPV': u'Xavier Tyrell Johnson'},31,2013-01-04,Westwood Lakes,FL,11838,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",2,1,0,172
41,Quintine Barksdale,{u'MPV': u'Quintine Barksdale'},43,2013-01-13,Phoenix,AZ,1445632,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
49,Steven Askew,{u'MPV': u'Steven Askew'},24,2013-01-17,Memphis,TN,646889,{u'MPV': u'Unclear'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
20,Donovan Thomas,{u'MPV': u'Donovan Thomas'},22,2013-01-19,St. Louis,MO,319294,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
4,Anthony Dwayne Harris,{u'MPV': u'Anthony Dwayne Harris'},38,2013-01-28,Jacksonville,FL,821784,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
33,Kayla Moore,{u'MPV': u'Kayla Moore'},41,2013-02-13,Berkeley,CA,112580,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
10,Charles A. Baker Jr.,{u'MPV': u'Charles A. Baker Jr.'},30,2013-02-16,Jamestown,NY,31146,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}","{u'MPV': None, u'Guardian': None, u'WaPo': None}",0,0,0,0
12,Clinton Roebexar Allen,{u'MPV': u'Clinton Roebexar Allen'},25,2013-03-10,Dallas,TX,1197816,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 2, u'Guardian': None, u'WaPo': None}","{u'MPV': 36, u'Guardian': None, u'WaPo': None}","{u'MPV': 57, u'Guardian': None, u'WaPo': None}","{u'MPV': 144, u'Guardian': None, u'WaPo': None}",1,1,0,617
46,Russell Lydell Smith,{u'MPV': u'Russell Lydell Smith'},51,2013-03-22,Seattle,WA,608660,{u'MPV': u'Unarmed'},True,False,...,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 0, u'Guardian': None, u'WaPo': None}","{u'MPV': 3, u'Guardian': None, u'WaPo': None}","{u'MPV': 34, u'Guardian': None, u'WaPo': None}","{u'MPV': 49, u'Guardian': None, u'WaPo': None}","{u'MPV': 131, u'Guardian': None, u'WaPo': None}",0,0,0,0
29,Jermaine C. Coleman Jr.,{u'MPV': u'Jermaine C. Coleman Jr.'},20,2013-04-04,Dayton,OH,141527,{u'MPV': u'Unclear'},True,False,...,"{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 1, u'Guardian': None, u'WaPo': None}","{u'MPV': 2, u'Guardian': None, u'WaPo': None}","{u'MPV': 33, u'Guardian': None, u'WaPo': None}","{u'MPV': 44, u'Guardian': None, u'WaPo': None}","{u'MPV': 129, u'Guardian': None, u'WaPo': None}",0,0,0,0


In [19]:
# note that there are some NaNs for bitly and facebook counts, because some articles are missing those counts. 
# for instance, Sherman Evans:

stories[2016].loc['Sherman Evans']

Unnamed: 0_level_0,first_name,last_name,sex,date_of_death,age,city,state,cause,population,story_date,stories_id,media_id,media_name,bitly_click_count,facebook_share_count,url,num_sentences
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 17:07:32,486858212,1747,Daily Mail,0,0,http://www.dailymail.co.uk/wires/ap/article-36...,11
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 17:13:42,486859514,2,Washington Post,28,0,https://www.washingtonpost.com/video/local/dc-...,4
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 17:13:42,486859513,2,Washington Post,2,84,https://www.washingtonpost.com/local/public-sa...,46
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 17:05:01,486860739,14,San Francisco Chronicle,0,0,http://www.sfgate.com/news/crime/article/DC-po...,9
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 18:40:35,486882057,64866,usatoday.com,45,68,http://rssfeeds.usatoday.com/~/163933386/0/usa...,24
Sherman Evans,,,,2016-06-27,,,,,,2016-07-07 19:11:48,486886311,1747,Daily Mail,86,100,http://www.dailymail.co.uk/news/article-367995...,33


### Article dataset:
   * Name
   * Race
   * date of death
   * Gender
   * (any other details about the death)
      - pull all these from the above person dataset
        
   * Publisher name
   * Bitly clicks to this article
   * facebook shares on this article
   * number of sentences in article 

In [20]:
# note that many articles mention multiple people! not sure how to deal with this yet

allstories = pd.concat([stories[2013], stories[2014], stories[2015], stories[2016]])
allstories = allstories.sort('story_date')
allstories



Unnamed: 0_level_0,first_name,last_name,sex,date_of_death,age,city,state,cause,population,story_date,stories_id,media_id,media_name,bitly_click_count,facebook_share_count,url,num_sentences
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Xavier Tyrell Johnson,,,,2013-01-04,,,,,,2013-01-06 16:00:09,97112906,1349,WFOR CBS 4,0,9,http://miami.cbslocal.com/2013/01/06/names-rel...,16
Xavier Tyrell Johnson,,,,2013-01-04,,,,,,2013-01-08 04:26:24,97205207,1349,WFOR CBS 4,0,163,http://miami.cbslocal.com/2013/01/07/family-of...,29
Clinton Roebexar Allen,,,,2013-03-10,,,,,,2013-03-22 04:27:14,104154264,1200,KTVT CBS 11,0,617,http://dfw.cbslocal.com/2013/03/21/dpd-officer...,24
Dylan Samuel-Peters,,,,2013-04-15,,,,,,2013-04-16 05:43:15,109681809,7,The New York Post,0,0,http://www.nypost.com/p/news/local/brooklyn/ny...,37
Dason Peters,,,,2013-04-15,,,,,,2013-04-16 05:43:15,109681809,7,The New York Post,0,0,http://www.nypost.com/p/news/local/brooklyn/ny...,37
Dason Peters,,,,2013-04-15,,,,,,2013-04-16 12:31:02,109788057,184,Riehl World View,0,0,http://riehlworldview.com/2013/04/ny-cop-roset...,6
Dylan Samuel-Peters,,,,2013-04-15,,,,,,2013-04-16 12:31:02,109788057,184,Riehl World View,0,0,http://riehlworldview.com/2013/04/ny-cop-roset...,6
Dason Peters,,,,2013-04-15,,,,,,2013-04-16 15:34:00,109811352,5810,Gothamist,0,5,http://feeds.gothamistllc.com/click.phdo?i=d7e...,4
Dylan Samuel-Peters,,,,2013-04-15,,,,,,2013-04-16 15:34:00,109811352,5810,Gothamist,0,5,http://feeds.gothamistllc.com/click.phdo?i=d7e...,4
Dason Peters,,,,2013-04-15,,,,,,2013-04-16 16:41:15,109834551,8,The Daily News New York,0,112,http://feeds.nydailynews.com/~r/nydnrss/gossip...,47


In [22]:
# need to add num_sentences once topicStoryList() is fixed and i can actually get the sentence counts for each story

articleset = allstories[['stories_id', 
                         'story_date', 
                         'media_id', 
                         'media_name', 
                         'bitly_click_count', 
                         'facebook_share_count', 
                         'url',
                         'num_sentences']]
articleset['associated_name'] = articleset.index.values
articleset.set_index('stories_id', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
# match to id numbers used in person dataset above
names_to_num = pd.Series(data=compiled.index.values, index=compiled['name'])
articleset['number'] = list(names_to_num[articleset['associated_name']])

# lookup person data
articleset['age'] = list(compiled['age'][articleset['number']])
articleset['date'] = list(compiled['date'][articleset['number']])
articleset['city'] = list(compiled['city'][articleset['number']])
articleset['state'] = list(compiled['state'][articleset['number']])
articleset['population'] = list(compiled['population'][articleset['number']])
articleset['armed?'] = list(compiled['armed?'][articleset['number']])
articleset['in_mpv'] = list(compiled['in_mpv'][articleset['number']])
articleset['in_guardian'] = list(compiled['in_guardian'][articleset['number']])
articleset['in_wapo'] = list(compiled['in_wapo'][articleset['number']])
articleset['race'] = list(compiled['race'][articleset['number']])
articleset['gender'] = list(compiled['gender'][articleset['number']])
articleset['cause_of_death'] = list(compiled['cause_of_death'][articleset['number']])
articleset['mental_illness?'] = list(compiled['mental_illness?'][articleset['number']])
articleset['responsible_agency'] = list(compiled['responsible_agency'][articleset['number']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [24]:
articleset

Unnamed: 0_level_0,story_date,media_id,media_name,bitly_click_count,facebook_share_count,url,num_sentences,associated_name,number,age,...,population,armed?,in_mpv,in_guardian,in_wapo,race,gender,cause_of_death,mental_illness?,responsible_agency
stories_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
97112906,2013-01-06 16:00:09,1349,WFOR CBS 4,0,9,http://miami.cbslocal.com/2013/01/06/names-rel...,16,Xavier Tyrell Johnson,57,31,...,11838,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,No,Miami-Dade Police Department
97205207,2013-01-08 04:26:24,1349,WFOR CBS 4,0,163,http://miami.cbslocal.com/2013/01/07/family-of...,29,Xavier Tyrell Johnson,57,31,...,11838,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,No,Miami-Dade Police Department
104154264,2013-03-22 04:27:14,1200,KTVT CBS 11,0,617,http://dfw.cbslocal.com/2013/03/21/dpd-officer...,24,Clinton Roebexar Allen,12,25,...,1197816,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Drug or alcohol use,Dallas Police Department
109681809,2013-04-16 05:43:15,7,The New York Post,0,0,http://www.nypost.com/p/news/local/brooklyn/ny...,37,Dylan Samuel-Peters,22,1,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109681809,2013-04-16 05:43:15,7,The New York Post,0,0,http://www.nypost.com/p/news/local/brooklyn/ny...,37,Dason Peters,16,33,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109788057,2013-04-16 12:31:02,184,Riehl World View,0,0,http://riehlworldview.com/2013/04/ny-cop-roset...,6,Dason Peters,16,33,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109788057,2013-04-16 12:31:02,184,Riehl World View,0,0,http://riehlworldview.com/2013/04/ny-cop-roset...,6,Dylan Samuel-Peters,22,1,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109811352,2013-04-16 15:34:00,5810,Gothamist,0,5,http://feeds.gothamistllc.com/click.phdo?i=d7e...,4,Dason Peters,16,33,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109811352,2013-04-16 15:34:00,5810,Gothamist,0,5,http://feeds.gothamistllc.com/click.phdo?i=d7e...,4,Dylan Samuel-Peters,22,1,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department
109834551,2013-04-16 16:41:15,8,The Daily News New York,0,112,http://feeds.nydailynews.com/~r/nydnrss/gossip...,47,Dason Peters,16,33,...,2504700,{u'MPV': u'Unarmed'},True,False,False,Black,Male,Gunshot,Yes,New York Police Department


## export data

In [25]:
compiled.to_csv(r'data/people-dataset.csv', encoding='utf-8')
articleset.to_csv(r'data/article-dataset.csv', encoding='utf-8')