In [1]:
import pandas as pd
import pickle
import nltk
import unirest
import os

from nltk.tag import pos_tag

ImportError: No module named nltk

### (unsuccessfully) Using the CPSC API

In [49]:
key = os.environ.get('CPSC_KEY')
resp = unirest.get('http://www.saferproducts.gov/webapi/Cpsc.Cpsrms.Web.Api.svc/',
                   auth=(key,''), headers={"Accept": "application/json"})

### Cleaning up Raw Data

In [None]:
data = pd.read_json('data/raw_api_data.txt')

cols_to_parse = ['Gender', 'SeverityType', 'Locale', 'ProductCategory']
cols_to_add = [['GenderDescription','GenderId','GenderPublicName'],
['IncidentDetails','SeverityTypeDescription','SeverityTypePublicName'],
['LocaleDescription','LocalePublicName'],
['ProductCategoryDescription','ProductCategoryPublicName']]
new_df = pd.DataFrame()
cols_dict = dict(zip(cols_to_parse, cols_to_add))

for key in cols_dict:
    placeholder = pd.DataFrame(columns = cols_dict[key])
    for (i, row) in data.iterrows():
        e = row[key]
        value_holder = []
        for item in cols_dict[key]:
            try:
                component = e[item]
            except:
                component = 'Missing'
            value_holder.append(component)
        placeholder.loc[i, :] = value_holder
    if new_df.shape[0] == 0:
        new_df = placeholder
    else:
        new_df = pd.concat([new_df, placeholder], axis=1)
        
new_df2 = pd.concat([data, new_df], axis=1)
new_df2 = new_df2.drop(['CompanyComments', 'Gender','IncidentDocuments','IncidentDetails', 'Locale', 'ProductCategory',
                      'RelationshipType','SeverityType', 'SourceType'], axis=1)
new_df2.to_pickle('data/cleaned_api_data')

### Beginning of Analysis

In [7]:
# neiss = pd.read_csv('/NEISS-data-2015-updated-APRIL2016.csv')
data = pickle.load(open('/home/datauser/cpsc/data/cleaned_api_data', 'rb'))
neiss = pd.read_csv('/home/datauser/cpsc/data/NEISS-data-2015-updated-APRIL2016.csv')

products = data.ProductCategoryPublicName.drop_duplicates()

In [8]:
products.head(20)

0         Hair Curlers, Curling Irons, Clips & Hairpins
1                                                 Cribs
2     Electric Ranges or Ovens (Excl Counter-top Ovens)
3                                         Refrigerators
4                                               Diapers
5                                           Pogo Sticks
8                           Coal or Wood-burning Stoves
9                                           Televisions
10       Candles, Candlesticks and Other Candle Holders
11                                Sheets or Pillowcases
12                               Baby Gates or Barriers
13                                             Footwear
15                                 Clothing Accessories
16                                            Batteries
18                                          Light Bulbs
19                                          Dishwashers
20                              Toy Musical Instruments
21                              Lighted Make-up 

In [9]:
class parser(object):
    
    def __init__(self, items_list):
        self.items_list = items_list
        self.product_list = []
        
    @staticmethod
    def paren_split(item):
        return item.split('(')[0]

    def step_one(self, item):
        if len(item.split(' ')) > 1:
            parsed_items = []
            if ')' in item:
                temp_list = self.paren_split(item)
            elif ',' in item:
                temp_list = item.split(',')
            else:
                temp_list = item
            return temp_list
        else:
            return item
        
    @staticmethod
    def step_two(item):
        if isinstance(item, list):
            step_two_results = []
            for each in item:
                if ' or ' in each:
                    results = each.split(' or ')
                elif ' and ' in each:
                    results = each.split(' and ')
                elif ' & ' in each:
                    results = each.split(' & ')
                else:
                    results = each
                step_two_results.append(results)
        else:
            step_two_results = item
        return step_two_results
                    
    def flatten(self):
        temp = [item for sublist in self.parsed if isinstance(sublist, list) for item in sublist]
        return [item.lower().strip() for sublist in temp for item in sublist if isinstance(sublist, list)]
    
    @staticmethod
    def remove_boolean(item):
        remove_criterias = ['other', 'not specified', ',', '.']
        return any(criterion in item for criterion in remove_criterias)
        
    @staticmethod
    def deduplicate_list(raw_list):
        deduped = []
        for i in raw_list:
            if i not in deduped:
                deduped.append(i)
        return deduped
    
    def clean_up_list(self, item_list):
        removed_list = [self.paren_split(item) for item in item_list if not self.remove_boolean(item) and item != '']
        removed_list = self.deduplicate_list(removed_list)
        self.cleaned = removed_list
        return self.cleaned
    
    def run_parser(self):
        results = []
        for item in self.items_list:
            parsed = self.step_one(item)
            results.append(parsed)
        next_step = []
        for item in results:
            parsed = self.step_two(item)
            next_step.append(parsed)
        self.parsed = next_step
        return self.parsed

    def post_parse(self):
        flattened = self.flatten()
        return self.clean_up_list(flattened)
    
    def run(self):
        self.run_parser()
        self.post_parse()
        return self.cleaned

In [10]:
test = parser(products)
test.run()[1:20]

[u'hairpins',
 u'candlesticks',
 u'infant',
 u'toddler play ctrs',
 u'ranges',
 u'ovens',
 u'action figures',
 u'coffee makers',
 u'teapots',
 u'stacking toys',
 u'pull toys',
 u'gas',
 u'lp heaters',
 u'broilers',
 u'toaster ovens',
 u'divans',
 u'studio couches',
 u'housewares',
 u'appliances']

### Natural Language Processing Approach

In [234]:
import nltk
text = neiss.narr1[2]
tokened = nltk.word_tokenize(text)
print(text)
nltk.pos_tag(tokened)

20 YO F C/O EAR PAIN 1 DAY SAS WAS SWIMMING YESTERDAY NOTICED DISCOMFOR


[('20', 'CD'),
 ('YO', 'NNP'),
 ('F', 'NNP'),
 ('C/O', 'NNP'),
 ('EAR', 'NNP'),
 ('PAIN', 'NNP'),
 ('1', 'CD'),
 ('DAY', 'NNP'),
 ('SAS', 'NNP'),
 ('WAS', 'NNP'),
 ('SWIMMING', 'NNP'),
 ('YESTERDAY', 'NNP'),
 ('NOTICED', 'NNP'),
 ('DISCOMFOR', 'NNP')]

In [233]:
text = data.IncidentDescription[2]
tokened = nltk.word_tokenize(text)
print(text)
nltk.pos_tag(tokened)

I have a Frigidaire electric range that comes on without being turned on and the only way to get it to go off is to unplug it.  It is very unsafe and I'm afraid to leave in plugged in when it's not in use.


[(u'I', 'PRP'),
 (u'have', 'VBP'),
 (u'a', 'DT'),
 (u'Frigidaire', 'NNP'),
 (u'electric', 'JJ'),
 (u'range', 'NN'),
 (u'that', 'WDT'),
 (u'comes', 'VBZ'),
 (u'on', 'IN'),
 (u'without', 'IN'),
 (u'being', 'VBG'),
 (u'turned', 'VBN'),
 (u'on', 'IN'),
 (u'and', 'CC'),
 (u'the', 'DT'),
 (u'only', 'JJ'),
 (u'way', 'NN'),
 (u'to', 'TO'),
 (u'get', 'VB'),
 (u'it', 'PRP'),
 (u'to', 'TO'),
 (u'go', 'VB'),
 (u'off', 'RP'),
 (u'is', 'VBZ'),
 (u'to', 'TO'),
 (u'unplug', 'VB'),
 (u'it', 'PRP'),
 (u'.', '.'),
 (u'It', 'PRP'),
 (u'is', 'VBZ'),
 (u'very', 'RB'),
 (u'unsafe', 'JJ'),
 (u'and', 'CC'),
 (u'I', 'PRP'),
 (u"'m", 'VBP'),
 (u'afraid', 'JJ'),
 (u'to', 'TO'),
 (u'leave', 'VB'),
 (u'in', 'IN'),
 (u'plugged', 'VBN'),
 (u'in', 'IN'),
 (u'when', 'WRB'),
 (u'it', 'PRP'),
 (u"'s", 'VBZ'),
 (u'not', 'RB'),
 (u'in', 'IN'),
 (u'use', 'NN'),
 (u'.', '.')]