# Table of Contents
 <p><div class="lev3 toc-item"><a href="#Using-the-CPSC-API-(unsuccessfully)" data-toc-modified-id="Using-the-CPSC-API-(unsuccessfully)-0.0.1"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>Using the CPSC API (unsuccessfully)</a></div><div class="lev3 toc-item"><a href="#Cleaning-up-Raw-Data" data-toc-modified-id="Cleaning-up-Raw-Data-0.0.2"><span class="toc-item-num">0.0.2&nbsp;&nbsp;</span>Cleaning up Raw Data</a></div><div class="lev3 toc-item"><a href="#Beginning-of-Analysis" data-toc-modified-id="Beginning-of-Analysis-0.0.3"><span class="toc-item-num">0.0.3&nbsp;&nbsp;</span>Beginning of Analysis</a></div><div class="lev3 toc-item"><a href="#Natural-Language-Processing-Approach" data-toc-modified-id="Natural-Language-Processing-Approach-0.0.4"><span class="toc-item-num">0.0.4&nbsp;&nbsp;</span>Natural Language Processing Approach</a></div>

In [7]:
import pandas as pd
import pickle
import nltk
import unirest
import os

from nltk.tag import pos_tag

### Using the CPSC API (unsuccessfully)

Here I try to make a request to the CPSC API, but it returns a blank response even though the status code is 200.

In [None]:
# Read the API key from the CPSC_KEY environment variable so the credential is not
# hardcoded in the notebook. `os.environ.get` yields None when the variable is unset.
key = os.environ.get('CPSC_KEY')
# Issue a GET against the service root, passing the key as the first element of `auth`
# (with an empty second element) and asking for JSON via the Accept header.
# NOTE(review): the URL stops at the `.svc/` root with no resource path — possibly why
# the response comes back blank; confirm against the API documentation.
resp = unirest.get('http://www.saferproducts.gov/webapi/Cpsc.Cpsrms.Web.Api.svc/',
                   auth=(key,''), headers={"Accept": "application/json"})

### Cleaning up Raw Data

Rather than calling the API, we received the data from an epidemiologist at the CPSC as a raw `.txt` file. The file is actually JSON, and several of its fields are nested objects that need to be parsed out into flat columns, so I wrote the script below.

In [None]:
data = pd.read_json('data/raw_api_data.txt')

# Columns of `data` that hold nested objects, and for each one the fields to lift
# out into flat columns (the two lists are matched up by position).
# NOTE(review): 'IncidentDetails' is requested as a SeverityType field and is also
# dropped by name further down, so that extracted column never reaches the output —
# confirm this is intended.
cols_to_parse = ['Gender', 'SeverityType', 'Locale', 'ProductCategory']
cols_to_add = [['GenderDescription', 'GenderId', 'GenderPublicName'],
               ['IncidentDetails', 'SeverityTypeDescription', 'SeverityTypePublicName'],
               ['LocaleDescription', 'LocalePublicName'],
               ['ProductCategoryDescription', 'ProductCategoryPublicName']]
# Kept for any later cell that looks fields up by column name.
cols_dict = dict(zip(cols_to_parse, cols_to_add))


def _expand_nested_column(frame, column, fields):
    """Lift `fields` out of the nested objects stored in `frame[column]`.

    Returns a DataFrame indexed like `frame` with one column per entry in
    `fields`. Any field that cannot be read from a cell (the key is absent, or
    the cell is not subscriptable by that key, e.g. None/NaN) is filled with the
    string 'Missing'.
    """
    records = []
    for nested in frame[column]:
        values = []
        for field in fields:
            try:
                values.append(nested[field])
            # Only the lookup failures we expect; anything else should surface.
            except (KeyError, TypeError, IndexError):
                values.append('Missing')
        records.append(values)
    # Build the frame once instead of growing it row by row with .loc (quadratic).
    # dtype=object matches what the row-by-row assignment used to produce.
    return pd.DataFrame(records, index=frame.index, columns=fields, dtype=object)


# Iterate over the zipped lists rather than the dict so column order is deterministic
# on every Python version. The loop variables are deliberately not called `key`, which
# would overwrite the API key read from the environment in the request cell.
new_df = pd.concat(
    [_expand_nested_column(data, column, fields)
     for column, fields in zip(cols_to_parse, cols_to_add)],
    axis=1)

# Attach the flattened fields to the original rows, then drop the nested source
# columns and the other columns that are not wanted in the cleaned output.
new_df2 = pd.concat([data, new_df], axis=1)
new_df2 = new_df2.drop(['CompanyComments', 'Gender', 'IncidentDocuments', 'IncidentDetails',
                        'Locale', 'ProductCategory', 'RelationshipType', 'SeverityType',
                        'SourceType'], axis=1)
new_df2.to_pickle('data/cleaned_api_data')

In [19]:
# data9 = pd.read_excel('../data/raw/NEISS/NEISS-data-2009-updated-12MAY2015.xlsx')
# Empty placeholder frame; nothing in this cell fills it.
NEISS = pd.DataFrame()
# Absolute path to the raw NEISS workbooks; the leading '/' component anchors the
# joined path at the filesystem root.
working_path = os.path.join('/','home','datauser','cpsc','data','raw','NEISS')
# Last expression in the cell, so the directory listing is shown as the cell output.
os.listdir(working_path)

['NEISS-data-2015-updated-APRIL2016.xlsx',
 'NEISS-data-2014-updated-12MAY2015.xlsx',
 'NEISS-data-2009-updated-12MAY2015.xlsx',
 'NEISS-data-2011-updated-12MAY2015.xlsx',
 'NEISS-data-2013-updated-12MAY2015.xlsx',
 'NEISS-data-2012-updated-12MAY2015.xlsx',
 'NEISS-data-2010-updated-12MAY2015.xlsx']

In [17]:
# NOTE(review): `working` is not assigned in any visible cell, so this cell relies on
# leftover kernel state and would raise NameError after a restart — confirm or remove.
working

'/home/datauser/cpsc/notebooks'

In [18]:
# Display the NEISS directory path.
# NOTE(review): the saved output for this cell has no leading '/', whereas the
# os.path.join call that defines `working_path` starts with '/'. The output appears
# to come from an earlier version of that cell — re-run to refresh.
working_path

'home/datauser/cpsc/data/raw/NEISS'

In [6]:
# NEISS yearly workbooks, one entry per line. The order is kept exactly as originally
# written (column-wise `ls` order), not sorted by year.
files = [
    'NEISS-data-2009-updated-12MAY2015.xlsx',
    'NEISS-data-2013-updated-12MAY2015.xlsx',
    'NEISS-data-2010-updated-12MAY2015.xlsx',
    'NEISS-data-2014-updated-12MAY2015.xlsx',
    'NEISS-data-2011-updated-12MAY2015.xlsx',
    'NEISS-data-2015-updated-APRIL2016.xlsx',
    'NEISS-data-2012-updated-12MAY2015.xlsx',
]

NEISS-data-2009-updated-12MAY2015.xlsx  NEISS-data-2013-updated-12MAY2015.xlsx
NEISS-data-2010-updated-12MAY2015.xlsx  NEISS-data-2014-updated-12MAY2015.xlsx
NEISS-data-2011-updated-12MAY2015.xlsx  NEISS-data-2015-updated-APRIL2016.xlsx
NEISS-data-2012-updated-12MAY2015.xlsx


In [None]:
# `re` is not among the notebook's setup imports, so import it here; without this the
# cell raises NameError on a fresh kernel.
import re

# Compile a pattern matching the literal text 'NEISS'. The compiled pattern is the
# cell's last expression, so it is displayed rather than stored.
re.compile(r'NEISS')

### Beginning of Analysis

This is just a brute force way of parsing out the items.

In [None]:
# neiss = pd.read_csv('/NEISS-data-2015-updated-APRIL2016.csv')
# Load the cleaned incident-report frame. The context manager closes the file handle
# as soon as unpickling finishes (the bare open() call used to leave it open).
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
# NOTE(review): the cleaning script writes to the relative path 'data/cleaned_api_data',
# while this cell reads from an absolute 'processed' directory — confirm that the file
# is moved there.
with open('/home/datauser/cpsc/data/processed/cleaned_api_data', 'rb') as pickle_file:
    data = pickle.load(pickle_file)
# Load the 2015 NEISS extract from CSV.
neiss = pd.read_csv('/home/datauser/cpsc/data/raw/NEISS-data-2015-updated-APRIL2016.csv')


In [None]:
# Preview the first 20 entries of `products`.
# NOTE(review): `products` is not assigned in any visible cell, so this relies on
# leftover kernel state — add the cell that derives it, or this fails on a fresh kernel.
products.head(20)

In [None]:
class parser(object):
    """Brute-force splitter that turns free-text product labels into product names.

    Pipeline (see `run`):
      1. `step_one`  - for multi-word labels, cut off a parenthetical or split on commas.
      2. `step_two`  - split each comma piece on ' or ', ' and ' or ' & '.
      3. `flatten`   - collapse the nested result into one list of lower-cased,
                       stripped strings.
      4. `clean_up_list` - drop unwanted entries, trim, and de-duplicate in order.

    Attributes:
        items_list:   the iterable of raw label strings given to the constructor.
        product_list: initialised to an empty list; not written by any method here.
        parsed:       nested output of `run_parser` (strings and lists of strings).
        cleaned:      final list produced by `clean_up_list`.
    """

    def __init__(self, items_list):
        self.items_list = items_list
        self.product_list = []
        # Initialised up front so flatten()/run() never hit a missing attribute.
        self.parsed = []
        self.cleaned = []

    @staticmethod
    def paren_split(item):
        """Return the part of `item` before the first '(' (all of it if there is none)."""
        return item.split('(')[0]

    def step_one(self, item):
        """First pass over one raw label.

        Single-word labels are returned unchanged. For multi-word labels:
        a label containing ')' is cut at its first '(' (returns a string);
        otherwise a label containing ',' is split on commas (returns a list);
        otherwise the label is returned unchanged.
        """
        if len(item.split(' ')) <= 1:
            return item
        if ')' in item:
            return self.paren_split(item)
        if ',' in item:
            return item.split(',')
        return item

    @staticmethod
    def step_two(item):
        """Second pass: split the pieces of a comma-split label on conjunctions.

        If `item` is a list, each element is split on the first separator it
        contains, checked in the order ' or ', ' and ', ' & '; elements without a
        separator are kept as strings. Anything that is not a list is returned
        unchanged.
        """
        if not isinstance(item, list):
            return item
        step_two_results = []
        for each in item:
            if ' or ' in each:
                results = each.split(' or ')
            elif ' and ' in each:
                results = each.split(' and ')
            elif ' & ' in each:
                results = each.split(' & ')
            else:
                results = each
            step_two_results.append(results)
        return step_two_results

    def flatten(self):
        """Collapse `self.parsed` into a flat list of lower-cased, stripped strings.

        Every string is kept regardless of nesting depth, in original order.
        The earlier implementation kept only strings nested exactly two lists
        deep, which silently discarded single-word labels, parenthesised labels
        and comma pieces that contained no conjunction.
        """
        def leaves(node):
            # Depth-first walk that yields strings in their original order.
            if isinstance(node, list):
                for child in node:
                    for leaf in leaves(child):
                        yield leaf
            else:
                yield node

        return [leaf.lower().strip() for leaf in leaves(self.parsed)]

    @staticmethod
    def remove_boolean(item):
        """Return True if `item` should be discarded.

        This is a substring test: an item is discarded when it contains 'other',
        'not specified', a comma or a period anywhere in it.
        """
        remove_criterias = ['other', 'not specified', ',', '.']
        return any(criterion in item for criterion in remove_criterias)

    @staticmethod
    def deduplicate_list(raw_list):
        """Return `raw_list` without repeats, keeping first occurrences in order."""
        deduped = []
        for i in raw_list:
            if i not in deduped:
                deduped.append(i)
        return deduped

    def clean_up_list(self, item_list):
        """Filter, trim and de-duplicate `item_list`; store and return the result.

        Empty strings and items flagged by `remove_boolean` are dropped. Each
        remaining item is cut at its first '(' and stripped; names that end up
        empty after that are dropped too, so the output never contains '' or
        whitespace-padded entries.
        """
        candidates = []
        for item in item_list:
            if item == '' or self.remove_boolean(item):
                continue
            name = self.paren_split(item).strip()
            if name:
                candidates.append(name)
        self.cleaned = self.deduplicate_list(candidates)
        return self.cleaned

    def run_parser(self):
        """Apply `step_one` then `step_two` to every raw label; store in `self.parsed`."""
        first_pass = [self.step_one(item) for item in self.items_list]
        self.parsed = [self.step_two(item) for item in first_pass]
        return self.parsed

    def post_parse(self):
        """Flatten `self.parsed` and clean the result; returns the cleaned list."""
        flattened = self.flatten()
        return self.clean_up_list(flattened)

    def run(self):
        """Run the full pipeline and return the cleaned list of product names."""
        self.run_parser()
        self.post_parse()
        return self.cleaned

In [None]:
# Run the parser over `products` and display a slice of the cleaned names.
# NOTE(review): `products` is not assigned in any visible cell — confirm where it comes from.
# NOTE(review): the slice starts at index 1, so the first cleaned entry is not shown —
# confirm this is intended.
test = parser(products)
test.run()[1:20]

### Natural Language Processing Approach

In [None]:
# `nltk` is already imported in the notebook's first code cell; this repeat import is
# harmless and lets the cell run on its own.
import nltk
# Take the `narr1` entry at index 2 of the NEISS frame as a sample text.
text = neiss.narr1[2]
# Split the text into word tokens.
tokened = nltk.word_tokenize(text)
print(text)
# Part-of-speech tag the tokens; as the last expression, the tagged result is displayed.
nltk.pos_tag(tokened)

In [None]:
# Same tokenise-and-tag steps, applied to the `IncidentDescription` entry at index 2
# of the cleaned incident-report frame. This reuses (and overwrites) `text` and `tokened`.
text = data.IncidentDescription[2]
tokened = nltk.word_tokenize(text)
print(text)
# Last expression, so the tagged tokens are displayed as the cell output.
nltk.pos_tag(tokened)