# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Using-the-CPSC-API-(unsuccessfully)" data-toc-modified-id="Using-the-CPSC-API-(unsuccessfully)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Using the CPSC API (unsuccessfully)</a></div><div class="lev1 toc-item"><a href="#Cleaning-up-Raw-Data" data-toc-modified-id="Cleaning-up-Raw-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Cleaning up Raw Data</a></div><div class="lev2 toc-item"><a href="#API-Data" data-toc-modified-id="API-Data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>API Data</a></div><div class="lev2 toc-item"><a href="#NEISS-Data" data-toc-modified-id="NEISS-Data-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>NEISS Data</a></div><div class="lev1 toc-item"><a href="#Beginning-of-Analysis" data-toc-modified-id="Beginning-of-Analysis-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Beginning of Analysis</a></div><div class="lev3 toc-item"><a href="#Natural-Language-Processing-Approach" data-toc-modified-id="Natural-Language-Processing-Approach-3.0.1"><span class="toc-item-num">3.0.1&nbsp;&nbsp;</span>Natural Language Processing Approach</a></div>

In [61]:
import os
import pickle

import numpy as np
import pandas as pd
import unirest

# Using the CPSC API (unsuccessfully)

Here I try to make a request to the CPSC API, but it returns a blank response even though the status code is 200.

In [None]:
# The API key is read from the environment (None if CPSC_KEY is unset) and
# sent as the user name of the auth pair, with an empty password.
key = os.environ.get('CPSC_KEY')
resp = unirest.get(
    'http://www.saferproducts.gov/webapi/Cpsc.Cpsrms.Web.Api.svc/',
    headers={"Accept": "application/json"},
    auth=(key, ''),
)

# Cleaning up Raw Data

## API Data
Rather than calling the API, we received the data from the CPSC epidemiologist as a raw `.txt` file. The file actually contains JSON with several levels of nesting that need to be parsed out, so I wrote the script below.

In [None]:
# Load the raw export; each column listed in `cols_to_parse` holds a nested
# object whose fields are pulled out into their own flat columns below.
data = pd.read_json('data/raw_api_data.txt')

cols_to_parse = ['Gender', 'SeverityType', 'Locale', 'ProductCategory']
cols_to_add = [['GenderDescription','GenderId','GenderPublicName'],
['IncidentDetails','SeverityTypeDescription','SeverityTypePublicName'],
['LocaleDescription','LocalePublicName'],
['ProductCategoryDescription','ProductCategoryPublicName']]
cols_dict = dict(zip(cols_to_parse, cols_to_add))

# Build one flat frame per nested column. Rows are collected in a plain list
# and turned into a DataFrame once, instead of growing a frame row by row.
parsed_frames = []
for nested_col in cols_to_parse:  # list order keeps the output column order stable
    sub_keys = cols_dict[nested_col]
    records = []
    for nested in data[nested_col]:
        record = []
        for sub_key in sub_keys:
            try:
                record.append(nested[sub_key])
            except (KeyError, TypeError, IndexError):
                # Field absent, or the cell is not a mapping at all
                # (e.g. NaN/None): record a marker rather than failing.
                record.append('Missing')
        records.append(record)
    # object dtype lets real values and the 'Missing' marker share a column.
    parsed_frames.append(
        pd.DataFrame(records, index=data.index, columns=sub_keys, dtype=object))

new_df = pd.concat(parsed_frames, axis=1)

# Attach the flattened fields, then drop the nested originals and the other
# columns that are not carried into the cleaned data set.
new_df2 = pd.concat([data, new_df], axis=1)
new_df2 = new_df2.drop(['CompanyComments', 'Gender','IncidentDocuments','IncidentDetails', 'Locale', 'ProductCategory',
                      'RelationshipType','SeverityType', 'SourceType'], axis=1)
new_df2.to_pickle('data/cleaned_api_data')

## NEISS Data

Combining all the NEISS data from 2009 through 2015. The 2015 file was formatted differently, so I changed it manually and then combined all the files into a single data frame. The raw NEISS data wasn't so bad to work with, but I figured we could compress it just to save space in the `processed` folder.

In [9]:
# Compress the raw 2009-2014 files and save them in the processed folder.
raw_path = os.environ.get('RAW_FILE_PATH')
cleaned_path = os.environ.get('PROCESSED_PATH')
if not raw_path or not cleaned_path:
    # Fail loudly instead of passing None on to os.listdir / os.path.join.
    raise EnvironmentError('RAW_FILE_PATH and PROCESSED_PATH must both be set')

for excel in sorted(os.listdir(raw_path)):
    # os.listdir() returns bare file names, so the directory must be joined
    # back on; otherwise the read only works when the cwd is raw_path.
    data = pd.read_csv(os.path.join(raw_path, excel))
    # NOTE(review): to_csv writes the row index by default, which comes back
    # as an "Unnamed: 0" column on re-read -- consider index=False.
    data.to_csv(os.path.join(cleaned_path, excel), compression='gzip')

In [62]:
# Combine the 2009-2014 files into a single dataframe. Frames are collected
# in a list and concatenated once, rather than re-copying the growing frame
# on every iteration.
yearly_frames = []
for i in range(2009, 2015):
    filepath = '~/cpsc/data/processed/neiss/neiss-' + str(i) + '.csv'
    temp = pd.read_csv(filepath, compression='gzip')
    temp['year'] = i  # tag every row with the year of its source file
    yearly_frames.append(temp)
data = pd.concat(yearly_frames)
# These years get an all-NaN 'narr2' column.
data['narr2'] = np.nan

In [78]:
# Process the 2015 file separately and append it to the 2009-2014 frame.
latest = pd.read_csv('~/cpsc/data/processed/neiss/neiss-2015.csv', compression='gzip')
# Integer, to match the int years assigned to the other files; a string here
# would leave the combined 'year' column with mixed types.
latest['year'] = 2015
columns = latest.columns.values.tolist()
# Swap the last two columns so 'year' precedes what was the file's last column.
new_cols = columns[:-2] + [columns[-1], columns[-2]]
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
final = pd.concat([data, latest.loc[:, new_cols]])
final.to_csv('~/cpsc/data/processed/neiss-combined.csv', compression='gzip')
final.head()

Unnamed: 0.1,CPSC Case #,Unnamed: 0,age,body_part,diag,diag_other,disposition,fmv,location,narr1,...,prod1,prod2,psu,race,race_other,sex,stratum,trmt_date,weight,year
0,90101432,0,5,89,64,,1,0,1,,...,1807,,61,Other / Mixed Race,HISPANIC,Male,V,01-01-09,15.3491,2009
1,90101434,1,51,77,53,,1,0,1,,...,899,,61,White,,Male,V,01-01-09,15.3491,2009
2,90101435,2,2,76,59,,1,0,1,,...,4057,,61,White,,Female,V,01-01-09,15.3491,2009
3,90101436,3,20,93,53,,1,0,1,,...,1884,,61,White,,Male,V,01-01-09,15.3491,2009
4,90101437,4,20,34,57,,1,0,9,,...,3283,,61,White,,Male,V,01-01-09,15.3491,2009


In [79]:
# Sanity check: number of rows contributed by each year.
final['year'].value_counts()

2010    405710
2011    396502
2012    394383
2009    391944
2013    376926
2014    367492
2015    359129
Name: year, dtype: int64

# Beginning of Analysis

Just answering some of the questions the CPSC had about their data. I start with the hackpad. Here I open the cleaned API data using pickle, since I saved it in pickle format (saving it as a CSV ran into an encoding error and I didn't want to corrupt the data further).

In [82]:
# neiss = pd.read_csv('/NEISS-data-2015-updated-APRIL2016.csv')
# The builtin open() does not expand '~', so expand it explicitly; the
# with-block also closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.expanduser('~/cpsc/data/processed/cleaned_api_data'), 'rb') as api_file:
    data = pickle.load(api_file)
neiss = pd.read_csv('~/cpsc/data/processed/neiss-combined.csv', compression='gzip')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
class parser(object):
    """Break free-text item names into a cleaned, de-duplicated list of terms.

    Pipeline (see ``run``):
      1. ``step_one``  - cut at the first '(' or split on commas.
      2. ``step_two``  - split comma pieces on ' or ', ' and ', ' & '.
      3. ``flatten``   - collect every resulting string, lower-cased/stripped.
      4. ``clean_up_list`` - drop unwanted or empty terms and duplicates.
    """

    def __init__(self, items_list):
        """Store the raw strings to parse.

        ``parsed`` and ``cleaned`` start empty so that ``flatten`` and
        ``post_parse`` are safe to call before ``run_parser``.
        """
        self.items_list = items_list
        self.product_list = []
        self.parsed = []
        self.cleaned = []

    @staticmethod
    def paren_split(item):
        """Return the part of ``item`` before its first '(' (all of it if none)."""
        return item.split('(')[0]

    def step_one(self, item):
        """First pass over one raw string.

        Single-word items are returned unchanged. Multi-word items containing
        ')' are cut at the first '(' (a string is returned); otherwise items
        containing ',' are split on commas (a list is returned); anything else
        is returned unchanged.
        """
        if len(item.split(' ')) <= 1:
            return item
        if ')' in item:
            return self.paren_split(item)
        if ',' in item:
            return item.split(',')
        return item

    @staticmethod
    def step_two(item):
        """Second pass: split each piece of a list on a conjunction.

        For a list, each element is split on the first of ' or ', ' and ',
        ' & ' that it contains (checked in that order) and left as a string if
        it contains none. Non-list input is returned unchanged.
        """
        if not isinstance(item, list):
            return item
        step_two_results = []
        for each in item:
            if ' or ' in each:
                results = each.split(' or ')
            elif ' and ' in each:
                results = each.split(' and ')
            elif ' & ' in each:
                results = each.split(' & ')
            else:
                results = each
            step_two_results.append(results)
        return step_two_results

    def flatten(self):
        """Return every string in ``self.parsed`` lower-cased and stripped.

        ``self.parsed`` mixes plain strings, lists of strings and lists that
        themselves contain lists. All nesting levels are walked so that no
        term is lost; the previous implementation kept only strings nested
        exactly two lists deep and silently discarded the rest.
        Original left-to-right order is preserved.
        """
        flat = []
        pending = [self.parsed]
        while pending:
            node = pending.pop()
            if isinstance(node, list):
                # Reversed so the first child is the next one popped.
                pending.extend(reversed(node))
            else:
                flat.append(node.lower().strip())
        return flat

    @staticmethod
    def remove_boolean(item):
        """Return True if ``item`` contains any removal marker as a substring."""
        remove_criterias = ['other', 'not specified', ',', '.']
        return any(criterion in item for criterion in remove_criterias)

    @staticmethod
    def deduplicate_list(raw_list):
        """Return ``raw_list`` without repeats, keeping first-seen order."""
        deduped = []
        for i in raw_list:
            if i not in deduped:
                deduped.append(i)
        return deduped

    def clean_up_list(self, item_list):
        """Filter, trim and de-duplicate ``item_list``; store it in ``self.cleaned``.

        Empty strings and items flagged by ``remove_boolean`` are dropped, the
        survivors are cut at their first '(', and duplicates are removed.
        """
        removed_list = [self.paren_split(item) for item in item_list if not self.remove_boolean(item) and item != '']
        removed_list = self.deduplicate_list(removed_list)
        self.cleaned = removed_list
        return self.cleaned

    def run_parser(self):
        """Apply ``step_one`` then ``step_two`` to every item; store in ``self.parsed``."""
        first_pass = [self.step_one(item) for item in self.items_list]
        self.parsed = [self.step_two(item) for item in first_pass]
        return self.parsed

    def post_parse(self):
        """Flatten ``self.parsed`` and clean the result."""
        flattened = self.flatten()
        return self.clean_up_list(flattened)

    def run(self):
        """Run the whole pipeline and return the cleaned list of terms."""
        self.run_parser()
        self.post_parse()
        return self.cleaned

In [None]:
# Run the full parse/clean pipeline on the product names and show a slice.
test = parser(products)
cleaned_products = test.run()
cleaned_products[1:20]

### Natural Language Processing Approach

In [None]:
import nltk
# Tokenize one NEISS narrative, echo it, then show its part-of-speech tags.
text = neiss['narr1'][2]
tokened = nltk.word_tokenize(text)
print(text)
nltk.pos_tag(tokened)

In [None]:
# Same tokenize-and-tag check, this time on an API incident description.
text = data['IncidentDescription'][2]
tokened = nltk.word_tokenize(text)
print(text)
nltk.pos_tag(tokened)