In [1]:
# Dependencies
import pandas as pd
import requests
import json
from db_key import mysql_user, mysql_password
from pprint import pprint


In [2]:
time_list = ['20190101+TO+20190531', '20120101+TO+20121231',
             '20180101+TO+20180531', '20180601+TO+20181231',
             '20170101+TO+20170531', '20170601+TO+20171231',
             '20160101+TO+20160531', '20160601+TO+20161231',
             '20150101+TO+20150531', '20150601+TO+20151231',
             '20140101+TO+20140531', '20140601+TO+20141231',
             '20130101+TO+20130531', '20130601+TO+20131231',
            ]
frames = []

for time_item in time_list:
    print ("Requesting data for time period: " + time_item)
    url = "https://api.fda.gov/food/enforcement.json?search=report_date:[" + time_item +"]&limit=99"    
    respond = requests.get(url).json()
    df_temp = pd.DataFrame(respond["results"])
    df_clean = df_temp[['state', 'city', 'report_date', 'product_description', 'product_quantity', 'reason_for_recall', 'recalling_firm', 'recall_number']]
    frames.append(df_clean)
    
result = pd.concat(frames)

Requesting data for time period: 20190101+TO+20190531
Requesting data for time period: 20120101+TO+20121231
Requesting data for time period: 20180101+TO+20180531
Requesting data for time period: 20180601+TO+20181231
Requesting data for time period: 20170101+TO+20170531
Requesting data for time period: 20170601+TO+20171231
Requesting data for time period: 20160101+TO+20160531
Requesting data for time period: 20160601+TO+20161231
Requesting data for time period: 20150101+TO+20150531
Requesting data for time period: 20150601+TO+20151231
Requesting data for time period: 20140101+TO+20140531
Requesting data for time period: 20140601+TO+20141231
Requesting data for time period: 20130101+TO+20130531
Requesting data for time period: 20130601+TO+20131231


In [3]:
result.sort_values(by=['report_date'])
result = result.rename(columns={"recalling_firm":"brand"})
result['brand'] = result['brand'].str.lower()
result.head()

Unnamed: 0,state,city,report_date,product_description,product_quantity,reason_for_recall,brand,recall_number
0,TX,Austin,20190306,Goat Cheese Salad with Mandarin Orange and Can...,16 units,Product potentially contaminated with Salmonella.,whole foods market,F-1120-2019
1,TX,Austin,20190306,Focaccia Vegetable Pesto Sandwich packaged in ...,29 units,Product potentially contaminated with Salmonella.,whole foods market,F-1119-2019
2,IL,Chicago,20190220,"RXBAR: Coffee Chocolate, 1.83oz bars, single b...","75,695,266 Individually wrapped bars",Chicago Bar Company LLC d/b/a RXBAR is volunta...,chicago bar company llc rxbar,F-0884-2019
3,IA,West des Moines,20190213,HyVee brand Chocolate-flavored Bettercreme che...,"2,608 32 oz and other similar sizes; 4,367 8 o...",Product contained an ingredient that tested po...,hy-vee stores inc,F-0831-2019
4,CA,Marina Del Rey,20190220,Thrive Market Non-GMO Creamy Almond Butter 16 ...,"152,216 all varieties",Potential Listeria monocytogenes contamination.,thrive market,F-0906-2019


In [4]:
brands = list(result.brand.unique())
brands_df = pd.DataFrame(brands, columns=['brand'])
brands_df.head()

Unnamed: 0,brand
0,whole foods market
1,chicago bar company llc rxbar
2,hy-vee stores inc
3,thrive market
4,oskri corp.


In [5]:
# loading large csv file (963 MB)
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('en.openfoodfacts.org.products.tsv', sep='\t', header=0)

In [6]:
df = df[['product_name', 'brands','ingredients_text', 'countries']]
df = df.loc[df['countries'] == 'US', :]
df = df[['product_name', 'brands','ingredients_text']]
df['brands'] = df['brands'].str.lower()

df = df.dropna()
df = df.reset_index(drop=True)
brands = list(df.brands.unique())
brands_df_new = pd.DataFrame(brands, columns=['brand'])


In [7]:
df = df.rename(columns={"brands":"brand"})
total_brand_list = pd.concat([brands_df, brands_df_new])

In [8]:
total_brand_list.nunique()
brands = list(total_brand_list.brand.unique())

In [9]:
id_brand_table = pd.DataFrame(brands, columns=['brand'])
id_brand_table.head()

Unnamed: 0,brand
0,whole foods market
1,chicago bar company llc rxbar
2,hy-vee stores inc
3,thrive market
4,oskri corp.


In [10]:
id_brand_table = id_brand_table.reset_index()
id_brand_table = id_brand_table.rename(columns={"index":"brand_id"})
id_brand_table.head()

Unnamed: 0,brand_id,brand
0,0,whole foods market
1,1,chicago bar company llc rxbar
2,2,hy-vee stores inc
3,3,thrive market
4,4,oskri corp.


In [11]:
new_df = pd.merge(id_brand_table, result, on='brand', how='outer')
new_df = new_df[['brand_id', 'state', 'city', 'report_date', 'product_description', 'product_quantity', 'reason_for_recall', 'recall_number']]
new_df = new_df.dropna()
new_df.head()


Unnamed: 0,brand_id,state,city,report_date,product_description,product_quantity,reason_for_recall,recall_number
0,0,TX,Austin,20190306,Goat Cheese Salad with Mandarin Orange and Can...,16 units,Product potentially contaminated with Salmonella.,F-1120-2019
1,0,TX,Austin,20190306,Focaccia Vegetable Pesto Sandwich packaged in ...,29 units,Product potentially contaminated with Salmonella.,F-1119-2019
2,0,TX,Austin,20190306,Serbian Ajvar Vegetable Club packaged in plast...,3 units,Product potentially contaminated with Salmonella.,F-1125-2019
3,0,TX,Austin,20190306,Spinach Artichoke Bleus Pizza packaged in plas...,2 units,Product potentially contaminated with Salmonella.,F-1126-2019
4,0,TX,Austin,20190306,Hot Bar Item; Mushroom Truffle Pasta Salad,1 case and 202.81 units,Product potentially contaminated with Salmonella.,F-1132-2019


In [12]:
city = pd.DataFrame(list(new_df.city.unique()),columns=['city'])
city = city.reset_index()
city = city.rename(columns={"index":"city_id"})
new_df_temp = pd.merge(new_df, city, on='city', how='outer')
city.drop(['city_id'], axis=1, inplace=True)

state = pd.DataFrame(list(new_df.state.unique()), columns=['state'])
state = state.reset_index()
state = state.rename(columns={"index":"state_id"})
new_df_temp = pd.merge(new_df_temp, state, on='state', how='outer')
state.drop(['state_id'], axis=1, inplace=True)

reason_for_recall = pd.DataFrame(list(new_df.reason_for_recall.unique()), columns=['reason_for_recall'])
reason_for_recall = reason_for_recall.reset_index()
reason_for_recall = reason_for_recall.rename(columns={"index":"reason_for_recall_id"})
new_df_temp = pd.merge(new_df_temp, reason_for_recall, on='reason_for_recall', how='outer')
reason_for_recall.drop(['reason_for_recall_id'], axis=1, inplace=True)

new_df_temp.drop(['state', 'city', 'reason_for_recall'], axis=1, inplace=True)
new_df = new_df_temp[['recall_number','product_quantity','product_description','report_date','brand_id','reason_for_recall_id','city_id','state_id']]
new_df.head()

Unnamed: 0,recall_number,product_quantity,product_description,report_date,brand_id,reason_for_recall_id,city_id,state_id
0,F-1120-2019,16 units,Goat Cheese Salad with Mandarin Orange and Can...,20190306,0,0,0,0
1,F-1119-2019,29 units,Focaccia Vegetable Pesto Sandwich packaged in ...,20190306,0,0,0,0
2,F-1125-2019,3 units,Serbian Ajvar Vegetable Club packaged in plast...,20190306,0,0,0,0
3,F-1126-2019,2 units,Spinach Artichoke Bleus Pizza packaged in plas...,20190306,0,0,0,0
4,F-1132-2019,1 case and 202.81 units,Hot Bar Item; Mushroom Truffle Pasta Salad,20190306,0,0,0,0


In [13]:
new_df_1 = pd.merge(id_brand_table, df, on='brand', how='outer')
new_df_1 = new_df_1[['brand_id', 'product_name', 'ingredients_text']]
new_df_1 = new_df_1.dropna()
id_brand_table = id_brand_table[['brand']]
id_brand_table.head()

Unnamed: 0,brand
0,whole foods market
1,chicago bar company llc rxbar
2,hy-vee stores inc
3,thrive market
4,oskri corp.


In [14]:
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user=mysql_user,
  passwd=mysql_password)

mycursor = mydb.cursor()

mycursor.execute("CREATE DATABASE IF NOT EXISTS fda_db")

In [15]:
from pandas.io import sql
from sqlalchemy import create_engine
engine = create_engine('mysql+pymysql://' + mysql_user + ':' + mysql_password + '@localhost/fda_db')
with engine.connect() as con, con.begin():
    id_brand_table.to_sql(con=con, name='id_brand', if_exists='replace')
    new_df.to_sql(con=con, name='fda_recalls', if_exists='replace')
    new_df_1.to_sql(con=con, name='open_food_facts', if_exists='replace')
    city.to_sql(con=con, name='city', if_exists='replace')
    state.to_sql(con=con, name='state', if_exists='replace')
    reason_for_recall.to_sql(con=con, name='reason_for_recall', if_exists='replace')