## Loading Libraries

In [1]:
import pandas as pd
import re
import requests
import time
from bs4 import BeautifulSoup as bs

## Getting Nutrition Data

In [2]:
df = pd.read_csv('data/latest_all.csv')
df.head()

Unnamed: 0,Food code,Main food description,WWEIA Category code,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total (g)","Fiber, total dietary (g)",Total Fat (g),...,20:1 (g),22:1 (g),18:2 (g),18:3 (g),18:4 (g),20:4 (g),20:5 n-3 (g),22:5 n-3 (g),22:6 n-3 (g),Water (g)
0,11111000,"Milk, whole",1002,"Milk, whole",61,3.15,4.8,5.05,0.0,3.25,...,0.0,0.0,0.12,0.075,0.0,0.0,0.0,0.0,0.0,88.13
1,11320000,Soy milk,1404,Milk substitutes,43,2.6,4.92,3.65,0.2,1.47,...,0.01,0.0,0.584,0.075,0.0,0.0,0.0,0.0,0.0,90.36
2,11350000,"Almond milk, sweetened",1404,Milk substitutes,38,0.42,6.59,6.25,0.4,1.04,...,0.0,0.0,0.208,0.0,0.0,0.0,0.0,0.0,0.0,91.8
3,11360000,Rice milk,1404,Milk substitutes,47,0.28,9.17,5.28,0.3,0.97,...,0.0,0.0,0.305,0.008,0.0,0.0,0.0,0.0,0.0,89.28
4,11370000,Coconut milk,1404,Milk substitutes,31,0.21,2.92,2.5,0.0,2.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.57


## Scraping the price data from Amazon Fresh

#### Creating Url for each item

In [3]:
## Functiom to replace multiple strings from a string
def replaceMultiple(mainString, toBeReplaces, newString=''):
    # Iterate over the strings to be replaced
    for elem in toBeReplaces :
        # Check if string is in the main string
        if elem in mainString :
            # Replace the string
            mainString = mainString.replace(elem, newString)
    return  mainString

# Replace unnecessariy strings in food item string to be searched on Amazon
replaceString = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '(', ')', '%', ',']

# Amazon fresh URL
url = 'https://www.amazon.com/s/ref=nb_sb_noss_2?url=search-alias%3Damazonfresh&field-keywords=' 

# Item urls dictionary which will contain the food items with their amazon fresh url
item_urls = {}

# For each item in nutrion data we created
for k, v in df['Main food description'].iteritems():
    item = replaceMultiple(v, replaceString)
    item_url = url + item
    if 'NFS' in item or 'NS' in item:
        continue
    item_urls[item] = item_url

### Helper functions to get price per 100 gram of food item from soup object

In [9]:
# To convert a String to float
def convertStringToDollars(price):
    
    # Don't want the counts of items
    if re.search('count', price, re.IGNORECASE):
        return 0
    if re.search('pound', price, re.IGNORECASE):
        p = float(re.findall('\d+\.?\d*', price)[0])
        return p/16
    else:
        p = float(re.findall('\d+\.?\d*', price)[0])
        return p
            

# to get price in grams
def getPricePerGrams(prices):
    p = 0
    
    # If Nothing Found in price return 0
    if len(prices) == 0:
        print("No price data found")
        return -1
    
    if len(prices) < 5:
        print("Not enough price data found")
        return -1
    
    counter = 0
    # Get average price for the first few items found on Amazon Fresh
    for price in prices: 
        # In some cases first string is junk
        if re.search('Department', price, re.IGNORECASE) or counter > 2:
            continue
        else:
            # We try to get the dollar amount from string
            try: 
                p += convertStringToDollars(price)
                if p != 0:
                    counter += 1
            
            # In event an exception occours we just return 
            except:
                
                if counter != 0:
                    return p*3.5274/counter
                else:
                    return 0
    
    if counter == 0:
        return 0
    
    return p*3.5274/counter

## to get Price from soup object
def getPrice(soup):
    text = soup.prettify()
    
#     if re.search('automated access to Amazon', text, re.IGNORECASE):
#         print('Bot detected')
#         return -10

    # In Some cases the item is not found. We return -1 in those scenarios
    if re.search('did not match any products', text, re.IGNORECASE):
        print('Did Not Match Any Products')
        return -1
    
    # Finding prices
    pattern = '<span class="a-size-base a-color-base">(.*?)</span>'
    prices = re.findall(pattern, text)

    return getPricePerGrams(prices=prices)
    

# Scraping script

In [None]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Dictionary to store final prices
final_prices = {}

# For each item find the final price
# Price =  0, -1 indicates price not found
for item, item_url in item_urls.items():
    
    # requesting URL
    page = requests.get(item_url, headers = headers)
    soup = bs(page.content, 'html.parser')
    
    # Getting price from soup
    cost = getPrice(soup)
    final_prices[item] = cost
    
    print('The Item is: {}, Cost is {}'.format(item, cost))
    
    # Sleeping for 20 seconds
    time.sleep(20)

## Merging Results

In [7]:
sol = {'Main food description': [], 'Price': []}
for k, v in df['Main food description'].iteritems():
    item = replaceMultiple(v, replaceString)
    if item in final_prices.keys():
        sol['Main food description'].append(v)
        sol['Price'].append(final_prices[item])

In [8]:
final = df.merge(pd.DataFrame(sol), how='inner', on='Main food description')

## Removing the Data Not found

In [None]:
df = final[final['Price'] != -1]
df = final[final['Price'] != 0]

In [None]:
df = df.drop(columns=['Food code', 'Main food description', 'WWEIA Category code',
       'WWEIA Category description'])

## Building Preliminary Model

In [14]:
import numpy as np
from sklearn.linear_model import LinearRegression

X = df.drop('Price', axis=1)
lm = LinearRegression()
lm.fit(X, df.Price)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
import statsmodels.api as sm

X =  df.drop('Price', axis=1)
y = df["Price"]

# Note the difference in argument order
model = sm.OLS(y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.513
Method:,Least Squares,F-statistic:,16.27
Date:,"Sun, 18 Nov 2018",Prob (F-statistic):,1.09e-110
Time:,18:44:44,Log-Likelihood:,-1803.8
No. Observations:,944,AIC:,3738.0
Df Residuals:,879,BIC:,4053.0
Df Model:,65,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Energy (kcal),-0.0158,0.004,-4.293,0.000,-0.023,-0.009
Protein (g),0.1085,0.029,3.787,0.000,0.052,0.165
Carbohydrate (g),0.0724,0.015,4.976,0.000,0.044,0.101
"Sugars, total (g)",0.0041,0.006,0.688,0.491,-0.008,0.016
"Fiber, total dietary (g)",-0.0640,0.026,-2.471,0.014,-0.115,-0.013
Total Fat (g),0.3261,0.088,3.720,0.000,0.154,0.498
"Fatty acids, total saturated (g)",-0.3197,0.291,-1.097,0.273,-0.892,0.252
"Fatty acids, total monounsaturated (g)",-0.3044,0.329,-0.926,0.354,-0.949,0.340
"Fatty acids, total polyunsaturated (g)",0.0858,1.811,0.047,0.962,-3.468,3.640

0,1,2,3
Omnibus:,959.739,Durbin-Watson:,1.753
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49669.415
Skew:,4.787,Prob(JB):,0.0
Kurtosis:,37.222,Cond. No.,891000.0


## Final Data

In [16]:
df.to_csv('model_data.csv', index=False)