In [46]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_soup_from_url(u, timeout=10):
    """Download the page at ``u`` and parse it with BeautifulSoup's 'html.parser'.

    Parameters
    ----------
    u : str
        URL to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 10). Without it a stalled server would block forever.

    Returns
    -------
    BeautifulSoup
        Document parsed from the raw response bytes.
    """
    # The HTTP status is not checked: the body is parsed whatever the status
    # code, so callers can look for error markers in the page itself.
    response = requests.get(u, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

In [47]:
# Load the raw product listing; the file uses ';' as field delimiter.
products = pd.read_csv(
    'products.csv',
    delimiter=';',
)

In [48]:
# Display the frame that was just loaded from 'products.csv'.
products

Unnamed: 0,username,ip_address,product,price,infos
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,May contain sugar
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,Contains peanut and fish
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,Ingredients: mustard and fish
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,Contains gluten
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,"May contain sugar, egg and fish"
...,...,...,...,...,...
195,ccannop5f,174.5.73.129,Nantucket Orange Juice,713.6,"Contains gluten, mustard, sugar, milk and fish"
196,lprovis5g,191.69.45.257,"Jam - Strawberry, 20 Ml Jar",107.78,Contains gluten and sugar
197,iollarenshaw5h,206.30.25.226,"Juice - Apple, 341 Ml",447.17 RUB,May contain gluten and soja
198,mjablonski5i,44.133.211.182,"Mushroom - Chantrelle, Fresh",632.29,Ingredients: soja and sugar


## Mettre la colonne 'infos' sous forme de booléens en créant une colonne par ingrédient

Commençons par nettoyer la colonne 'infos' en la mettant en minuscules et en retirant les signes de ponctuation et les mots non pertinents.

In [112]:
# Reduce the free-text 'infos' column to a space-separated list of ingredients.
# Filler words are removed only as whole words (\b...\b): a plain substring
# replacement would also eat letters inside ingredient names
# (e.g. 'candy' -> 'cy', 'mayonnaise' -> 'onnaise').
products['infos'] = (
    products['infos']
    .str.lower()
    # Punctuation becomes a space so that 'sugar,egg' still splits into two tokens.
    .str.replace(r'[,:]', ' ', regex=True)
    .str.replace(r'\b(?:and|may|contains?|ingredients)\b', ' ', regex=True)
    # Collapse the gaps left by the removals and trim both ends.
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [113]:
# Preview the first rows to check the cleaned 'infos' values.
products.head()

Unnamed: 0,username,ip_address,product,price,infos
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,sugar
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,peanut fish
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,mustard fish
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,gluten
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,sugar egg fish


Utilisation de 'get_dummies'

In [114]:
# Build one 0/1 indicator column per ingredient token found in 'infos'
# (tokens are separated by a single space) and append them to the frame.
ingredient_flags = products['infos'].str.get_dummies(sep=' ')
products = products.join(ingredient_flags)

In [115]:
# Preview the first rows with the new per-ingredient indicator columns.
products.head()

Unnamed: 0,username,ip_address,product,price,infos,egg,fish,gluten,milk,mustard,peanut,soja,sugar
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,sugar,0,0,0,0,0,0,0,1
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,peanut fish,0,1,0,0,0,1,0,0
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,mustard fish,0,1,0,0,1,0,0,0
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,gluten,0,0,1,0,0,0,0,0
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,sugar egg fish,1,1,0,0,0,0,0,1


## Créer une colonne de prix unifiés

Pour obtenir la monnaie du pays à partir de l'adresse IP, on utilise le site 'ipapi.co', dont on analyse la page HTML (et non une réponse .json), à des fins d'entraînement.

In [49]:
def get_currency_fm_IP(ip):
    """Return the currency shown on the ipapi.co page for ``ip``.

    The page ``https://ipapi.co/{ip}/`` is fetched with ``get_soup_from_url``
    and scanned for an element of class ``key`` whose text is ``Currency``;
    the text of the element that follows it is the result.

    Parameters
    ----------
    ip : str
        IP address inserted verbatim into the ipapi.co URL.

    Returns
    -------
    str or None
        The currency text with surrounding whitespace removed, or ``None``
        when the page carries an ``ip-error`` / ``ip-reserved`` marker, has no
        ``Currency`` entry, or the entry has no (non-empty) value next to it.
    """
    # ipapi.co returns, among other things, the country and the currency in
    # use for a given IP address.
    url = f'https://ipapi.co/{ip}/'
    soup = get_soup_from_url(url)

    # Invalid or reserved addresses are rendered as a dedicated error block.
    if soup.find(class_="text-center ip-error") or soup.find(class_="text-center ip-reserved"):
        return None

    for label in soup.find_all(class_="key"):
        if label.text.strip() != 'Currency':
            continue
        value = label.find_next_sibling()
        # A label without a following element would otherwise raise AttributeError.
        if value is None:
            return None
        return value.text.strip() or None

    return None

Pour obtenir le taux de change, on utilise en revanche une API en .json.

In [131]:
# Download the latest USD-based exchange rates once; `request` holds the
# decoded JSON payload whose 'rates' mapping is read by `convert`.
url = 'https://api.exchangerate-api.com/v4/latest/USD'
_rates_response = requests.get(url, timeout=10)  # do not hang forever on a stalled server
# Fail here on an HTTP error instead of later with a KeyError on 'rates'.
_rates_response.raise_for_status()
request = _rates_response.json()

def convert(currency):
    """Return the exchange rate of ``currency`` read from ``request['rates']``.

    Parameters
    ----------
    currency : str or None
        Currency code to look up in the ``rates`` mapping of the module-level
        ``request`` payload.

    Returns
    -------
    int or float
        The rate found for ``currency``. Falls back to ``1`` (i.e. the price
        is assumed to be already in $) when the currency is ``None``, is
        absent from the payload, or its rate is not a positive number.
    """
    # Unknown currency: the price is assumed to be given in $.
    if currency is None:
        return 1

    rate = request['rates'].get(currency)

    # JSON decodes whole-number rates as int, so both int and float are valid;
    # bool is excluded because it is a subclass of int.
    is_number = isinstance(rate, (int, float)) and not isinstance(rate, bool)
    # Currency missing from the payload (or unusable rate): assume $ as well.
    # The positivity check also avoids a division by zero in the caller.
    if not is_number or not rate > 0:
        return 1
    return rate

Certains 'price' sont exprimés avec des devises sans avoir l'assurance que ces devises soient les bonnes. On fait le choix de les ignorer. On ne garde que les chiffres.

In [130]:
# Keep only the numeric amount of each price and drop any currency text.
# A regex is used instead of a fixed 6-character slice, which would truncate
# longer amounts ('1234.56 RUB' -> '1234.5') and leave letters behind for
# shorter ones ('12.5 EUR' -> '12.5 E', not convertible to float).
# The result stays a string column; missing prices stay NaN.
products['price'] = (
    products['price']
    .astype(str)
    .str.extract(r'(-?\d+(?:\.\d+)?)', expand=False)
)

Création d'un dictionnaire pour récolter toutes les 'currency' du DataFrame à partir de leur @ IP

In [134]:
# Look up the currency of every row from its IP address (one lookup per row).
# Rows whose address is the placeholder 'nope' get None without any lookup.
currency_dict = {
    'currency': [
        None if ip == 'nope' else get_currency_fm_IP(ip)
        for ip in products.ip_address
    ]
}

In [135]:
# Attach the collected currencies as a new column (aligned on the row index).
products['currency'] = pd.DataFrame(currency_dict)['currency']

Conversion des prix en fonction de leur 'currency'

In [136]:
# Convert each price to $: divide the amount by the rate of its currency
# (rate 1 when the currency is unknown) and round to 2 decimals.
amounts = products['price'].astype(float)
rates = products['currency'].apply(convert).astype(float)
products['price in $'] = (amounts / rates).round(2)

In [139]:
# Drop the raw columns now replaced by 'price in $' and the ingredient flags.
products = products.drop(columns=['price', 'infos'])

In [140]:
# Preview the first 20 rows of the final frame (currency and converted price).
products.head(20)

Unnamed: 0,username,ip_address,product,egg,fish,gluten,milk,mustard,peanut,soja,sugar,currency,price in $
0,ldrover0,666.666.666.666,Clam - Cherrystone,0,0,0,0,0,0,0,1,,712.8
1,kizakov1,nope,Soup - Campbells Bean Medley,0,1,0,0,0,1,0,0,,379.26
2,abromet2,240.177.79.234,Island Oasis - Lemonade,0,1,0,0,1,0,0,0,,305.96
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",0,0,1,0,0,0,0,0,USD,350.15
4,mbuckney4,58.90.204.239,Radish - Pickled,1,1,0,0,0,0,0,1,JPY,8.75
5,bsnozzwell5,226.52.32.70,Oil - Sesame,0,0,0,1,0,0,0,1,,354.33
6,afairholme6,127.197.204.119,Chicken - Tenderloin,0,0,0,0,0,0,0,1,,484.83
7,avowdon7,189.169.17.54,Dc Hikiage Hira Huba,0,0,0,0,0,0,0,1,MXN,5.82
8,epridham8,187.129.113.105,Dried Figs,0,1,0,1,0,0,0,1,MXN,4.59
9,tkendrew9,22.32.234.215,Pop - Club Soda Can,0,1,0,1,0,1,0,1,USD,861.25


In [142]:
# Export the cleaned frame. The row index is written too (pandas default,
# made explicit here), so it will appear as an extra unnamed first column.
products.to_csv(
    'products_cleaned.csv',
    index=True,
)