## Kit Data Science

## Homework Lesson 5 - Pandas Data Cleaning

## 1. Data

In [35]:
import numpy as np
import pandas as pd
import requests
import time

In [6]:
pathfile = "https://raw.githubusercontent.com/fspot/INFMDI-721/master/lesson5/products.csv"
df = pd.read_csv(pathfile, sep=';')

In [40]:
print(df.shape)

df.head()

(200, 5)


Unnamed: 0,username,ip_address,product,price,infos
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,May contain sugar
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,Contains peanut and fish
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,Ingredients: mustard and fish
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,Contains gluten
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,"May contain sugar, egg and fish"


## 2. Cleaning Currency Column

    On aimerait avoir une colonne de prix unifiés en euros. Problème: la currency n'est
    pas indiquée pour tous les produits: il va falloir essayer de "deviner" les currency
    manquantes, en se basant sur l'adresse IP de l'utilisateur.

We will be using the [IP Geolocation API](https://ipgeolocationapi.com/) for getting the currency based off the user's IP Address. This API was chosen because no authentification token was required.

In [24]:
IP_GEOLOC_API = "https://api.ipgeolocationapi.com/geolocate/{}"

In [29]:
# EDA
df['ip_address'].head()

0    666.666.666.666
1               nope
2     240.177.79.234
3      26.191.237.49
4      58.90.204.239
Name: ip_address, dtype: object

In [26]:
# Test
testIP = "26.191.237.49"
IP_GEOLOC_GET = IP_GEOLOC_API.format(testIP)
request = requests.get(IP_GEOLOC_GET)
requestJSON = request.json()
requestJSON['currency_code']

'USD'

In [43]:
def getCurrencyFromIP(x):
    # Request from IP GEOLOCATION API
    GEOLOC_TEMP = IP_GEOLOC_API.format(x)
    request = requests.get(GEOLOC_TEMP)
    
    # Get Currency
    if request.status_code == 200:
        try:
            currency = request.json()['currency_code']
        except:
            currency = "unknown"
    else:
        currency = "unknown"
    
    time.sleep(1)
    return currency

print(getCurrencyFromIP(testIP))
print(getCurrencyFromIP("nope"))

USD
unknown


In [44]:
%%time
df['currency'] = df['ip_address'].apply(lambda x: getCurrencyFromIP(x))

Wall time: 3min 58s


In [50]:
df.currency.value_counts()

USD        71
unknown    43
EUR        18
CNY        14
JPY        11
BRL         6
KRW         3
INR         3
TWD         3
RUB         3
MXN         3
CAD         2
DKK         2
GBP         2
RON         1
BGN         1
AUD         1
TRY         1
IDR         1
TND         1
BYN         1
KZT         1
VND         1
ZAR         1
IRR         1
NGN         1
PEN         1
MYR         1
CHF         1
AOA         1
Name: currency, dtype: int64

## 3. Cleaning Infos Column

    La colonne "infos" liste des ingrédients présents dans le produit. On préfèrerait
    avoir une colonne de type bool par ingrédient, indiquant si le produit contient ou
    non cet ingrédient.


In [115]:
import string
import nltk
# from textblob import TextBlob
# nltk.download('averaged_perceptron_tagger')

In [145]:
df['infos'].head(10)

0                           May contain sugar
1                    Contains peanut and fish
2               Ingredients: mustard and fish
3                             Contains gluten
4             May contain sugar, egg and fish
5                 Ingredients: sugar and milk
6                           May contain sugar
7                              Contains sugar
8           Ingredients: sugar, milk and fish
9    May contain peanut, sugar, milk and fish
Name: infos, dtype: object

In [146]:
test = df['infos'].copy()[3]
print(test)
testStr = test.translate(str.maketrans('', '', string.punctuation))
print(testStr)
testTags = nltk.pos_tag(testStr.split(' '))
print(testTags)
testTags = [tag[0] for tag in testTags if tag[1]=='NN']
print(testTags)

Contains gluten
Contains gluten
[('Contains', 'NNS'), ('gluten', 'VBP')]
[]


In [149]:
def getTagsFromInfoDesc(desc):
    desc = desc.lower().translate(str.maketrans('', '', string.punctuation))
    tags = nltk.pos_tag(desc.split(' '))
    # tags = [tag for tag in tags if tag[1]=='NN']
    return tags

getTagsFromInfoDesc(test)

[('contains', 'NNS'), ('gluten', 'VBP')]

In [150]:
df['tags'] = df['infos'].apply(lambda x: getTagsFromInfoDesc(x))
results = df['tags'].copy().tolist()
results = [el[0] for sublist in results for el in sublist]
pd.Series(results).value_counts().sort_index()

and            169
contain         63
contains        60
egg             47
fish            84
gluten          55
ingredients     77
may             63
milk            47
mustard         40
peanut          45
soja            56
sugar          150
dtype: int64

In [151]:
df.head()

Unnamed: 0,username,ip_address,product,price,infos,currency,ingredients,tags
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,May contain sugar,unknown,[],"[(may, MD), (contain, VB), (sugar, NN)]"
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,Contains peanut and fish,unknown,[],"[(contains, NNS), (peanut, NN), (and, CC), (fi..."
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,Ingredients: mustard and fish,unknown,[],"[(ingredients, NNS), (mustard, NN), (and, CC),..."
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,Contains gluten,USD,[],"[(contains, NNS), (gluten, VBP)]"
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,"May contain sugar, egg and fish",JPY,[],"[(may, MD), (contain, VB), (sugar, NN), (egg, ..."


In [152]:
ingredients = ['egg', 'fish', 'gluten', 'milk', 'mustard', 'peanut', 'soja', 'sugar']

df['tags'] = df['tags'].apply(lambda tags: [tag[0] for tag in tags])
df['ingredients'] = df['tags'].apply(lambda tags: [tag for tag in tags if tag in ingredients])
df.drop('tags', axis=1, inplace=True)

In [153]:
df.head()

Unnamed: 0,username,ip_address,product,price,infos,currency,ingredients
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,May contain sugar,unknown,[sugar]
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,Contains peanut and fish,unknown,"[peanut, fish]"
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,Ingredients: mustard and fish,unknown,"[mustard, fish]"
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,Contains gluten,USD,[gluten]
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,"May contain sugar, egg and fish",JPY,"[sugar, egg, fish]"


Transform column of lists into one-hot encoded columns:

In [156]:
from sklearn.preprocessing import MultiLabelBinarizer

In [160]:
tagsCol = df['ingredients']
mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(tagsCol), columns=mlb.classes_,
                   index=tagsCol.index)
res.head()

Unnamed: 0,egg,fish,gluten,milk,mustard,peanut,soja,sugar
0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,1,0,0
2,0,1,0,0,1,0,0,0
3,0,0,1,0,0,0,0,0
4,1,1,0,0,0,0,0,1


In [161]:
df1 = pd.concat([df, res], axis=1)
df1.head()

Unnamed: 0,username,ip_address,product,price,infos,currency,ingredients,egg,fish,gluten,milk,mustard,peanut,soja,sugar
0,ldrover0,666.666.666.666,Clam - Cherrystone,712.8,May contain sugar,unknown,[sugar],0,0,0,0,0,0,0,1
1,kizakov1,nope,Soup - Campbells Bean Medley,379.26,Contains peanut and fish,unknown,"[peanut, fish]",0,1,0,0,0,1,0,0
2,abromet2,240.177.79.234,Island Oasis - Lemonade,305.96,Ingredients: mustard and fish,unknown,"[mustard, fish]",0,1,0,0,1,0,0,0
3,kkarolowski3,26.191.237.49,"Water - Mineral, Natural",350.15,Contains gluten,USD,[gluten],0,0,1,0,0,0,0,0
4,mbuckney4,58.90.204.239,Radish - Pickled,949.79,"May contain sugar, egg and fish",JPY,"[sugar, egg, fish]",1,1,0,0,0,0,0,1
