# Web Data
In this notebook we will get our hands dirty with web-data feature handling: extracting useful features from IP addresses, user agents, etc.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import plotly.express as px

%matplotlib inline
sns.set(rc={'figure.figsize':(10, 10)}, font_scale=1.2)

In [2]:
# Load the e-commerce purchases CSV into a DataFrame.
# NOTE(review): the directory is spelled 'dastasets' — confirm it matches the folder name on disk.
df = pd.read_csv('../dastasets/Ecommerce Purchases.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011929061123406,02/20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337758169645356,11/18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675957666125,08/19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011578504430710,02/24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011456623207998,10/25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82


In [3]:
# Summarize column names, non-null counts and dtypes of the raw data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Address           10000 non-null  object 
 1   Lot               10000 non-null  object 
 2   AM or PM          10000 non-null  object 
 3   Browser Info      10000 non-null  object 
 4   Company           10000 non-null  object 
 5   Credit Card       10000 non-null  int64  
 6   CC Exp Date       10000 non-null  object 
 7   CC Security Code  10000 non-null  int64  
 8   CC Provider       10000 non-null  object 
 9   Email             10000 non-null  object 
 10  Job               10000 non-null  object 
 11  IP Address        10000 non-null  object 
 12  Language          10000 non-null  object 
 13  Purchase Price    10000 non-null  float64
dtypes: float64(1), int64(2), object(11)
memory usage: 1.1+ MB


### Get Browser, OS and Device Info from User Agent
You can use this library to extract information from user-agent strings:
https://github.com/selwin/python-user-agents

In [4]:
# !pip install --upgrade pyodbc

In [5]:
# !pip install user_agents

In [6]:
# user_agents parses raw User-Agent strings into browser / OS / device details
import user_agents

# Sample User-Agent string to parse; swap in one of the commented alternatives to try other clients
ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
# ua = 'Mozilla/5.0 (iPhone; CPU iPhone OS 5_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3'
# ua = 'BlackBerry9700/5.0.0.862 Profile/MIDP-2.1 Configuration/CLDC-1.1 VendorID/331 UNTRUSTED/1.0 3gpp-gba'
# ua = 'Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'
# ua = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us; Silk/1.1.0-80) AppleWebKit/533.16 (KHTML, like Gecko) Version/5.0 Safari/533.16 Silk-Accelerated=true'

# Parse the string; note that `ua` is rebound from the raw string to the parsed object
ua = user_agents.parse(ua)

# Print the attributes of the parsed object used in this notebook:
# boolean device-class flags first, then OS, browser and device details
print('Is a bot? ', ua.is_bot)
print('Is mobile? ', ua.is_mobile)
print('Is PC? ',ua.is_pc)
print('Is Tablet? ',ua.is_tablet)
print('Is Touch Capable? ',ua.is_touch_capable)
print('OS Family: ',ua.os.family)
print('OS Version: ',ua.os.version)
print('Browser Family: ',ua.browser.family)
print('Browser Version: ',ua.browser.version)
print('Device Family: ',ua.device.family)
print('Device Brand: ',ua.device.brand)
print('Device Model: ',ua.device.model)

Is a bot?  False
Is mobile?  False
Is PC?  True
Is Tablet?  False
Is Touch Capable?  False
OS Family:  Windows
OS Version:  (10,)
Browser Family:  Chrome
Browser Version:  (90, 0, 4430)
Device Family:  Other
Device Brand:  None
Device Model:  None


## OK, let's apply it to our data

In [7]:
# make 3 fns to extract browser.family, os.family, device.family
def extract_brwoser(ua):
    """Return the browser family (e.g. 'Chrome') parsed from a raw User-Agent string.

    The misspelled function name is kept as-is because the column-creation
    code in this cell calls it by that name.
    """
    return user_agents.parse(ua).browser.family

def extract_os(ua):
    """Return the operating-system family (e.g. 'Windows') parsed from a raw User-Agent string."""
    return user_agents.parse(ua).os.family

def extract_device(ua):
    """Return the device family (e.g. 'Mac' or 'Other') parsed from a raw User-Agent string."""
    return user_agents.parse(ua).device.family

# Create 3 new columns from 'Browser Info'.
# Each apply() calls a helper that parses the User-Agent string itself,
# so every row is parsed three times here.
df['OS'] = df['Browser Info'].apply(extract_os)
df['Browser'] = df['Browser Info'].apply(extract_brwoser)
df['Device'] = df['Browser Info'].apply(extract_device)

In [8]:
# Preview the frame with the new OS / Browser / Device columns.
df.head()

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price,OS,Browser,Device
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011929061123406,02/20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14,Linux,Opera,Other
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337758169645356,11/18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73,Windows,Opera,Other
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675957666125,08/19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95,Windows,IE,Other
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011578504430710,02/24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04,Mac OS X,Safari,Mac
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011456623207998,10/25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82,Linux,Opera,Other


Alternatively, you can use a single function that parses each user agent only once and returns all three values:

In [9]:
# apply 3 fns above in one
def extract_web_data(ua):
    """Parse a raw User-Agent string once and return its OS, browser and device families.

    Returns a pandas Series whose values are, in order, the OS family, the
    browser family and the device family.
    """
    parsed = user_agents.parse(ua)
    families = (parsed.os.family, parsed.browser.family, parsed.device.family)
    return pd.Series(families)

# apply() with a function that returns a Series yields one column per Series element;
# the three resulting columns are assigned to 'OS', 'Browser' and 'Device' in that order.
df[['OS', 'Browser', 'Device']] = df['Browser Info'].apply(extract_web_data)

# Preview the result.
df.head()

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price,OS,Browser,Device
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011929061123406,02/20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14,Linux,Opera,Other
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337758169645356,11/18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73,Windows,Opera,Other
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675957666125,08/19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95,Windows,IE,Other
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011578504430710,02/24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04,Mac OS X,Safari,Mac
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011456623207998,10/25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82,Linux,Opera,Other


In [10]:
# Plot one interactive Plotly histogram per extracted column
for col in ['OS', 'Browser', 'Device']:
    fig = px.histogram(df[col])
    # show() is called explicitly so every figure in the loop is displayed
    fig.show()

In [11]:
# sns.countplot(x='OS', data=df, palette='viridis')

In [12]:
# sns.countplot(x='Browser', data=df, palette='mako')

In [13]:
# sns.countplot(x='Device', data=df, palette='flare')

### Get Location from IP address
You can use this library to extract location information from an IP address:
https://github.com/tomas-net/ip2geotools

In [14]:
# Keep only the first 100 rows (presumably to keep the per-row IP lookups below fast).
# df.head() returns a slice of the original frame, so take an explicit copy:
# columns added to `df` later are then written to an independent DataFrame
# instead of triggering pandas' SettingWithCopyWarning.
df = df.head(100).copy()
df

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price,OS,Browser,Device
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011929061123406,02/20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14,Linux,Opera,Other
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337758169645356,11/18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73,Windows,Opera,Other
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675957666125,08/19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95,Windows,IE,Other
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011578504430710,02/24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04,Mac OS X,Safari,Mac
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011456623207998,10/25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82,Linux,Opera,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"4445 Christopher Well Apt. 053\nRonaldton, KS ...",33 Uu,PM,Opera/9.21.(Windows NT 6.1; sl-SI) Presto/2.9....,Moore-Hall,3088798824982514,04/18,928,JCB 16 digit,arellanopatrick@thomas.net,Dealer,59.216.3.55,fr,19.56,Windows,Opera,Other
96,"17923 Carol Lake Suite 232\nNew Andre, MD 0577...",20 bk,AM,Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_8...,Knight LLC,60498172646,09/25,600,Voyager,mannlisa@yahoo.com,Town planner,200.220.84.25,fr,77.51,Mac OS X,Safari,Mac
97,"91474 Craig Brooks Suite 633\nNew Patrick, CT ...",42 Nk,AM,Mozilla/5.0 (X11; Linux x86_64; rv:1.9.7.20) G...,"Parker, Nelson and Delgado",3528362130154694,02/25,366,Mastercard,crawfordwilliam@yahoo.com,Travel agency manager,126.124.175.181,ru,17.20,Linux,Firefox,Other
98,"192 Malone Stream Suite 756\nLake Gary, KS 373...",00 zU,AM,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,Simpson Inc,376660848256225,08/23,215,American Express,jeanetteburgess@hotmail.com,"Engineer, water",177.234.46.245,fr,38.75,Windows,IE,Other


In [15]:
# !pip install ip2geotools

In [16]:
# DbIpCity is imported under the alias `ip2geo`
from ip2geotools.databases.noncommercial import DbIpCity as ip2geo

# Look up a single sample IP address; 'free' is passed as the api_key
response = ip2geo.get('45.243.72.231', api_key='free')

# Print the fields of the response: IP address, city, region, country code, latitude, longitude
print(response.ip_address)
print(response.city)
print(response.region)
print(response.country)
print(response.latitude)
print(response.longitude)

45.243.72.231
Giza
Giza
EG
29.9870753
31.2118063


In [17]:
# create a fn that returns 'country' from the 'ip'
def extract_country_from_ip(ip):
    """Return the country reported by ip2geo for ``ip``, or NaN if the lookup fails.

    Parameters
    ----------
    ip : str
        IP address to look up.

    Returns
    -------
    str or float
        ``response.country`` from the lookup (a two-letter code such as 'EG'
        in the sample above), or ``np.nan`` when the lookup raises an error.
    """
    try:
        response = ip2geo.get(ip, api_key='free')
        return response.country
    except Exception:
        # Best-effort lookup: any ordinary failure becomes a missing value.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long per-row run can still be interrupted.
        return np.nan
    
# Look up the country for every row's IP address and store it in a new 'Country' column
# (rows whose lookup fails get NaN)
df['Country'] = df['IP Address'].apply(extract_country_from_ip)

# show head
df.head()

Unnamed: 0,Address,Lot,AM or PM,Browser Info,Company,Credit Card,CC Exp Date,CC Security Code,CC Provider,Email,Job,IP Address,Language,Purchase Price,OS,Browser,Device,Country
0,"16629 Pace Camp Apt. 448\nAlexisborough, NE 77...",46 in,PM,Opera/9.56.(X11; Linux x86_64; sl-SI) Presto/2...,Martinez-Herman,6011929061123406,02/20,900,JCB 16 digit,pdunlap@yahoo.com,"Scientist, product/process development",149.146.147.205,el,98.14,Linux,Opera,Other,NL
1,"9374 Jasmine Spurs Suite 508\nSouth John, TN 8...",28 rn,PM,Opera/8.93.(Windows 98; Win 9x 4.90; en-US) Pr...,"Fletcher, Richards and Whitaker",3337758169645356,11/18,561,Mastercard,anthony41@reed.com,Drilling engineer,15.160.41.51,fr,70.73,Windows,Opera,Other,IT
2,Unit 0065 Box 5052\nDPO AP 27450,94 vE,PM,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"Simpson, Williams and Pham",675957666125,08/19,699,JCB 16 digit,amymiller@morales-harrison.com,Customer service manager,132.207.160.22,de,0.95,Windows,IE,Other,CA
3,"7780 Julia Fords\nNew Stacy, WA 45798",36 vm,PM,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0 ...,"Williams, Marshall and Buchanan",6011578504430710,02/24,384,Discover,brent16@olson-robinson.info,Drilling engineer,30.250.74.19,es,78.04,Mac OS X,Safari,Mac,US
4,"23012 Munoz Drive Suite 337\nNew Cynthia, TX 5...",20 IE,AM,Opera/9.58.(X11; Linux x86_64; it-IT) Presto/2...,"Brown, Watson and Andrews",6011456623207998,10/25,678,Diners Club / Carte Blanche,christopherwright@gmail.com,Fine artist,24.140.33.94,es,77.82,Linux,Opera,Other,US


In [18]:
# Show how many rows fall under each value of df['Country'] (still country codes at this point)
df['Country'].value_counts()

Country
US    33
ZZ    12
CN     6
JP     5
BR     5
DE     5
GB     4
IT     3
MA     2
ZA     2
MX     2
CA     2
LT     1
KR     1
UA     1
TH     1
PT     1
SE     1
TW     1
IR     1
NL     1
BE     1
FR     1
MU     1
FI     1
CY     1
ES     1
HK     1
IQ     1
NO     1
RO     1
Name: count, dtype: int64

**OK, but there is an issue: the country is given as an alpha-2 country code, and we want the full country name, so we use another library: https://pypi.org/project/pycountry/**

In [19]:
# !pip install pycountry

In [20]:
# pycountry is used to look up country records by code
import pycountry

# Look up the country record for the alpha-2 code 'EG'
country = pycountry.countries.get(alpha_2='EG')
country

Country(alpha_2='EG', alpha_3='EGY', flag='🇪🇬', name='Egypt', numeric='818', official_name='Arab Republic of Egypt')

In [21]:
# The record's `name` attribute holds the country name
country.name

'Egypt'

In [22]:
# create a fn to return the name of the country from the code
def code_to_name(code):
    """Return the country name for an alpha-2 country code, or NaN if there is none.

    Parameters
    ----------
    code : str
        Two-letter country code such as 'EG'. Other values (for example a
        missing value) are tolerated and yield NaN.

    Returns
    -------
    str or float
        The ``name`` of the matching pycountry record, or ``np.nan`` when the
        lookup fails or finds no match.
    """
    try:
        country = pycountry.countries.get(alpha_2=code)
    except Exception:
        # Best-effort lookup: an ordinary lookup error becomes a missing value.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return np.nan
    # No matching record (presumably returned as None for unknown codes) -> missing value.
    return np.nan if country is None else country.name

# Replace the country codes in the 'Country' column with country names.
# NOTE(review): this overwrites the codes in place, so re-running this cell would feed
# country names back into code_to_name (presumably turning them into NaN) — confirm,
# and re-run the IP lookup cell first if this cell needs to be repeated.
df['Country'] = df['Country'].apply(code_to_name)

In [23]:
# Row counts per country name; value_counts() leaves out rows whose country is missing (NaN).
df['Country'].value_counts()

Country
United States                33
China                         6
Brazil                        5
Germany                       5
Japan                         5
United Kingdom                4
Italy                         3
South Africa                  2
Canada                        2
Morocco                       2
Mexico                        2
Taiwan, Province of China     1
Sweden                        1
Netherlands                   1
Ukraine                       1
Korea, Republic of            1
Lithuania                     1
Thailand                      1
Portugal                      1
Iran, Islamic Republic of     1
Cyprus                        1
Belgium                       1
France                        1
Mauritius                     1
Finland                       1
Spain                         1
Hong Kong                     1
Iraq                          1
Norway                        1
Romania                       1
Name: count, dtype: int64

In [24]:
# Summarize the 100-row frame, including the derived OS / Browser / Device / Country columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Address           100 non-null    object 
 1   Lot               100 non-null    object 
 2   AM or PM          100 non-null    object 
 3   Browser Info      100 non-null    object 
 4   Company           100 non-null    object 
 5   Credit Card       100 non-null    int64  
 6   CC Exp Date       100 non-null    object 
 7   CC Security Code  100 non-null    int64  
 8   CC Provider       100 non-null    object 
 9   Email             100 non-null    object 
 10  Job               100 non-null    object 
 11  IP Address        100 non-null    object 
 12  Language          100 non-null    object 
 13  Purchase Price    100 non-null    float64
 14  OS                100 non-null    object 
 15  Browser           100 non-null    object 
 16  Device            100 non-null    object 
 17

In [25]:
# Number of rows per country, as a two-column frame ('Country', 'Count').
# size() counts rows directly, so the result does not depend on an unrelated column
# ('Job') being free of missing values, and naming the column in reset_index()
# avoids a separate in-place rename.
df_countries = df.groupby('Country').size().reset_index(name='Count')
df_countries

Unnamed: 0,Country,Count
0,Belgium,1
1,Brazil,5
2,Canada,2
3,China,6
4,Cyprus,1
5,Finland,1
6,France,1
7,Germany,5
8,Hong Kong,1
9,"Iran, Islamic Republic of",1


In [26]:
# Horizontal bar chart: one bar per country, bar length = number of rows
px.bar(df_countries, x= 'Count', y= 'Country')

In [27]:
# sns.countplot(y='Country', data=df)

# Great Work!