In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import ast

In [2]:
# Load the features previously extracted from the WARC files (path is relative
# to the notebook's working directory) and preview the first rows.
extracted_from_warcs = pd.read_csv('features_from_warcs.csv')
extracted_from_warcs.head()

Unnamed: 0,email_domains,https_flag,siren,trust_av_flag,trust_flag,trust_sag_flag,trust_ts_flag
0,"['gmail', 'hotmail']",False,512719188,False,False,False,False
1,"['easypets', 'petsafe', 'agecom-vet', 'cano-co...",True,530772565,False,True,False,True
2,[],True,789521945,False,False,False,False
3,"['domain', 'domaine', 'a4', 'recreatec']",True,388336000,False,False,False,False
4,"['email', 'prestashop', 'avis-verifies']",True,519216188,True,True,False,False


In [3]:
# (rows, columns) of the raw extraction.
extracted_from_warcs.shape

(3626, 7)

In [4]:
# Column-wise sums, used here as a quick count of True values per boolean flag.
# The `email_domains` entry is just the string cells concatenated, and the
# `siren` total carries no meaning (it is an identifier) - only the *_flag rows
# are informative.
extracted_from_warcs.sum()

email_domains     ['gmail', 'hotmail']['easypets', 'petsafe', 'a...
https_flag                                                     1383
siren                                                 2131349177254
trust_av_flag                                                   170
trust_flag                                                      246
trust_sag_flag                                                   21
trust_ts_flag                                                    56
dtype: object

# Clean email domains

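Approach: create a feature indicating whether or not a company has its own email domain:
* If no email domains were found: missing value (note: the cleaning cell below currently encodes this case as 0 rather than as a missing value)
* If only recurring email domains (domains that occur more than once across the whole dataset) were found: no own email domain (False)
* If at least one non-recurring (unique) email domain is found: own email domain (True)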

## Find recurring email domains

In [5]:
# Flatten every row's domain list into one inventory of all domain occurrences.
# Each `email_domains` cell holds the string repr of a Python list, so it is
# parsed with ast.literal_eval before being iterated.
email_domains_inventory = [
    domain
    for serialized_domains in extracted_from_warcs['email_domains']
    for domain in ast.literal_eval(serialized_domains)
]

In [6]:
# Count how often each domain occurs in the inventory and tabulate the result
# as a two-column frame: 'index' (the domain) and 'frequency' (its count).
counter = Counter(email_domains_inventory)
email_domains_freq = pd.DataFrame(
    list(counter.items()),
    columns=['index', 'frequency'],
)
# Show the most frequent domains first.
email_domains_freq.sort_values(by='frequency', ascending=False).head()

Unnamed: 0,index,frequency
11,prestashop,656
10,email,446
6,domain,214
0,gmail,211
7,domaine,199


In [7]:
# A domain is treated as "recurring" as soon as it occurs more than once in the
# inventory.
# NOTE(review): the count is per occurrence, so a domain repeated inside a
# single row's list would also qualify - confirm the lists are de-duplicated.
is_recurring = email_domains_freq['frequency'] > 1
recurring_email_domains = email_domains_freq.loc[is_recurring, 'index'].tolist()
print(recurring_email_domains)

['gmail', 'hotmail', 'domain', 'domaine', 'email', 'prestashop', 'avis-verifies', 'yahoo', 'free', 'bbox', 'laposte', 'e-mail', 'wanadoo', '4', 'ovh', '2x', 'opencart', 'orange', 'monuments-nationaux', 'ville-nice', 'outlook', 'neuf', 'me', 'skynet', 'live', 'club-internet', 'example', '2', 'aol', 'addresshere', '2x1', 'thelia', 'patworx', 'pascalevenot', 'aliceadsl', 'tawk', 'ddlx', 'sfr', 'demo', 'mail', 'exemple', 'simplyabox', 'yourcompany', 'negostice', 'gmx', 'www', 'numericable', 'mangopay', 'cssf', 'noos', 'x2', '2x-300x300', 'promobile']


## Cleaning

In [8]:
# Work on a copy so the raw extraction stays untouched by the cleaning steps.
features_from_warcs = extracted_from_warcs.copy()

In [9]:
# Build the `own_email_domain` feature: 1 when at least one of the row's email
# domains is NOT in the recurring list, 0 otherwise.
#
# NOTE(review): a row with an empty domain list also ends up with 0, i.e. it is
# indistinguishable from "only recurring domains" and is not encoded as a
# missing value.
#
# The column is addressed by name rather than by position so the cell keeps
# working if the CSV column order changes; each cell holds the string repr of a
# list and is parsed with ast.literal_eval.
recurring_lookup = set(recurring_email_domains)  # constant-time membership tests

own_email_domain_all = [
    int(any(domain not in recurring_lookup for domain in ast.literal_eval(serialized_domains)))
    for serialized_domains in extracted_from_warcs['email_domains']
]

# The list is in row order, so it lines up positionally with the copied frame.
features_from_warcs['own_email_domain'] = own_email_domain_all
features_from_warcs.head()

Unnamed: 0,email_domains,https_flag,siren,trust_av_flag,trust_flag,trust_sag_flag,trust_ts_flag,own_email_domain
0,"['gmail', 'hotmail']",False,512719188,False,False,False,False,0
1,"['easypets', 'petsafe', 'agecom-vet', 'cano-co...",True,530772565,False,True,False,True,1
2,[],True,789521945,False,False,False,False,0
3,"['domain', 'domaine', 'a4', 'recreatec']",True,388336000,False,False,False,False,1
4,"['email', 'prestashop', 'avis-verifies']",True,519216188,True,True,False,False,0


# Clean rest

In [10]:
# Cast the boolean flags to 0/1 integers in one vectorised step (instead of an
# element-wise `apply` repeated per column), then keep only the model features
# in their final column order. `email_domains` is dropped here because it has
# been replaced by `own_email_domain`.
flag_columns = ['https_flag', 'trust_av_flag', 'trust_flag', 'trust_sag_flag', 'trust_ts_flag']
feature_columns = ['siren', 'https_flag', 'own_email_domain', 'trust_av_flag', 'trust_sag_flag', 'trust_ts_flag', 'trust_flag']

features_from_warcs = features_from_warcs.astype(
    {column: 'int64' for column in flag_columns}
)[feature_columns]
features_from_warcs.head()

Unnamed: 0,siren,https_flag,own_email_domain,trust_av_flag,trust_sag_flag,trust_ts_flag,trust_flag
0,512719188,0,0,0,0,0,0
1,530772565,1,1,0,0,1,1
2,789521945,1,0,0,0,0,0
3,388336000,1,1,0,0,0,0
4,519216188,1,0,1,0,0,1


In [11]:
# Row count should be unchanged by the cleaning above; only the columns differ.
features_from_warcs.shape

(3626, 7)

# Filter by SIREN

In [16]:
# Keep only the rows whose SIREN is present in the array saved by the cleaning
# stage. `Series.isin` builds the boolean mask in one vectorised pass instead of
# scanning the whole array once per row in a Python loop.
sirens = np.load('/project/0_cleaning/output_cleaning/sirens.npy').astype(int)
to_keep = features_from_warcs['siren'].isin(sirens)

features_from_warcs = features_from_warcs[to_keep]
features_from_warcs.shape

(3573, 7)

# Save features

In [17]:
# Sanity check before saving: the number of distinct SIRENs should equal the
# number of rows, i.e. one row per company.
print(
    features_from_warcs.shape,
    len(features_from_warcs['siren'].unique()),
    sep='\n',
)

(3573, 7)
3573


In [14]:
# Persist the cleaned features for the next pipeline stage; the index is not
# written to the file.
# NOTE(review): the recorded execution counts are out of order (this cell ran
# as In [14], the SIREN filter as In [16]) - restart the kernel and run all
# cells top to bottom before relying on the saved file.
features_from_warcs.to_csv('/project/1_feature_extraction/output_feature_extraction/features_from_warcs.csv', index=False)

In [15]:
# Final per-feature counts of 1s; the `siren` total is an identifier sum and
# carries no meaning.
features_from_warcs.sum()

siren               2103535968876
https_flag                   1363
own_email_domain             1600
trust_av_flag                 170
trust_sag_flag                 21
trust_ts_flag                  54
trust_flag                    244
dtype: int64