In [1]:
import json
from warcio.archiveiterator import ArchiveIterator
import os
import time
import pandas as pd
import tqdm

# Find e-commerce crawls in each wapp directory

In [2]:
# Directories to scan: /var/www/wapp-00/ through /var/www/wapp-09/.
paths = list(map(lambda i: f'/var/www/wapp-0{i}/', range(10)))
paths

['/var/www/wapp-00/',
 '/var/www/wapp-01/',
 '/var/www/wapp-02/',
 '/var/www/wapp-03/',
 '/var/www/wapp-04/',
 '/var/www/wapp-05/',
 '/var/www/wapp-06/',
 '/var/www/wapp-07/',
 '/var/www/wapp-08/',
 '/var/www/wapp-09/']

In [3]:
# Names of the relevant (e-commerce) applications, scraped from
# https://www.wappalyzer.com/datasets.
# extract_ecom() tests detected application names against this list with
# plain `in` membership, so the spelling and casing here must match the
# detected names exactly.
ecom_apps = ['3dCart', '91App', 'Afosto', 'AfterBuy', 'Arastta', 'Avangate', 'BigBangShop', 'Bigcommerce', 'Bigware', 'Bizweb', 'Blesta', 'Botble CMS', 'Clientexec', 'CloudCart', 'ColorMeShop', 'Comandia', 'Combeenation', 'Commerce Server', 'Cosmoshop', 'Craft Commerce', 'CS Cart', 'CubeCart', 'Drupal Commerce', 'Dynamicweb', 'EC-CUBE', 'Elcodi', 'EPages', 'eZ Publish', 'Fastcommerce', 'Fbits', 'Fortune3', 'Future Shop', 'FWP', 'Gambio', 'Haravan', 'Hinza Advanced CMS', 'Hybris', 'IBM WebSphere Commerce', 'iCongo', 'Ideasoft', 'IdoSell Shop', 'Intershop', 'INTI', 'iPresta', 'JET Enterprise', 'Jetshop', 'JTL Shop', 'Kajabi', 'Kamva', 'Klarna Checkout', 'KobiMaster', 'Lightspeed eCom', 'Magento', 'MakeShopKorea', 'Melis CMS V2', 'Mietshop', 'Modified', 'Moguta.CMS', 'Mondo Media', 'MYPAGE Platform', 'NEO - Omnichannel Commerce Platform', 'Neto', 'Netsuite', 'nopCommerce', 'Odoo', 'Open Classifieds', 'Open eShop', 'OpenCart', 'Oracle Commerce', 'Oracle Commerce Cloud', 'osCommerce', 'osCSS', 'OXID eShop', 'Pimcore', 'Plentymarkets', 'Powergap', 'PrestaShop', 'Projesoft', 'Proximis Omnichannel', 'Proximis Web to Store', 'Quick.Cart', 'Rakuten DBCore', 'Rakuten Digital Commerce', 'RBS Change', 'Robin', 'Rocket', 'Salesforce Commerce Cloud', 'Sazito', 'Shopatron', 'Shopcada', 'Shoper', 'shoperfa', 'Shopery', 'Shopfa', 'Shopify', 'Shopline', 'Shoptet', 'Shopware', 'Smartstore', 'SoftTr', 'Solusquare OmniCommerce Cloud', 'Spree', 'Store Systems', 'Storeden', 'Strato', 'Textalk', 'Thelia', 'Ticimax', 'Tictail', 'TomatoCart', 'TotalCode', 'Tray', 'Ubercart', 'UltraCart', 'Venda', 'vibecommerce', 'VirtueMart', 'Volusion (V1)', 'Volusion (V2)', 'VP-ASP', 'VTEX', 'VTEX Integrated Store', 'Websale', 'WEBXPAY', 'WHMCS', 'Wikinggruppen', 'WooCommerce', 'Woosa', 'X-Cart', 'Xanario', 'Xonic', 'xtCommerce', 'Yahoo! Ecommerce', 'Zen Cart', 'Zeuscart']
# Echo the list so the notebook output records exactly which names were used.
print(ecom_apps)

['3dCart', '91App', 'Afosto', 'AfterBuy', 'Arastta', 'Avangate', 'BigBangShop', 'Bigcommerce', 'Bigware', 'Bizweb', 'Blesta', 'Botble CMS', 'Clientexec', 'CloudCart', 'ColorMeShop', 'Comandia', 'Combeenation', 'Commerce Server', 'Cosmoshop', 'Craft Commerce', 'CS Cart', 'CubeCart', 'Drupal Commerce', 'Dynamicweb', 'EC-CUBE', 'Elcodi', 'EPages', 'eZ Publish', 'Fastcommerce', 'Fbits', 'Fortune3', 'Future Shop', 'FWP', 'Gambio', 'Haravan', 'Hinza Advanced CMS', 'Hybris', 'IBM WebSphere Commerce', 'iCongo', 'Ideasoft', 'IdoSell Shop', 'Intershop', 'INTI', 'iPresta', 'JET Enterprise', 'Jetshop', 'JTL Shop', 'Kajabi', 'Kamva', 'Klarna Checkout', 'KobiMaster', 'Lightspeed eCom', 'Magento', 'MakeShopKorea', 'Melis CMS V2', 'Mietshop', 'Modified', 'Moguta.CMS', 'Mondo Media', 'MYPAGE Platform', 'NEO - Omnichannel Commerce Platform', 'Neto', 'Netsuite', 'nopCommerce', 'Odoo', 'Open Classifieds', 'Open eShop', 'OpenCart', 'Oracle Commerce', 'Oracle Commerce Cloud', 'osCommerce', 'osCSS', 'OXID eS

In [4]:
def extract_wapps(path):
    """Collect the application names listed in every JSON file of a directory.

    Each entry of ``path`` is opened and parsed as JSON; the ``name`` of every
    element under its ``applications`` key is collected.

    Args:
        path: Directory holding the JSON files. A trailing path separator is
            optional.

    Returns:
        dict mapping each file name (as returned by ``os.listdir``) to the
        list of application names found in that file. Files that cannot be
        decoded as JSON (for example empty files) map to an empty list.

    Raises:
        OSError: if ``path`` cannot be listed or an entry cannot be opened.
        KeyError: if a parsed file lacks the ``applications`` key, or one of
            its entries lacks ``name``.
    """
    wapps_per_warc = {}

    # Extract wapps used by all crawls in path
    for file in os.listdir(path):
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, file)) as handle:
            try:
                json_loaded = json.load(handle)
            except ValueError:
                # Some json files are empty or malformed. json.JSONDecodeError
                # (and UnicodeDecodeError) derive from ValueError; catching
                # only that keeps KeyboardInterrupt and genuine I/O errors
                # visible instead of silently recording an empty result.
                wapps_per_warc[file] = []
                continue

        wapps_per_warc[file] = [app['name'] for app in json_loaded['applications']]

    return wapps_per_warc

In [5]:
def extract_ecom(dict_wapps, ecom_apps):
    """Keep only the crawls that use at least one e-commerce application.

    Args:
        dict_wapps: Mapping of crawl file name -> list of application names
            detected for that crawl.
        ecom_apps: Iterable of application names that count as e-commerce.
            Names are compared by exact equality and must be hashable
            (plain strings in this notebook).

    Returns:
        Tuple ``(ecommerce, wapps)`` of two parallel lists:
        ``ecommerce[i]`` is a crawl file name and ``wapps[i]`` is the list of
        e-commerce application names found for it, in their original order
        (duplicates preserved). Crawls without any match are omitted.
    """
    # Set membership is O(1); the original list scan ran once per detected app.
    ecom_lookup = frozenset(ecom_apps)

    ecommerce = []
    wapps = []

    # Mapping keys are unique, so each crawl is appended at most once without
    # needing the quadratic `key not in ecommerce` scan.
    for key, values in dict_wapps.items():
        wapps_for_key = [value for value in values if value in ecom_lookup]
        if wapps_for_key:
            ecommerce.append(key)
            wapps.append(wapps_for_key)

    return ecommerce, wapps

In [6]:
def save_csv(ecommerce, wapps, iteration):
    """Write the matches of one wapp folder to a CSV in the working directory.

    Args:
        ecommerce: List of crawl file names, one per output row.
        wapps: List parallel to ``ecommerce`` holding the matching
            application names for each row.
        iteration: Folder number; used both for the ``wapp_folder`` column
            value and for the output file name.

    The file has the columns ``ecommerce``, ``wapp_folder`` and ``wapps`` and
    is written without the index column.
    """
    table = pd.DataFrame({
        'ecommerce': ecommerce,
        # Scalar value: broadcast to every row of the frame.
        'wapp_folder': f'wapp-0{iteration}',
        'wapps': wapps,
    })
    table.to_csv(f'ecom-wapp-0{iteration}.csv', index=False)

In [7]:
# For every wapp directory: collect the detected applications, keep the
# e-commerce crawls, and write them to ecom-wapp-0<iteration>.csv.
# tqdm wraps the list itself rather than the enumerate() generator, so it
# knows the total and can render a real progress bar with an ETA.
for iteration, path in enumerate(tqdm.tqdm(paths)):

    dict_wapps = extract_wapps(path)
    ecommerce, wapps = extract_ecom(dict_wapps, ecom_apps)
    save_csv(ecommerce, wapps, iteration)

10it [40:52, 164.13s/it]


In [8]:
# Spot-check the first per-folder result file before combining them.
df = pd.read_csv(filepath_or_buffer='ecom-wapp-00.csv')
df.head(n=5)

Unnamed: 0,ecommerce,wapp_folder,wapps
0,CC-MAIN-20171119004302-20171119024302-00601.wa...,wapp-00,['PrestaShop']
1,CC-MAIN-20171120032907-20171120052907-00562.wa...,wapp-00,['OpenCart']
2,CC-MAIN-20171124180349-20171124200349-00561.wa...,wapp-00,['Magento']
3,CC-MAIN-20171123050243-20171123070243-00112.wa...,wapp-00,['OpenCart']
4,CC-MAIN-20171117223659-20171118003659-00491.wa...,wapp-00,['Magento']


# Join the per-directory e-commerce results into one CSV

In [9]:
# Read all ten per-folder files and concatenate them once.
# Building the frame from the files alone (instead of appending to whatever
# `df` currently holds) makes this cell safe to re-run: it no longer
# duplicates folders 01-09 on a second execution, and a single concat avoids
# re-copying the growing frame on every iteration.
frames = [pd.read_csv(f'ecom-wapp-0{i}.csv') for i in range(10)]
df = pd.concat(frames, ignore_index=True)
df.to_csv('/project/0_cleaning/output_cleaning/ecom-full.csv', index=False)

In [10]:
# (rows, columns) of the concatenated frame.
df.shape

(605987, 3)