In [2]:
# %% Import Libraries
from pathlib import Path

import os
import sys
import pickle
from tqdm import tqdm
import itertools
import argparse
import collections
from collections import Counter, defaultdict
from multiprocessing import Pool

import textdistance

import numpy as np
import pandas as pd

import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# %% Loading data
print("Loading Data ....")

# Factor used to express Silk Road BTC prices in USD (value carried over
# unchanged from the original cell).
BTC_TO_USD = 54.46

# Columns read from the AlphaBay / Dream item dumps.
ITEM_COLUMNS = ['marketplace', 'title', 'vendor', 'prediction', 'ships_to', 'ships_from', 'description']

# Feedback dumps name the vendor 'reciever' (spelling as in the CSV header) and
# the listing title 'order_title'; rename by name so the result does not depend
# on the order of the columns inside the file.
FEEDBACK_RENAME = {'reciever': 'vendor', 'order_title': 'title'}
FEEDBACK_COLUMNS = list(FEEDBACK_RENAME) + ['order_amount_usd']

# Silk Road uses a different schema; map it onto the common column names.
SILK_RENAME = {
    'category': 'prediction',
    'price_btc': 'order_amount_usd',
    'ship_to': 'ships_to',
    'ship_from': 'ships_from',
    'seller_id': 'vendor',
    'listing_description': 'description',
}
SILK_COLUMNS = ['marketplace', 'title'] + list(SILK_RENAME)

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour of
# `on_bad_lines`; pick whichever keyword the installed pandas understands.
# 'warn' mirrors the old behaviour (skip the row, report it).
_PANDAS_VERSION = tuple(int(part) for part in pd.__version__.split('.')[:2])
_BAD_LINES_KWARGS = {'on_bad_lines': 'warn'} if _PANDAS_VERSION >= (1, 3) else {'error_bad_lines': False}


def _read_market_csv(path, usecols):
    """Read the selected columns of a marketplace CSV, skipping malformed rows."""
    return pd.read_csv(path, lineterminator='\n', usecols=usecols, **_BAD_LINES_KWARGS)


alpha_items_df = _read_market_csv("../data/non-anonymous/alphabay/items.csv", ITEM_COLUMNS)
alpha_feedback_df = _read_market_csv("../data/non-anonymous/alphabay/feedbacks.csv", FEEDBACK_COLUMNS)
alpha_feedback_df = alpha_feedback_df.rename(columns=FEEDBACK_RENAME)
alpha_df = alpha_items_df.merge(alpha_feedback_df, how='inner', on=['title', 'vendor']).drop_duplicates()

dreams_items_df = _read_market_csv("../data/non-anonymous/dream/items.csv", ITEM_COLUMNS)
dreams_feedback_df = _read_market_csv("../data/non-anonymous/dream/feedbacks.csv", FEEDBACK_COLUMNS)
dreams_feedback_df = dreams_feedback_df.rename(columns=FEEDBACK_RENAME)
dreams_df = dreams_items_df.merge(dreams_feedback_df, how='inner', on=['title', 'vendor']).drop_duplicates()

silk_df = (
    _read_market_csv("../data/non-anonymous/silk-road/items.csv", SILK_COLUMNS)
    .drop_duplicates()
    .rename(columns=SILK_RENAME)
)
# Vectorised conversion of the BTC price column to USD.
silk_df['order_amount_usd'] = silk_df['order_amount_usd'] * BTC_TO_USD

df_dict = {"alpha": alpha_df, "silk": silk_df, "dreams": dreams_df}

Loading Data ....


In [None]:
# Unique vendor identifiers seen on each marketplace, and their union.
silk_vendors = list(silk_df['vendor'].unique())
alpha_vendors = list(alpha_df['vendor'].unique())
dreams_vendors = list(dreams_df['vendor'].unique())
all_vendors = {*silk_vendors, *dreams_vendors, *alpha_vendors}

In [None]:
import time
def seq_distance(sequence_comb):
    """Return the mean of three normalised text-similarity scores for a pair.

    Parameters
    ----------
    sequence_comb : pair (tuple or list of length 2)
        The two sequences to compare; each item is coerced with ``str()``.

    Returns
    -------
    float
        Average of the longest-common-subsequence, longest-common-substring
        and Ratcliff-Obershelp normalised similarities, rounded to 4 decimals.

    Raises
    ------
    ValueError
        If ``sequence_comb`` does not unpack into exactly two items.
    """
    sequence1, sequence2 = sequence_comb
    sequence1 = str(sequence1)
    sequence2 = str(sequence2)
    # Longest common subsequence similarity
    dist1 = textdistance.lcsseq.normalized_similarity(sequence1, sequence2)
    # Longest common substring similarity
    dist2 = textdistance.lcsstr.normalized_similarity(sequence1, sequence2)
    # Ratcliff-Obershelp similarity
    dist3 = textdistance.ratcliff_obershelp.normalized_similarity(sequence1, sequence2)
    return round((dist1 + dist2 + dist3)/3, 4)

In [None]:
def compute_distance_within_market(text_list1, text_list2, mode='within'):
    """Score text pairs with ``seq_distance`` in a worker pool.

    Parameters
    ----------
    text_list1 : list
        Texts of the first group.
    text_list2 : list
        Texts of the second group; only used when ``mode == 'between'``.
    mode : {'within', 'between'}
        'within'  -> every unordered pair drawn from ``text_list1``.
        'between' -> every distinct (a, b) pair with ``a`` from ``text_list1``
        and ``b`` from ``text_list2``, excluding pairs whose two texts are equal.

    Returns
    -------
    list
        One ``seq_distance`` score per pair; ``[]`` when there are no pairs.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'within' nor 'between'.
    """
    if mode == 'between':
        # The cross product yields the same set of pairs the permutation-based
        # construction produced, without the factorial enumeration and without
        # collapsing to nothing when text_list1 is shorter than text_list2.
        cross_pairs = (
            pair for pair in itertools.product(text_list1, text_list2)
            if pair[0] != pair[1]
        )
        # dict.fromkeys removes duplicate pairs while keeping a stable order.
        all_combinations = list(dict.fromkeys(cross_pairs))
    elif mode == 'within':
        all_combinations = list(itertools.combinations(text_list1, 2))
    else:
        raise ValueError("mode must be 'within' or 'between', got {!r}".format(mode))

    # Nothing to score: avoid spinning up worker processes for an empty job.
    if not all_combinations:
        return []

    # The context manager releases the workers even if a task raises.
    with Pool() as pool:
        return pool.map(seq_distance, all_combinations)

def extract_title_description(df):
    """Build one combined text per row from the 'title' and 'description' columns.

    Each returned string has the form
    ``"Title : <title> \\n Description : <description>"`` with both values
    passed through ``str()``. The list follows the row order of ``df``.
    """
    combined = []
    for title, description in zip(df['title'], df['description']):
        title_part = "Title : " + str(title)
        description_part = "Description : " + str(description)
        combined.append(title_part + ' \n ' + description_part)
    return combined

In [None]:
def calculate_distance_between_shared_vendors(max_adverts=50):
    """Print and return mean advert similarity for vendors found in all three frames.

    Vendors whose lower-cased name occurs in ``alpha_df``, ``dreams_df`` and
    ``silk_df`` are selected. For every such vendor with at most
    ``max_adverts`` rows, the adverts (title + description) are compared
    within each marketplace and between every pair of marketplaces the vendor
    appears on.

    Side effects
    ------------
    The ``vendor`` column of the module-level ``alpha_df``, ``dreams_df`` and
    ``silk_df`` is lower-cased in place. One block is printed per comparison.

    Parameters
    ----------
    max_adverts : int, default 50
        Vendors with more rows than this in the combined frame are skipped.

    Returns
    -------
    dict
        ``{(vendor, market1, market2): mean_similarity}``; the value is
        ``-1.0`` when there were too few adverts or no comparable pairs.
    """
    market_frames = (alpha_df, dreams_df, silk_df)

    # Lower-case vendor names so the same handle matches across frames.
    for market_df in market_frames:
        market_df['vendor'] = market_df['vendor'].apply(lambda x: str(x).lower())

    shared_vendors = set.intersection(
        *(set(market_df['vendor'].unique()) for market_df in market_frames)
    )

    df = pd.concat(
        [market_df[market_df['vendor'].isin(shared_vendors)] for market_df in market_frames]
    )
    df['marketplace'] = df['marketplace'].apply(lambda x: str(x).lower())

    # These labels are reused below to filter on df['marketplace'], so each must
    # equal the lower-cased marketplace value exactly ('dream', not 'dreams').
    known_markets = ['alphabay', 'valhalla', 'dream', 'berlusconi', 'traderoute', 'silk road 1']
    vendors_by_market = {
        market: set(df.loc[df['marketplace'] == market, 'vendor'])
        for market in known_markets
    }

    # Marketplaces each sufficiently small vendor appears on, in known_markets order.
    vendor_market = {
        vendor: [market for market in known_markets if vendor in vendors_by_market[market]]
        for vendor, advert_count in Counter(df['vendor']).items()
        if advert_count <= max_adverts
    }

    vendor_dict = {}
    for vendor, markets in vendor_market.items():
        vendor_df = df[df['vendor'] == vendor]
        # Cross-market pairs first, then each market against itself.
        all_comb = list(itertools.combinations(markets, 2)) + [(market, market) for market in markets]
        for market1, market2 in all_comb:
            temp_df1 = vendor_df[vendor_df['marketplace'] == market1]
            temp_df2 = vendor_df[vendor_df['marketplace'] == market2]

            sequence_distance = []
            # Same threshold as before: more than two rows in total (for a
            # within-market comparison the same rows are counted twice).
            if temp_df1.shape[0] + temp_df2.shape[0] > 2:
                text_list1 = extract_title_description(temp_df1)
                if market1 != market2:
                    text_list2 = extract_title_description(temp_df2)
                    sequence_distance = compute_distance_within_market(text_list1, text_list2, mode='between')
                else:
                    sequence_distance = compute_distance_within_market(text_list1, text_list1)

            # -1.0 marks "nothing to compare" instead of letting np.mean([])
            # emit a warning and return nan.
            if len(sequence_distance) > 0:
                mean_distance = float(np.mean(sequence_distance))
            else:
                mean_distance = -1.0

            vendor_dict[(vendor, market1, market2)] = mean_distance

            print("#############################################")
            print(vendor, market1, market2)
            print(mean_distance)

    return vendor_dict

In [None]:
# Run the shared-vendor comparison; it prints one block per (vendor, market1, market2).
# NOTE(review): the saved output below contains "lcsseq :" / "Total time taken :" lines that
# the functions in this notebook do not print — it looks stale; re-run to refresh.
calculate_distance_between_shared_vendors()

lcsseq :  0.4
lcsstr :  0.08947368421052626
ratcliff_obershelp :  0.360128617363344
Total time taken :lcsseq :   0.068608999252319340.3162393162393162lcsseq : 
 
0.28125
lcsstr : lcsseq :  lcsseq : 0.07264957264957261 
lcsseq :   0.16575591985428050.1686274509803921

lcsstr : 0.17277486910994766ratcliff_obershelp :  0.3605633802816901
 lcsseq : lcsstr : 
 Total time taken :0.05902777777777779 0.143292682926829280.030965391621129323
lcsstr : 
 0.1844182014465332
lcsseq :  0.033333333333333326

 0.16968698517298186
lcsstr :  0.05061082024432806
ratcliff_obershelp :  ratcliff_obershelp : ratcliff_obershelp : 0.20298507462686566  0.31295843520782396lcsstr : 
0.20919175911251986 Total time taken :0.03130148270181221 
ratcliff_obershelp : 
 0.25405120849609375
0.26224783861671463
Total time taken :
Total time taken :  0.261646747589111330.26139259338378906

Total time taken : ratcliff_obershelp : 0.26187729835510254 lcsstr :  0.2719780219780219

0.02591463414634143Total time taken : 0.316827

In [63]:
# Advert count per vendor.
# NOTE(review): `df` is not defined at notebook top level in the visible cells
# (it is a local inside calculate_distance_between_shared_vendors) — confirm its origin.
dict_a = {vendor: count for vendor, count in Counter(df['vendor']).items()}


In [14]:
# Show which marketplace each vendor row belongs to.
# NOTE(review): relies on a top-level `df` that no visible cell creates — confirm its origin.
df.loc[:, ['marketplace', 'vendor']]

Unnamed: 0,marketplace,vendor
849,Alphabay,thecorporation
1136,Alphabay,frankmatthews
1893,Alphabay,angelina
1982,Alphabay,ozconnection
2650,Alphabay,grandwizardslair
...,...,...
1109095,Silk Road 1,mrcronk
1109101,Silk Road 1,maligan
1109125,Silk Road 1,dimercurio
1109138,Silk Road 1,medsforyou


In [15]:
# Distinct marketplace labels present in the combined frame.
# NOTE(review): relies on a top-level `df` that no visible cell creates — confirm its origin.
df['marketplace'].unique()

array(['Alphabay', 'Valhalla', 'Dream', 'Berlusconi', 'Traderoute',
       'Silk Road 1'], dtype=object)

In [20]:
# Vendors with at least one 'Alphabay' row; this rebinds the `alpha_vendors`
# list built earlier from alpha_df.
# NOTE(review): relies on a top-level `df` that no visible cell creates — confirm its origin.
is_alphabay = df['marketplace'] == 'Alphabay'
alpha_vendors = list(df.loc[is_alphabay, 'vendor'].unique())

In [21]:
# Display the AlphaBay vendor names.
# NOTE(review): the saved output is a numpy array repr although the preceding cell builds a
# list — the output appears to predate the current code; re-run to refresh.
alpha_vendors

array(['thecorporation', 'frankmatthews', 'angelina', 'ozconnection',
       'grandwizardslair', 'indianpilldaddy', 'bionik', 'threekings',
       'aussiegear', 'letswork', 'dankmedgradekush', 'b1g1mpact',
       'lindalovelace', 'kittenhuffer', 'peaceandpot', 'domesticdoode',
       'gotmilk', 'kobrivoje', 'theemeraldtriangle', 'felixuk',
       'megrimlock', 'mrcronk', 'blueviking', 'dimercurio', 'sayno2drugs',
       'swisshound78', 'koptevo', 'sildenafil', 'canjam420',
       'puffinbilly', 'candyshop', 'utopic', 'mahakala', 'dutchquality',
       'drshrooms', 'utopia', 'rabbithole', 'carlos lopez', 'medsforyou',
       'fatsam', 'maligan', 'digitalpunk', 'p3nd8s', 'righteous',
       'meerkovo', 'whiteyford', 'scidmt', 'hackthegibson',
       'nzt48givesyouwings', 'biocanna', 'premiumgear', 'hanfhenk',
       'mariosgramshoppe', 'namedeclined', 'revenantchild',
       'thecollective', 'deemzbeemz', 'mushroomgirl', 'cyanspore'],
      dtype=object)