# Importing Libraries

In [1]:
import os
import re
import numpy as np
import pandas as pd
from pandas import read_excel
from tqdm import tqdm
import pickle
import collections
from collections import Counter

import emoji
import itertools

import spacy
from spacy_cld import LanguageDetector
import contextualSpellCheck

import matplotlib.pyplot as plt

import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

In [2]:
nlp = spacy.load('en_core_web_sm')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# Loading data

Alphabay Dataset

In [3]:
alpha_listing_df = pd.read_csv("../../data/non-anonymous/alphabay/items.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['title', 'vendor', 'first_observed', 'last_observed', 'prediction', 'total_sales', 'ships_to', 'ships_from', 'description'])

In [None]:
alpha_feedback_df = pd.read_csv("../../data/non-anonymous/alphabay/feedbacks.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['reciever', 'message', 'order_title', 'order_amount_usd'])
alpha_feedback_df.columns = ['vendor', 'message', 'title', 'order_amount_usd']

In [None]:
alphabay_df = alpha_listing_df.merge(alpha_feedback_df, how = 'inner', on = ['title', 'vendor'])

In [None]:
alphabay_df.to_csv("../../data/non-anonymous/alphabay/alphabay.csv")

In [7]:
alphabay_df.shape

(1771258, 11)

Dream (Valhalla) Dataset

In [6]:
dreams_listing_df = pd.read_csv("../../data/non-anonymous/dream/items.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['title', 'vendor', 'first_observed', 'last_observed', 'prediction', 'total_sales', 'ships_to', 'ships_from', 'description'])

In [7]:
dreams_feedback_df = pd.read_csv("../../data/non-anonymous/dream/feedbacks.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['reciever', 'message', 'order_title', 'order_amount_usd'])
dreams_feedback_df.columns = ['vendor', 'message', 'title', 'order_amount_usd']

In [8]:
dreams_df = dreams_feedback_df.merge(dreams_listing_df, how = 'inner', on = ['title', 'vendor'])

In [11]:
dreams_df.to_csv("../../data/non-anonymous/alphabay/valhalla.csv")

In [12]:
dreams_df.shape

(1816855, 11)

Silk Road

In [9]:
silk_listing_df = pd.read_csv("../../data/non-anonymous/silk-road/items.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['title', 'seller_id', 'category', 'ship_to', 'ship_from', 'listing_description', 'price_btc'])
# silk_listing_df.columns = ['title', 'vendor', 'prediction', 'ships_to', 'ships_from', 'description', 'price_btc']

In [10]:
silk_listing_df.columns = ['title', 'prediction', 'price_btc', 'ships_to', 'ships_from', 'vendor', 'description']

In [11]:
silk_listing_df.shape

(1109155, 7)

In [11]:
silk_feedback_df = pd.read_csv("../../data/non-anonymous/silk-road/feedbacks.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['reciever', 'message', 'order_title'])
silk_feedback_df.columns = ['vendor', 'message', 'title']

In [17]:
silk_feedback_df.shape

(10581507, 3)

In [None]:
silk_df = silk_feedback_df.merge(silk_listing_df, how = 'inner', on = ['title', 'vendor'])

# Loading processed datasets to generate insights

In [None]:
import os
if not os.path.exists("../../plots/non-anonymous"):
    os.makedirs("../../plots/non-anonymous")

Alphabay insights

In [None]:
fig = go.Figure(data=[go.Pie(labels=['Cannabis','Stimulants','Ecstasy','Opioids','Digital Goods','Benzos','Misc',
                                    'Other', 'Psychedelics', 'Dissociatives', 'Prescription'],
                             values=[25968,15323,9468,6911,17568,5194,3495,5388,6727,1982,4001])])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alphabay.pdf"), auto_open=False)

Valhalla insights

In [None]:
dream_stats = Counter(dreams_df['prediction'])

In [None]:
fig = go.Figure(data=[go.Pie(labels=list(dream_stats.keys()),
                             values=list(dream_stats.values()))])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "valhalla.pdf"), auto_open=False)

Silk Road

In [21]:
silk_stats = dict(Counter(silk_listing_df['prediction']).most_common(25))

In [22]:
fig = go.Figure(data=[go.Pie(labels=list(silk_stats.keys()),
                             values=list(silk_stats.values()))])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "silk.pdf"), auto_open=False)

'../../plots/non-anonymous/silk.pdf.html'

# Looking into the vendor stats

In [24]:
alpha_vendors = alphabay_df['vendor'].nunique()
dream_vendors = dreams_df['vendor'].nunique()
silk_vendors = silk_listing_df['vendor'].nunique()

In [25]:
x = ["Alphabay", "Dreams", "Silk Board"]
y = [alpha_vendors, dream_vendors, silk_vendors]

In [28]:
fig = go.Figure()
fig.add_bar(x=x,y=y)
fig.update_layout(barmode="relative", xaxis_title="Markets", yaxis_title="No. of Vendors")
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "no_of_vendors.pdf"), auto_open=False)

'../../plots/non-anonymous/no_of_vendors.pdf.html'

# Sybils in Alphabay and Dreams dataset

In [16]:
ad_sybils = set(alphabay_df['vendor'].unique()).intersection(set(dreams_df['vendor'].unique()))
ad_sybils = [str(sybil).lower() for sybil in ad_sybils]
alpha_dreams = pd.concat([alphabay_df, dreams_df], axis=0).drop(columns=['message'])
alpha_dreams.drop_duplicates(inplace=True)

In [None]:
vendor_list = list(alpha_dreams['vendor'])
vendor_list = [str(vendor).lower() for vendor in vendor_list]

for index, vendor in enumerate(vendor_list):
    if vendor in ad_sybils:
        pass
    else:
        vendor_list[index] = 'others'

In [None]:
alpha_dreams_stats = dict(Counter(vendor_list))
del alpha_dreams_stats['others']
alpha_dreams_stats = {k: v for k, v in sorted(alpha_dreams_stats.items(), key=lambda item: item[1], reverse=True)}

In [None]:
fig = go.Figure()
fig.add_bar(x=tuple(alpha_dreams_stats.keys()),y=list(alpha_dreams_stats.values()))
fig.update_layout(barmode="relative", xaxis_title="Vendors", yaxis_title="No. of advertisements", title="Number of Sybils found: " + str(len(ad_sybils)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_dreams_sybils.pdf"), auto_open=False)

# Sybils in Alphabay and Silk Road dataset

In [11]:
alphabay_df_temp = alphabay_df.copy()
alphabay_df_temp.drop(columns=['first_observed', 'last_observed', 'message', 'total_sales'], inplace=True)

In [12]:
silk_listing_df['price_btc'] = silk_listing_df['price_btc'].apply(lambda x : float(x*57.95))
silk_listing_df.rename(columns={"price_btc": "order_amount_usd"}, inplace=True)

In [13]:
alpha_silk = pd.concat([alphabay_df_temp, silk_listing_df], axis=0)
alpha_silk.drop_duplicates(inplace=True)

In [94]:
as_sybils = set(alphabay_df['vendor'].unique()).intersection(set(silk_listing_df['vendor'].unique()))
as_sybils = [str(sybil).lower() for sybil in as_sybils]

In [95]:
vendor_list = list(alpha_silk['vendor'])
vendor_list = [str(vendor).lower() for vendor in vendor_list]

for index, vendor in enumerate(vendor_list):
    if vendor in sybils:
        pass
    else:
        vendor_list[index] = 'others'

In [96]:
alpha_silk_stats = dict(Counter(vendor_list))
del alpha_silk_stats['others']
alpha_silk_stats = {k: v for k, v in sorted(alpha_silk_stats.items(), key=lambda item: item[1], reverse=True)}

In [97]:
fig = go.Figure()
fig.add_bar(x=tuple(alpha_silk_stats.keys()),y=list(alpha_silk_stats.values()))
fig.update_layout(barmode="relative", xaxis_title="Vendors", yaxis_title="No. of advertisements", title="Number of Sybils found: " + str(len(as_sybils)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_silk_sybils.pdf"), auto_open=False)

'../../plots/non-anonymous/alpha_silk_sybils.pdf.html'

# Sybils in Dreams and Silk Road dataset

In [14]:
dreams_df_temp = dreams_df.copy()
dreams_df_temp.drop(columns=['first_observed', 'last_observed', 'message', 'total_sales'], inplace=True)

In [15]:
dreams_silk = pd.concat([dreams_df_temp, silk_listing_df], axis=0)
dreams_silk.drop_duplicates(inplace=True)

In [19]:
ds_sybils = set(dreams_df['vendor'].unique()).intersection(set(silk_listing_df['vendor'].unique()))
ds_sybils = [str(sybil).lower() for sybil in ds_sybils]

In [100]:
vendor_list = list(dreams_silk['vendor'])
vendor_list = [str(vendor).lower() for vendor in vendor_list]

for index, vendor in enumerate(vendor_list):
    if vendor in sybils:
        pass
    else:
        vendor_list[index] = 'others'

In [101]:
dreams_silk_stats = dict(Counter(vendor_list))
del dreams_silk_stats['others']
dreams_silk_stats = {k: v for k, v in sorted(dreams_silk_stats.items(), key=lambda item: item[1], reverse=True)}

In [103]:
fig = go.Figure()
fig.add_bar(x=tuple(alpha_silk_stats.keys()),y=list(alpha_silk_stats.values()))
fig.update_layout(barmode="relative", xaxis_title="Vendors", yaxis_title="No. of advertisements", title="Number of Sybils found: " + str(len(ds_sybils)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "dreams_silk_sybils.pdf"), auto_open=False)

'../../plots/non-anonymous/dreams_silk_sybils.pdf.html'

# Checking the title and description stats in each of the 3 sybil datasets

Alpha-Dreams dataset

In [17]:
title_len = list(alpha_dreams['title'])
description_len = list(alpha_dreams['description'])

title_len = [str(title).lower().split(" ") for title in title_len]
description_len = [str(description).lower().split(" ") for description in description_len]

In [18]:
title_len = [len(title) for title in title_len]
description_len = [len(description) for description in description_len]

In [None]:
fig = ff.create_distplot([title_len], ['title'], show_hist=False, colors=['slategray'])
fig.update_layout(xaxis_title="Titles", yaxis_title="Length", title="Title distribution with an average length of " + str(np.mean(title_len)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_dream_title.pdf"), auto_open=False)

In [None]:
fig = ff.create_distplot([description_len], ['description'], show_hist=False, colors=['slategray'])
fig.update_layout(xaxis_title="Descriptions", yaxis_title="Length", title="Description distribution with an average length of " + str(np.mean(description_len)))
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_dream_description.pdf"), auto_open=False)

Alpha-Silk dataset

In [14]:
title_len = list(alpha_silk['title'])
description_len = list(alpha_silk['description'])
title_len = [str(title).lower().split(" ") for title in title_len]
description_len = [str(description).lower().split(" ") for description in description_len]
title_len = [len(title) for title in title_len]
description_len = [len(description) for description in description_len]

In [17]:
dreams_silk

Unnamed: 0,vendor,title,order_amount_usd,prediction,ships_to,ships_from,description
0,DreamWeaver,2.5g Jack Herer | UK Vendor | Free UK post,35.3700,Cannabis,Worldwide,United Kingdom,"\nSCANDINAVIAN CUSTOMERS - Please, read out T&..."
1,DreamWeaver,2.5g Jack Herer | UK Vendor | Free UK post,35.3700,Cannabis,Worldwide,United Kingdom,We would like to introduce the newest product ...
10,DreamWeaver,2.5g Jack Herer | UK Vendor | Free UK post,34.9700,Cannabis,Worldwide,United Kingdom,"\nSCANDINAVIAN CUSTOMERS - Please, read out T&..."
11,DreamWeaver,2.5g Jack Herer | UK Vendor | Free UK post,34.9700,Cannabis,Worldwide,United Kingdom,We would like to introduce the newest product ...
14,DreamWeaver,2.5g Jack Herer | UK Vendor | Free UK post,35.2000,Cannabis,Worldwide,United Kingdom,"\nSCANDINAVIAN CUSTOMERS - Please, read out T&..."
...,...,...,...,...,...,...,...
1109146,Halfway Crooks,1G - Strawberry Cheesecake- Next Day Delivery,16.2260,Yubikeys,United Kingdom,United Kingdom,Genetics: Chronic x Cheese Variety: Mostly Ind...
1109147,Space Factory,Speed 5G,66.0630,Yubikeys,Netherlands,Worldwide except USA,â–¡â– â– â– â– â– â–¡â–¡â–¡ â– â– â–¡â–¡â–¡â–¡...
1109149,IceIceIce,Ò³Ì¸Ò²Ì¸Ò³ 1.75g High Quality Chinese Ice/Meth...,494.3135,Yubikeys,Australia,Australia,â–ˆâ–€â–€â–€â–€â–€â–€â–€â–€â–€â–€â–€â–€â–€â–€â...
1109152,namedeclined,â™” Forged Service Dog ID / Fake Guide Dog ID ...,24.9185,Yubikeys,United States of America,Worldwide,This listing is for a forged / fake United Sta...


In [17]:
title_len = [len(title) for title in title_len]


In [18]:
description_len = [len(description) for description in description_len]

TypeError: object of type 'float' has no len()