# Importing Libraries

In [9]:
import os
import re
import numpy as np
import pandas as pd
from pandas import read_excel
from tqdm import tqdm
import pickle
import collections
from collections import Counter

import emoji
import demoji
demoji.download_codes()
import itertools

import spacy
from spacy_cld import LanguageDetector
import contextualSpellCheck

import matplotlib.pyplot as plt

import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings('ignore')

Downloading emoji data ...
... OK (Got response in 0.59 seconds)
Writing emoji data to /home/vageesh/.demoji/codes.json ...
... OK


In [10]:
# Load the 'en_core_web_sm' spaCy pipeline and append a spacy_cld
# LanguageDetector to it as an additional pipeline component.
nlp = spacy.load('en_core_web_sm')
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# Loading data

Alphabay Dataset

In [11]:
# Alphabay listings: read only the columns used by the analysis below.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases
# (replaced by `on_bad_lines`) — confirm the pinned pandas version.
alpha_listing_df = pd.read_csv(
    "../../data/non-anonymous/alphabay/items.csv",
    usecols=[
        'title', 'vendor', 'first_observed', 'last_observed', 'prediction',
        'total_sales', 'ships_to', 'ships_from', 'description',
    ],
    lineterminator='\n',
    error_bad_lines=False,
)

In [12]:
# Alphabay feedback: read the needed columns and align their names with the
# listings frame so both can be merged on ['title', 'vendor'].
# read_csv ignores the element order of `usecols` (columns come back in file
# order), so the columns are renamed by name rather than by position; a
# positional assignment would silently mislabel columns if the file order
# differed from the list. 'reciever' [sic] is the spelling used in the file.
alpha_feedback_df = pd.read_csv(
    "../../data/non-anonymous/alphabay/feedbacks.csv",
    error_bad_lines=False,
    lineterminator='\n',
    usecols=['reciever', 'message', 'order_title', 'order_amount_usd'],
)
alpha_feedback_df = alpha_feedback_df.rename(
    columns={'reciever': 'vendor', 'order_title': 'title'}
)

In [13]:
# Inner-join Alphabay listings with their feedback on the (title, vendor) pair.
alphabay_df = pd.merge(
    alpha_listing_df,
    alpha_feedback_df,
    on=['title', 'vendor'],
    how='inner',
)

In [57]:
# Distinct values of the 'prediction' column in the merged Alphabay frame.
alphabay_df['prediction'].unique()

array(['Cannabis', 'Stimulants', 'Ecstasy', 'Opioids', 'Benzos',
       'Digital Goods', 'Misc', 'Other', 'Psychedelics', 'Dissociatives',
       'Prescription'], dtype=object)

In [6]:
# Persist the merged Alphabay frame as CSV (to_csv defaults are used).
# NOTE(review): the default also writes the row index as an unnamed first
# column — presumably the origin of the "Unnamed: 0" column visible in later
# outputs; confirm before passing index=False, since readers may rely on it.
alphabay_df.to_csv("../../data/non-anonymous/alphabay/alphabay.csv")

In [7]:
# (rows, columns) of the merged Alphabay frame.
alphabay_df.shape

(1771258, 11)

Dream (Valhalla) Dataset

In [14]:
# Dream listings: read only the columns used by the analysis below.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases
# (replaced by `on_bad_lines`) — confirm the pinned pandas version.
dreams_listing_df = pd.read_csv(
    "../../data/non-anonymous/dream/items.csv",
    usecols=[
        'title', 'vendor', 'first_observed', 'last_observed', 'prediction',
        'total_sales', 'ships_to', 'ships_from', 'description',
    ],
    lineterminator='\n',
    error_bad_lines=False,
)

In [15]:
# Dream feedback: read the needed columns and align their names with the
# listings frame so both can be merged on ['title', 'vendor'].
# read_csv ignores the element order of `usecols` (columns come back in file
# order), so the columns are renamed by name rather than by position; a
# positional assignment would silently mislabel columns if the file order
# differed from the list. 'reciever' [sic] is the spelling used in the file.
dreams_feedback_df = pd.read_csv(
    "../../data/non-anonymous/dream/feedbacks.csv",
    error_bad_lines=False,
    lineterminator='\n',
    usecols=['reciever', 'message', 'order_title', 'order_amount_usd'],
)
dreams_feedback_df = dreams_feedback_df.rename(
    columns={'reciever': 'vendor', 'order_title': 'title'}
)

In [16]:
# Inner-join Dream feedback (left) with Dream listings (right) on (title, vendor).
dreams_df = pd.merge(
    dreams_feedback_df,
    dreams_listing_df,
    on=['title', 'vendor'],
    how='inner',
)

In [11]:
# Persist the merged Dream frame as CSV (to_csv defaults are used).
# NOTE(review): the file is written under the *alphabay* data directory even
# though the inputs were read from .../dream/ — looks like a copy-paste
# leftover; confirm against whatever later loads valhalla.csv before moving it.
dreams_df.to_csv("../../data/non-anonymous/alphabay/valhalla.csv")

In [12]:
# (rows, columns) of the merged Dream frame.
dreams_df.shape

(1816855, 11)

Silk Road

In [9]:
# Silk Road listings: read only the columns used by the analysis below.
# The raw Silk Road column names differ from the other two markets; they are
# mapped onto the shared names right after this read.
silk_listing_df = pd.read_csv("../../data/non-anonymous/silk-road/items.csv", error_bad_lines=False, 
                      lineterminator='\n', usecols=['title', 'seller_id', 'category', 'ship_to', 'ship_from', 'listing_description', 'price_btc'])
# NOTE(review): read_csv returns columns in file order, not in `usecols` order,
# so a positional rename in the usecols order would mislabel the columns.

In [10]:
# Map the Silk Road column names onto the names used by the Alphabay/Dream
# frames. Renaming by name (instead of assigning a positional list) keeps the
# mapping correct regardless of the column order in the CSV and makes the
# cell safe to re-run.
silk_listing_df = silk_listing_df.rename(columns={
    'seller_id': 'vendor',
    'category': 'prediction',
    'ship_to': 'ships_to',
    'ship_from': 'ships_from',
    'listing_description': 'description',
})

In [11]:
# (rows, columns) of the Silk Road listings frame.
silk_listing_df.shape

(1109155, 7)

In [11]:
# Silk Road feedback: read the needed columns and align their names with the
# listings frame so both can be merged on ['title', 'vendor'].
# read_csv ignores the element order of `usecols` (columns come back in file
# order), so the columns are renamed by name rather than by position; a
# positional assignment would silently mislabel columns if the file order
# differed from the list. 'reciever' [sic] is the spelling used in the file.
silk_feedback_df = pd.read_csv(
    "../../data/non-anonymous/silk-road/feedbacks.csv",
    error_bad_lines=False,
    lineterminator='\n',
    usecols=['reciever', 'message', 'order_title'],
)
silk_feedback_df = silk_feedback_df.rename(
    columns={'reciever': 'vendor', 'order_title': 'title'}
)

In [17]:
# (rows, columns) of the Silk Road feedback frame.
silk_feedback_df.shape

(10581507, 3)

In [None]:
# Inner-join Silk Road feedback (left) with listings (right) on (title, vendor).
silk_df = pd.merge(
    silk_feedback_df,
    silk_listing_df,
    on=['title', 'vendor'],
    how='inner',
)

# Loading processed datasets to generate insights

In [None]:
# Make sure the output directory for the generated plots exists.
# `os` is already imported in the notebook's import cell; exist_ok=True
# replaces the check-then-create pattern and makes the cell safe to re-run.
os.makedirs("../../plots/non-anonymous", exist_ok=True)

Alphabay insights

In [None]:
# Pie chart of Alphabay categories. The counts are hard-coded here rather
# than derived from a frame in this cell.
alphabay_category_counts = {
    'Cannabis': 25968,
    'Stimulants': 15323,
    'Ecstasy': 9468,
    'Opioids': 6911,
    'Digital Goods': 17568,
    'Benzos': 5194,
    'Misc': 3495,
    'Other': 5388,
    'Psychedelics': 6727,
    'Dissociatives': 1982,
    'Prescription': 4001,
}
alphabay_pie = go.Pie(
    labels=list(alphabay_category_counts.keys()),
    values=list(alphabay_category_counts.values()),
)
fig = go.Figure(data=[alphabay_pie])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
alphabay_plot_path = os.path.join("../../plots/non-anonymous", "alphabay.pdf")
plotly.offline.plot(fig, filename=alphabay_plot_path, auto_open=False)

Valhalla insights

In [None]:
# Number of rows per 'prediction' value in the merged Dream frame.
dream_stats = Counter(dreams_df['prediction'])

In [None]:
# Pie chart of the per-category row counts collected in `dream_stats`.
valhalla_pie = go.Pie(
    labels=list(dream_stats.keys()),
    values=list(dream_stats.values()),
)
fig = go.Figure(data=[valhalla_pie])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
valhalla_plot_path = os.path.join("../../plots/non-anonymous", "valhalla.pdf")
plotly.offline.plot(fig, filename=valhalla_plot_path, auto_open=False)

Silk Road insights

In [21]:
# The 25 most frequent 'prediction' values in the Silk Road listings,
# as a {value: count} dict ordered from most to least common.
silk_stats = dict(Counter(silk_listing_df['prediction']).most_common(25))

In [22]:
# Pie chart of the category counts held in `silk_stats`.
# The recorded output shows the file is saved as 'silk.pdf.html'.
silk_pie = go.Pie(
    labels=list(silk_stats.keys()),
    values=list(silk_stats.values()),
)
fig = go.Figure(data=[silk_pie])
# fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=20, marker=dict(line=dict(color='#000000', width=2)))
silk_plot_path = os.path.join("../../plots/non-anonymous", "silk.pdf")
plotly.offline.plot(fig, filename=silk_plot_path, auto_open=False)

'../../plots/non-anonymous/silk.pdf.html'

# Looking into the vendor stats

In [24]:
# Number of distinct vendor names per market.
# Alphabay and Dream are counted on the merged listing+feedback frames,
# Silk Road on the listings frame only.
alpha_vendors = alphabay_df['vendor'].nunique()
dream_vendors = dreams_df['vendor'].nunique()
silk_vendors = silk_listing_df['vendor'].nunique()

In [25]:
# Bar-chart inputs: market labels and the matching distinct-vendor counts.
# The third label previously read "Silk Board" (typo for the Silk Road market).
x = ["Alphabay", "Dreams", "Silk Road"]
y = [alpha_vendors, dream_vendors, silk_vendors]

In [28]:
# Bar chart of distinct vendors per market.
# The recorded output shows the file is saved as 'no_of_vendors.pdf.html'.
fig = go.Figure(data=[go.Bar(x=x, y=y)])
fig.update_layout(
    barmode="relative",
    xaxis_title="Markets",
    yaxis_title="No. of Vendors",
)
vendors_plot_path = os.path.join("../../plots/non-anonymous", "no_of_vendors.pdf")
plotly.offline.plot(fig, filename=vendors_plot_path, auto_open=False)

'../../plots/non-anonymous/no_of_vendors.pdf.html'

# Sybils in the Alphabay and Dreams datasets

In [17]:
# Vendor names that occur in both Alphabay and Dream. The intersection is
# taken on the raw (case-sensitive) names; the result is lowercased afterwards.
ad_sybils = [
    str(sybil).lower()
    for sybil in set(alphabay_df['vendor'].unique()) & set(dreams_df['vendor'].unique())
]
# Stack both markets, drop the feedback text and remove exact duplicate rows.
alpha_dreams = (
    pd.concat([alphabay_df, dreams_df], axis=0)
    .drop(columns=['message'])
    .drop_duplicates()
)

In [18]:
# Lowercased vendor name for every row of the combined Alphabay/Dream frame;
# any vendor that is not a sybil is collapsed into the 'others' bucket.
# Membership is tested against a set: `ad_sybils` is a list, and scanning it
# once per row is needlessly quadratic on frames with millions of rows.
ad_sybil_lookup = set(ad_sybils)
vendor_list = [
    name if name in ad_sybil_lookup else 'others'
    for name in (str(vendor).lower() for vendor in alpha_dreams['vendor'])
]

In [19]:
# Rows per sybil vendor, most frequent first.
# The 'others' bucket is removed; like the plain-dict `del`, this raises
# KeyError if no row was bucketed as 'others'.
ad_vendor_counts = dict(Counter(vendor_list))
del ad_vendor_counts['others']
alpha_dreams_stats = dict(Counter(ad_vendor_counts).most_common())

In [None]:
# Bar chart: number of rows per sybil vendor across Alphabay and Dream.
ad_bar = go.Bar(
    x=tuple(alpha_dreams_stats.keys()),
    y=list(alpha_dreams_stats.values()),
)
fig = go.Figure(data=[ad_bar])
fig.update_layout(
    barmode="relative",
    xaxis_title="Vendors",
    yaxis_title="No. of advertisements",
    title="Number of Sybils found: " + str(len(ad_sybils)),
)
ad_plot_path = os.path.join("../../plots/non-anonymous", "alpha_dreams_sybils.pdf")
plotly.offline.plot(fig, filename=ad_plot_path, auto_open=False)

# Sybils in the Alphabay and Silk Road datasets

In [20]:
# Alphabay frame without the columns dropped here, so it can be stacked with
# the Silk Road listings. `drop` returns a new frame; `alphabay_df` itself is
# left untouched.
alphabay_df_temp = alphabay_df.drop(
    columns=['first_observed', 'last_observed', 'message', 'total_sales']
)

In [21]:
# Convert the Silk Road BTC prices into the 'order_amount_usd' column used by
# the other markets.
# 57.95 is the factor the notebook applies to price_btc; where the rate comes
# from is not recorded here — TODO document the source/date of the rate.
BTC_TO_USD_RATE = 57.95

# Guarded so the cell is idempotent: previously a second run failed with a
# KeyError because 'price_btc' had already been renamed.
if 'price_btc' in silk_listing_df.columns:
    silk_listing_df = (
        silk_listing_df
        .assign(price_btc=lambda frame: frame['price_btc'] * BTC_TO_USD_RATE)
        .rename(columns={'price_btc': 'order_amount_usd'})
    )

NameError: name 'silk_listing_df' is not defined

In [22]:
# Stack the trimmed Alphabay frame on top of the Silk Road listings and
# remove exact duplicate rows.
alpha_silk = pd.concat(
    [alphabay_df_temp, silk_listing_df],
    axis=0,
).drop_duplicates()

NameError: name 'silk_listing_df' is not defined

In [94]:
# Vendor names that occur in both Alphabay and Silk Road. The intersection is
# taken on the raw (case-sensitive) names; the result is lowercased afterwards.
as_sybils = [
    str(sybil).lower()
    for sybil in set(alphabay_df['vendor'].unique()) & set(silk_listing_df['vendor'].unique())
]

In [95]:
# Lowercased vendor name for every row of the combined Alphabay/Silk Road
# frame; any vendor that is not a sybil is collapsed into 'others'.
# The loop previously tested against `sybils`, a name that is not defined
# anywhere in the visible notebook (it only worked through leftover kernel
# state); the Alphabay/Silk Road sybils live in `as_sybils`.
# A set is used because membership is tested once per row.
as_sybil_lookup = set(as_sybils)
vendor_list = [
    name if name in as_sybil_lookup else 'others'
    for name in (str(vendor).lower() for vendor in alpha_silk['vendor'])
]

In [96]:
# Rows per sybil vendor, most frequent first.
# The 'others' bucket is removed; like the plain-dict `del`, this raises
# KeyError if no row was bucketed as 'others'.
as_vendor_counts = dict(Counter(vendor_list))
del as_vendor_counts['others']
alpha_silk_stats = dict(Counter(as_vendor_counts).most_common())

In [97]:
# Bar chart: number of rows per sybil vendor across Alphabay and Silk Road.
# The recorded output shows the file is saved as 'alpha_silk_sybils.pdf.html'.
as_bar = go.Bar(
    x=tuple(alpha_silk_stats.keys()),
    y=list(alpha_silk_stats.values()),
)
fig = go.Figure(data=[as_bar])
fig.update_layout(
    barmode="relative",
    xaxis_title="Vendors",
    yaxis_title="No. of advertisements",
    title="Number of Sybils found: " + str(len(as_sybils)),
)
as_plot_path = os.path.join("../../plots/non-anonymous", "alpha_silk_sybils.pdf")
plotly.offline.plot(fig, filename=as_plot_path, auto_open=False)

'../../plots/non-anonymous/alpha_silk_sybils.pdf.html'

# Sybils in the Dreams and Silk Road datasets

In [14]:
# Dream frame without the columns dropped here, so it can be stacked with the
# Silk Road listings. `drop` returns a new frame; `dreams_df` itself is left
# untouched.
dreams_df_temp = dreams_df.drop(
    columns=['first_observed', 'last_observed', 'message', 'total_sales']
)

In [15]:
# Stack the trimmed Dream frame on top of the Silk Road listings and remove
# exact duplicate rows.
dreams_silk = pd.concat(
    [dreams_df_temp, silk_listing_df],
    axis=0,
).drop_duplicates()

In [19]:
# Vendor names that occur in both Dream and Silk Road. The intersection is
# taken on the raw (case-sensitive) names; the result is lowercased afterwards.
ds_sybils = [
    str(sybil).lower()
    for sybil in set(dreams_df['vendor'].unique()) & set(silk_listing_df['vendor'].unique())
]

In [100]:
# Lowercased vendor name for every row of the combined Dream/Silk Road frame;
# any vendor that is not a sybil is collapsed into 'others'.
# The loop previously tested against `sybils`, a name that is not defined
# anywhere in the visible notebook (it only worked through leftover kernel
# state); the Dream/Silk Road sybils live in `ds_sybils`.
# A set is used because membership is tested once per row.
ds_sybil_lookup = set(ds_sybils)
vendor_list = [
    name if name in ds_sybil_lookup else 'others'
    for name in (str(vendor).lower() for vendor in dreams_silk['vendor'])
]

In [101]:
# Rows per sybil vendor, most frequent first.
# The 'others' bucket is removed; like the plain-dict `del`, this raises
# KeyError if no row was bucketed as 'others'.
ds_vendor_counts = dict(Counter(vendor_list))
del ds_vendor_counts['others']
dreams_silk_stats = dict(Counter(ds_vendor_counts).most_common())

In [103]:
# Bar chart: number of rows per sybil vendor across Dream and Silk Road.
# The bars previously plotted `alpha_silk_stats` (copy-paste from the
# Alphabay/Silk Road section), so the figure saved as dreams_silk_sybils
# showed the wrong market pair; it now uses `dreams_silk_stats`.
fig = go.Figure()
fig.add_bar(
    x=tuple(dreams_silk_stats.keys()),
    y=list(dreams_silk_stats.values()),
)
fig.update_layout(
    barmode="relative",
    xaxis_title="Vendors",
    yaxis_title="No. of advertisements",
    title="Number of Sybils found: " + str(len(ds_sybils)),
)
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "dreams_silk_sybils.pdf"), auto_open=False)

'../../plots/non-anonymous/dreams_silk_sybils.pdf.html'

# Checking the title and description stats in each of the 3 sybil datasets

Alpha-Dreams dataset

In [17]:
# Split every lowercased title / description of the combined Alphabay/Dream
# frame on single spaces. Despite the *_len names, both variables hold lists
# of tokens at this point; the lengths are taken in the following cell.
title_len = [
    str(title).lower().split(" ")
    for title in alpha_dreams['title']
]
description_len = [
    str(description).lower().split(" ")
    for description in alpha_dreams['description']
]

In [18]:
# Replace each token list by its number of tokens.
title_len = list(map(len, title_len))
description_len = list(map(len, description_len))

In [None]:
# Density curve (histogram disabled) of the title token counts.
# A distplot puts the measured value on the x axis and its density on the
# y axis, so the axis titles are corrected: they previously read
# "Titles" / "Length", which labelled the density axis as a length.
fig = ff.create_distplot([title_len], ['title'], show_hist=False, colors=['slategray'])
fig.update_layout(
    xaxis_title="Title length (tokens)",
    yaxis_title="Density",
    title="Title distribution with an average length of " + str(np.mean(title_len)),
)
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_dream_title.pdf"), auto_open=False)

In [None]:
# Density curve (histogram disabled) of the description token counts.
# A distplot puts the measured value on the x axis and its density on the
# y axis, so the axis titles are corrected: they previously read
# "Descriptions" / "Length", which labelled the density axis as a length.
fig = ff.create_distplot([description_len], ['description'], show_hist=False, colors=['slategray'])
fig.update_layout(
    xaxis_title="Description length (tokens)",
    yaxis_title="Density",
    title="Description distribution with an average length of " + str(np.mean(description_len)),
)
plotly.offline.plot(fig, filename = os.path.join("../../plots/non-anonymous", "alpha_dream_description.pdf"), auto_open=False)

In [17]:
# Rank the Alphabay/Dream sybils by row count once, then take both ends.
ad_sybils_ranked = Counter(alpha_dreams_stats).most_common()
most_active_user = ad_sybils_ranked[0][0]
least_active_user = ad_sybils_ranked[-1][0]
# Hand-picked vendor; how it was selected is not shown in this notebook.
moderately_active_user = 'thecheekygirls1'

Most active user

In [50]:
def clean_data(text):
    """Replace decorative symbols with tokens and pad line-break characters.

    "♕" becomes ":kingPiece: ", "★" becomes ":star: ", and every carriage
    return / newline is surrounded by single spaces. The substitutions are
    applied in that order and the resulting string is returned.
    """
    substitutions = (
        ("♕", ":kingPiece: "),
        ("★", ":star: "),
        ("\r", " \r "),
        ("\n", " \n "),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text

In [None]:
# Lowercase the vendor, title and description columns of the Alphabay frame
# (in place). Values are passed through str() first, so missing values become
# the literal string 'nan'.
for text_column in ('vendor', 'title', 'description'):
    alphabay_df[text_column] = alphabay_df[text_column].map(
        lambda value: str(value).lower()
    )

In [None]:
# Alphabay rows of the most active sybil, with emojis turned into text codes
# (emoji.demojize) and symbols / line breaks normalised (clean_data).
# .copy() is taken because the columns are overwritten below: assigning into
# a boolean-filtered slice is chained assignment, which pandas flags with
# SettingWithCopyWarning (hidden here by the global warnings filter) and
# which is not guaranteed to behave consistently.
most_active_alphabay_df = alphabay_df[alphabay_df['vendor'] == most_active_user].copy()
for text_column in ('title', 'description'):
    most_active_alphabay_df[text_column] = most_active_alphabay_df[text_column].apply(
        lambda text: clean_data(emoji.demojize(text))
    )

In [52]:
# Dream rows of the most active sybil, with emojis turned into text codes
# (emoji.demojize) and symbols / line breaks normalised (clean_data).
# `most_active_user` is a lowercased name, while the Dream vendor column is
# not lowercased anywhere in the visible cells, so the comparison is made
# case-insensitively; an exact match would miss mixed-case vendor names.
dreams_vendor_lower = dreams_df['vendor'].map(lambda vendor: str(vendor).lower())
# .copy() avoids assigning into a filtered slice (chained assignment).
most_active_dreams_df = dreams_df[dreams_vendor_lower == most_active_user].copy()
for text_column in ('title', 'description'):
    # str() keeps missing values (NaN floats) from crashing demojize.
    most_active_dreams_df[text_column] = most_active_dreams_df[text_column].apply(
        lambda text: clean_data(emoji.demojize(str(text)))
    )
# NOTE(review): unlike the Alphabay frame, title/description are not
# lowercased here — confirm whether they should be for downstream comparison.

In [53]:
# Display the Alphabay rows of the most active sybil.
# The recorded output shows only a header row, i.e. no matching rows.
most_active_alphabay_df

Unnamed: 0,title,vendor,total_sales,first_observed,last_observed,prediction,ships_to,ships_from,description,message,order_amount_usd


In [49]:
# NOTE(review): `most_active_df` is not defined in any visible cell — this
# looks like leftover kernel state (possibly an earlier name for one of the
# most_active_* frames); confirm, then rename or remove this cell.
most_active_df

Unnamed: 0,title,vendor,total_sales,first_observed,last_observed,prediction,ships_to,ships_from,description,order_amount_usd
47964,:kingPiece: :kingPiece: :kingPiece: :star: :s...,theshop,2.49,2016-08-01,2016-08-01,Digital Goods,Worldwide,Worldwide,buy a hulu account \rjust for a few bucks!!!!!...,2.49
181336,:kingPiece: :kingPiece: :kingPiece: :star: :s...,theshop,109.78,2016-10-23,2017-03-27,Digital Goods,Worldwide,Worldwide,:kingPiece: :kingPiece: :kingPiece: :star: :s...,4.99
213251,:kingPiece: :kingPiece: :kingPiece: :star: :s...,theshop,4.99,2016-07-24,2016-07-24,Digital Goods,Worldwide,Worldwide,buy a netflix account with the 4screens+ultrah...,4.99
286872,:kingPiece: :kingPiece: :kingPiece: :star: :s...,theshop,35.88,2016-10-15,2016-11-24,Digital Goods,Worldwide,Worldwide,:kingPiece: :kingPiece: :kingPiece: :star: :s...,2.99
358921,:kingPiece: zenmate vpn premium account (auto...,theshop,49.90,2016-12-05,2017-05-12,Digital Goods,Worldwide,Worldwide,:kingPiece: :kingPiece: :kingPiece: :star: :s...,4.99
...,...,...,...,...,...,...,...,...,...,...
1816395,:kingPiece: cvv cashout via btc || credit car...,theshop,2.41,2018-05-07,2018-05-07,Misc,,,(censored due to possible presence of pii),2.41
1816433,:star: cc cashout ebay/amazon || triangulatio...,theshop,2.42,2018-05-13,2018-05-13,Digital Goods,,,(censored due to possible presence of pii),2.42
1816588,":star: socialbomber || earn 1000+ followers, ...",theshop,4.86,2018-06-04,2018-08-08,Digital Goods,,,(censored due to possible presence of pii),2.43
1816614,:kingPiece: cc cashout - the money circle :ki...,theshop,2.42,2018-06-08,2018-06-08,Misc,,,(censored due to possible presence of pii),2.42


In [43]:
# NOTE(review): `tweet` is not defined in any visible cell — this looks like a
# leftover scratch experiment with emoji.demojize; it raises NameError on a
# fresh kernel. Confirm and remove, or define the sample text in this cell.
emoji.demojize(tweet)

'#startspreadingthenews yankees win great start by :Santa_Claus_medium-dark_skin_tone: going 5strong innings with 5k’s:fire: :ox:\nsolo homerun :volcano::volcano: with 2 solo homeruns and:ogre: 3run homerun… :clown_face: :person_rowing_boat_medium-light_skin_tone: :man_judge_medium_skin_tone: with rbi’s … :fire::fire:\n:Mexico: and :Nicaragua: to close the game:fire::fire:!!!….\nWHAT A GAME!!..\n'

Alpha-Silk dataset

In [14]:
# Token counts (split on single spaces) of every title / description in the
# combined Alphabay/Silk Road frame.
title_len = [
    len(str(title).lower().split(" "))
    for title in alpha_silk['title']
]
description_len = [
    len(str(description).lower().split(" "))
    for description in alpha_silk['description']
]