In [1]:
import colorlover as cl
import numpy as np
import pandas as pd
import pickle

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

# Set up plotly for offline rendering inside the notebook so iplot() can draw figures inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of embedding it — confirm.
init_notebook_mode(connected=True)

In [2]:
# Load the pre-computed statistics for the spam accounts and for the whole live database.
# The paths are relative, so both files must sit in the notebook's working directory.
# NOTE(security): read_pickle unpickles arbitrary Python objects and can execute code;
# only load pickle files that come from a trusted source.
spam_stats = pd.read_pickle("spam.pickle")
livedb_stats = pd.read_pickle("livedb.pickle")

In [3]:
# Entry 0 of each stats object holds per-privilege user counts, so adding them up
# gives the total number of users in that dataset. Series.sum() is used instead of
# the built-in sum(), which would iterate the values one by one in Python.
spam_count = spam_stats[0].sum()
livedb_count = livedb_stats[0].sum()

# Everything in the live DB that is not a known spam account is treated as a real user.
# assumes the spam accounts are a subset of the live DB — TODO confirm
nonspam_count = livedb_count - spam_count

print("Total users: {} Spam users: {} Real users: {}".format(livedb_count, spam_count, nonspam_count))

Total users: 1961609 Spam users: 142689 Real users: 1818920


## User privilege

In [4]:
# Per-privilege user counts for the spam accounts (the displayed Series is named "privs").
spam_stats[0]

0    142689
Name: privs, dtype: int64

In [5]:
# Per-privilege user counts for the whole live database (spam and non-spam together).
livedb_stats[0]

0       1961282
1           210
4            42
2            19
1024          8
3             8
1028          8
17            6
33            5
16            2
953           2
57            2
259           2
41            1
32            1
937           1
49            1
9             1
6             1
257           1
289           1
528           1
1025          1
689           1
761           1
25            1
Name: privs, dtype: int64

This could be a reliable indicator of a user not being a spammer. However, there are very few editors outside of privilege 0 in total, so it probably wouldn't make sense to use this in the model.

## User area

In [6]:
# User counts per area value for the spam accounts; the NaN label counts accounts with no area set.
spam_stats[1]

NaN         142668
 222.0           6
 5056.0          2
 221.0           2
 11042.0         1
 228.0           1
 10193.0         1
 5065.0          1
 5058.0          1
 7020.0          1
 38.0            1
 9622.0          1
 1178.0          1
 9413.0          1
 57.0            1
Name: area, dtype: int64

In [7]:
# User counts per area value for the whole live database; the NaN label counts accounts with no area set.
livedb_stats[1]

NaN         1925968
 222.0         4239
 221.0         2813
 1178.0        1676
 7020.0         858
 81.0           631
 99.0           577
 73.0           526
 38.0           445
 13.0           370
 7703.0         299
 30.0           294
 11042.0        276
 105.0          265
 100.0          255
 85266.0        226
 194.0          218
 150.0          206
 3925.0         203
 5121.0         187
 266.0          157
 162.0          154
 3912.0         154
 5092.0         151
 3821.0         149
 432.0          149
 5099.0         148
 176.0          141
 84443.0        137
 103.0          135
             ...   
 304.0           66
 5194.0          66
 7279.0          65
 9413.0          65
 5212.0          63
 4563.0          62
 10861.0         61
 5090.0          61
 168.0           61
 87978.0         60
 5065.0          60
 14.0            60
 5089.0          59
 434.0           57
 9622.0          57
 7295.0          56
 72.0            56
 160.0           55
 84.0            55


In [8]:
# Share of accounts with an area set = 1 - (accounts whose area is NaN / all accounts).
spam_area_missing = spam_stats[1].loc[np.nan]
spam_area_set = 1 - (spam_area_missing / spam_count)

# Non-spam counts are obtained by subtracting the spam counts from the live DB counts.
# Only the NaN label is read afterwards, so labels missing from the spam data do not matter here.
nonspam_area_counts = livedb_stats[1] - spam_stats[1]
nonspam_area_missing = nonspam_area_counts.loc[np.nan]
nonspam_area_set = 1 - (nonspam_area_missing / nonspam_count)

area_report = "Percentage of users with area set:\n  Spam: {:.4%}\n  Real users: {:.4%}"
print(area_report.format(spam_area_set, nonspam_area_set))

Percentage of users with area set:
  Spam: 0.0147%
  Real users: 1.9583%


A binary "area set / area not set" indicator could serve as an additional data point for the model.

## User gender

In [9]:
# User counts per gender value for the spam accounts; the NaN label counts accounts with no gender set.
spam_stats[2]

 2.0    57250
 3.0    56491
NaN     26877
 1.0     2071
Name: gender, dtype: int64

In [10]:
# Gender counts of the non-spam users: live DB counts minus spam counts, aligned on the gender label.
livedb_stats[2]-spam_stats[2]

 1.0     371053
 2.0      67046
 3.0      48622
NaN     1332199
Name: gender, dtype: int64

In [11]:
# Share of accounts with a gender set = 1 - (accounts whose gender is NaN / all accounts).
spam_gender_missing = spam_stats[2].loc[np.nan]
spam_gender_set = 1 - (spam_gender_missing / spam_count)

# Non-spam counts are the live DB counts minus the spam counts for the same label.
nonspam_gender_counts = livedb_stats[2] - spam_stats[2]
nonspam_gender_missing = nonspam_gender_counts.loc[np.nan]
nonspam_gender_set = 1 - (nonspam_gender_missing / nonspam_count)

gender_report = "Percentage of users with gender set:\n  Spam: {:.4%}\n  Real users: {:.4%}"
print(gender_report.format(spam_gender_set, nonspam_gender_set))

Percentage of users with gender set:
  Spam: 81.1639%
  Real users: 26.7588%


Surprisingly, a much larger share of spam users than of real users have their gender set (about 81% versus 27%), so this could also work as a binary indicator.

## Email domains

In [12]:
# Per-domain counts of the email addresses used by spam accounts.
# NOTE(review): later prose calls this the "top 100 spam domains", so the data is presumably
# truncated to the 100 most common domains — confirm against the code that built the pickle.
spam_edomains = spam_stats[3]
print(spam_edomains)

intervisionplc.com          35413
lexxip.com                   8102
joomlaemails.com             8072
babymails.com                7743
freemailhosts.com            7513
lovelymail.info              6266
kingdomheartmail.com         4885
wordpressmails.com           4296
abrighterfutureday.com       4073
hubspotmails.com             3947
trango.co                    3932
devsapps.net                 3548
businesskontakts.com         3323
drupalmails.com              3222
webgarden.com                2845
ibmmails.com                 2502
hotspotmails.com             2423
cfphone.net                  2276
queenofsite.online           2255
uminga.net                   2204
singtelmails.com             2149
temasekmail.com              2115
hypermailbox.com             2107
gloriousfuturedays.com       2039
kingofplace.site             1787
1st-apparelsoftware.info     1704
padita.net                   1568
deepemailbox.com             1532
lolas3.seo-linuxpl.com       1409
justbigbox.com

In [13]:
# Per-domain counts for non-spam users: live DB counts minus spam counts.
# fill_value=0 keeps domains that appear in only one of the two Series instead of turning them into NaN.
# NOTE(review): the "'<' not supported between instances of 'str' and 'float'" warning emitted here
# presumably comes from aligning an index that mixes the NaN label with string labels — confirm.
edomain_difference = livedb_stats[3].subtract(spam_stats[3], fill_value=0)

# The subtraction yields floats; cast back to integer counts and list the most common domains first.
edomain_difference = edomain_difference.astype(int)
nonspam_edomains = edomain_difference.sort_values(ascending=False)

print(nonspam_edomains)

NaN                         786919
gmail.com                   219115
outlook.com                 171983
yahoo.com                   115053
hotmail.com                  77330
mail.ru                      52383
vikilx.com                    9568
gmx.de                        9201
lexxip.com                    8404
web.de                        7617
aol.com                       5459
free.fr                       4684
gmx.net                       4475
quickenmails.com              4229
hotmail.co.uk                 4042
yahoo.co.uk                   4020
googlemail.com                3918
yahoo.fr                      3716
yandex.com                    3506
babymails.com                 3422
comcast.net                   3310
msn.com                       3099
o2.pl                         2955
mail.com                      2479
yahoo.de                      2424
hotmail.fr                    2408
                              2338
gmx.com                       2204
live.com            


'<' not supported between instances of 'str' and 'float', sort order is undefined for incomparable objects



Perhaps surprisingly, all spam users have an email address set. The domains used by real editors are mostly common mail providers, whereas the ones used by spam accounts are mostly obscure.

In [14]:
# Put the spam and non-spam counts side by side, one column each.
a = spam_edomains.to_frame(name="spam")
b = nonspam_edomains.to_frame(name="nonspam")

# The inner join on the index keeps only the domains present in both the spam and the non-spam data.
c = pd.merge(
    a,
    b,
    how="inner",
    left_index=True,
    right_index=True,
)
print(c)

                           spam  nonspam
intervisionplc.com        35413     1276
lexxip.com                 8102     8404
joomlaemails.com           8072      598
babymails.com              7743     3422
freemailhosts.com          7513     1117
lovelymail.info            6266      864
kingdomheartmail.com       4885      176
wordpressmails.com         4296      740
abrighterfutureday.com     4073      300
hubspotmails.com           3947      835
trango.co                  3932     1283
devsapps.net               3548     1338
businesskontakts.com       3323      233
drupalmails.com            3222      213
webgarden.com              2845      120
ibmmails.com               2502      462
hotspotmails.com           2423      545
cfphone.net                2276      256
queenofsite.online         2255      148
uminga.net                 2204      675
singtelmails.com           2149      337
temasekmail.com            2115      365
hypermailbox.com           2107      296
gloriousfutureda

A surprising number of "real" editors use obscure domains that are among the top 100 spam domains. For example, the first domain on this list doesn't even have a website, yet over a thousand users use it for email. Perhaps those are also spam editors who haven't been found out yet?

In [15]:
# Total number of non-spam accounts that use a domain also seen among the spam domains.
# Series.sum() replaces the built-in sum(), which would loop over the values in Python.
print("Amount of potential spam editors not yet found: {}".format(c["nonspam"].sum()))

Amount of potential spam editors not yet found: 26356


Even working with just the top 100 email domains, we can see clear differences between the domains commonly used by real users and those used by spam users. Using only this top-100 data, we were also able to find a lot of potential spam editors that have not yet been reported.

Thus, email domains seem to be a pretty solid indicator for detecting spam.

## WIP: Time between first login and last login

In [None]:
# WIP: plot how much time passed between each spam account's first and last login.
layout = go.Layout(
    title="Time between first login and last login",
    xaxis={"title": "Time in hours"},
    yaxis={"title": "User count"},
)

# NOTE(review): this cell has never been executed. go.Figure expects trace objects (or
# trace-like dicts) in `data`, so spam_stats[4] must already be such a trace — confirm.
data = [spam_stats[4]]

iplot(go.Figure(data=data, layout=layout))