## Process and Label Dataset

Here, we take as input a collected `messages.csv`, load it into a DataFrame, and iterate through the users/messages, displaying them (grouped by user) so they can be manually labelled as spam or not spam and output into `dataset.csv`.

In [15]:
import pandas as pd
import numpy as np
from ipywidgets import Button, HBox, VBox
from IPython.display import display, clear_output
from copy import copy

In [14]:
# Read in original CSV
df = pd.read_csv('messages.csv',encoding = 'latin-1', names=['user', 'message'])
# Remove rows with empty user, indicating non-public message, just in case
df = df[df['user'] != '']
# Add boolean spam field
df['spam'] = pd.Series(np.zeros(len(df.index)), dtype=bool)
# Remove all messages that contain nbsp, which are duplicates from re-logging
df = df[~df['message'].str.encode('UTF-8').apply(lambda x: b'\xa0' in x)]
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,user,message,spam
0,im kurupted,G11111,False
1,Arbuckleee,Hurry up before all of the gold is given away !!!,False
2,Marbellizer,"Well hard to say for exact lvls, its just a bi...",False
3,Arbuckleee,"Massive 2.1b giveaway happening right now, sea...",False
4,Marbellizer,Dps usually but if you wanna see exact numers ...,False
5,Arbuckleee,Trade me after posting your nickname on the fo...,False
6,Marbellizer,Spreadsheets,False
7,Arbuckleee,Search Munk23 on Faceb00k to participate in a ...,False
8,Marbellizer,And existing metas for bosses,False
9,xLouis,...,False


In [39]:
# Tag users by spam/no spam

def spambutton_cb(user, spam):
    def f(_):
        df.loc[df['user'] == user, 'spam'] = spam
    
    return f

def display_user():
    for user, messages in df.groupby('user'):
        notspam = Button(
            description='Not Spam',
            disabled=False,
            button_style='success',
            icon='check'
        )
        spam = Button(
            description='Spam',
            disabled=False,
            button_style='danger',
            icon='times'
        )
        notspam.on_click(spambutton_cb(user, False))
        spam.on_click(spambutton_cb(user, True))

        display(messages.drop_duplicates('message').head(5)[['user', 'message']])
        display(HBox([copy(notspam), copy(spam)]))
        yield

gen = display_user()

def display_user_cluster(_):
    clear_output()
    next_cluster = Button(
            description='Next',
            disabled=False,
            button_style='info',
    )
    next_cluster.on_click(display_user_cluster)
    
    for i in range(10):
        try:
            next(gen)
        except StopIteration:
            return
    display(next_cluster)

display_user_cluster(None)

Unnamed: 0,user,message
28815,zac1,Cispy
28836,zac1,Ok
28901,zac1,Your fake exacly my point


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
11509,zamrkfiesta,Yee
11535,zamrkfiesta,Blue sucks
11639,zamrkfiesta,Lol Xd


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
24478,zanabu,Noty


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
20704,zeuhs1980,Kkkkk


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
21024,zez30,Can any1 spare some gold please so i can train...
21061,zez30,Can any1 spare some gold please


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
6129,z 1 n k,110k
6133,z 1 n k,110k addy tip


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

In [53]:
df.to_csv('dataset.csv')

In [52]:
# Further look at individual spam messages to verify they are actually spam

def spambutton_cb(index, spam):
    def f(_):
        df.iloc[index]['spam'] = spam
    
    return f

def display_msg():
    for index, row in df[df['spam']].drop_duplicates('message').iterrows():
        notspam = Button(
            description='Not Spam',
            disabled=False,
            button_style='success',
            icon='check'
        )
        spam = Button(
            description='Spam',
            disabled=False,
            button_style='danger',
            icon='times'
        )
        notspam.on_click(spambutton_cb(index, False))
        spam.on_click(spambutton_cb(index, True))

        display(row[['user', 'message']])
        display(HBox([copy(notspam), copy(spam)]))
        yield

gen = display_msg()

def display_msg_cluster(_):
    clear_output()
    next_cluster = Button(
            description='Next',
            disabled=False,
            button_style='info',
    )
    next_cluster.on_click(display_msg_cluster)
    
    for i in range(50):
        try:
            next(gen)
        except StopIteration:
            return
    display(next_cluster)

display_msg_cluster(None)

user                                        0gravecover
message    150K paid! Try 500K, now you can trust me :)
Name: 36194, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                        0gravecover
message    225K paid! Try 500K, now you can trust me :)
Name: 36258, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                            Soonpown2808
message    [Soonpown2808] | 52 to 100 | 100K to 246M | Ea...
Name: 36292, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             0gravecover
message    Tripling money | Legit | 50K min 500K max | 2x...
Name: 36927, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             0gravecover
message    Tripling money | Legit | 50K min 500K max | 2x...
Name: 36932, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                                RNG 1953
message    Trade accepted for Phorcyz amount:201K [20:11:07]
Name: 36972, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             RNG 1953
message    Phorcyz has lost with a roll of [6] [20:11:11]
Name: 36974, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

In [43]:
for message in df.iterrows():
    print(message)

(0, user       im kurupted
message         G11111
spam             False
Name: 0, dtype: object)
(1, user                                              Arbuckleee
message    Hurry up before all of the gold is given away !!!
spam                                                    True
Name: 1, dtype: object)
(2, user                                             Marbellizer
message    Well hard to say for exact lvls, its just a bi...
spam                                                   False
Name: 2, dtype: object)
(3, user                                              Arbuckleee
message    Massive 2.1b giveaway happening right now, sea...
spam                                                    True
Name: 3, dtype: object)
(4, user                                             Marbellizer
message    Dps usually but if you wanna see exact numers ...
spam                                                   False
Name: 4, dtype: object)
(5, user                                              Arbuc

(232, user                                            57 Fire 1542
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 232, dtype: object)
(233, user                                            57 Fire 1542
message     Want to TRANSFER your RS3/07/DMM gp...
spam                                                    True
Name: 233, dtype: object)
(234, user                                            57 Fire 1542
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 234, dtype: object)
(235, user       ThyrtySeven
message           Jags
spam             False
Name: 235, dtype: object)
(236, user                                            57 Fire 1542
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 236, dtype: object)
(237, user                             

Name: 467, dtype: object)
(468, user       ImTheCosmos
message           Okay
spam             False
Name: 468, dtype: object)
(469, user                                            Rushpearl680
message     Want to TRANSFER your RS3/07/DMM gp...
spam                                                    True
Name: 469, dtype: object)
(470, user       ImTheCosmos
message           Why?
spam             False
Name: 470, dtype: object)
(471, user       Popular Goat
message       For loand
spam              False
Name: 471, dtype: object)
(472, user       Popular Goat
message           Loans
spam              False
Name: 472, dtype: object)
(473, user                                            Rushpearl680
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 473, dtype: object)
(474, user       ImTheCosmos
message    For a loan?
spam             False
Name: 474, dtype: object)
(475, user       Popular Goat
mes

Name: 713, dtype: object)
(714, user       70sacred1270
message        How mach
spam              False
Name: 714, dtype: object)
(715, user       Deus Aridam
message            Omg
spam             False
Name: 715, dtype: object)
(716, user        Deus Aridam
message    Tytytytytyty
spam              False
Name: 716, dtype: object)
(717, user       Yasoma2
message         Np
spam         False
Name: 717, dtype: object)
(718, user           ADIINDA
message    I neeed god
spam             False
Name: 718, dtype: object)
(719, user       ADIINDA
message       Gold
spam         False
Name: 719, dtype: object)
(720, user                    Yasoma2
message    Gave im my7 last 50m
spam                      False
Name: 720, dtype: object)
(721, user                                                fkntring
message    Giving away a few mill first come first serve ...
spam                                                   False
Name: 721, dtype: object)
(722, user                                 

Name: 878, dtype: object)
(879, user       70sacred1270
message        Ned gold
spam              False
Name: 879, dtype: object)
(880, user                                                RNG 5049
message    You only need to trade once! The current queue...
spam                                                    True
Name: 880, dtype: object)
(881, user       70sacred1270
message        Ned gold
spam              False
Name: 881, dtype: object)
(882, user                                                fkntring
message    Giving away a few mill first come first serve ...
spam                                                   False
Name: 882, dtype: object)
(883, user       naefyo
message        0k
spam        False
Name: 883, dtype: object)
(884, user                                                RNG 5049
message    You only need to trade once! The current queue...
spam                                                    True
Name: 884, dtype: object)
(885, user       70sacred1270
messa

Name: 1070, dtype: object)
(1071, user       70sacred1270
message       Take junk
spam              False
Name: 1071, dtype: object)
(1072, user                                            Rushpearl680
message     Want to TRANSFER your RS3/07/DMM gp...
spam                                                    True
Name: 1072, dtype: object)
(1073, user       70sacred1270
message       Take junk
spam              False
Name: 1073, dtype: object)
(1074, user                                                fkntring
message    Giving away a few mill first come first serve ...
spam                                                   False
Name: 1074, dtype: object)
(1075, user                                            Rushpearl680
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 1075, dtype: object)
(1076, user       70sacred1270
message       Take junk
spam              False
Name: 1076, dtype: object)
(107

Name: 1271, dtype: object)
(1272, user                       calisse7
message    Cant you just double it?
spam                          False
Name: 1272, dtype: object)
(1273, user                                                 iwDukew
message    "Grandrolling" Cc | Dice Game & Trusted Ranks ...
spam                                                    True
Name: 1273, dtype: object)
(1274, user               Valex Sombra
message    Cheaper alternative?
spam                      False
Name: 1274, dtype: object)
(1275, user         PistolSalad
message    Is it ur all?
spam               False
Name: 1275, dtype: object)
(1276, user                                            Huntriyl1212
message    R s m a l l s, cih 0m   se11  g01d & acc  1000...
spam                                                    True
Name: 1276, dtype: object)
(1277, user                                 at 14
message    Doubling gold or items 749m lft
spam                                 False
Name: 1277, dtype: obj

(1436, user                                 Join Teh Cc
message    Official 4Chan Clan Chat '07 Lads' Cc
spam                                        True
Name: 1436, dtype: object)
(1437, user                                               RYELLKING
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 1437, dtype: object)
(1438, user          calisse7
message    Gimme my 4m
spam             False
Name: 1438, dtype: object)
(1439, user                                                 iwDukew
message    "Grandrolling" Cc | Dice Game & Trusted Ranks ...
spam                                                    True
Name: 1439, dtype: object)
(1440, user                            Join Teh Cc
message    Weebs Welcome '"07 Lads" Cc  uwu
spam                                   True
Name: 1440, dtype: object)
(1441, user                                               RYELLKING
message     Want to SWAP your RS3/07/ DMM 

Name: 1688, dtype: object)
(1689, user                       Pleblord3000
message    That is a lot of cabbages :D
spam                              False
Name: 1689, dtype: object)
(1690, user                                                X2 Naive
message    [Automated][54-100]X2 [100K-250M] @X2 Naive 19...
spam                                                    True
Name: 1690, dtype: object)
(1691, user                                                X2 Naive
message    [Automated][54-100]X2 [100K-250M] @X2 Naive 19...
spam                                                    True
Name: 1691, dtype: object)
(1692, user       JoeBigCalves
message      Seems fair
spam              False
Name: 1692, dtype: object)
(1693, user                                                X2 Naive
message    [Automated][54-100]X2 [100K-250M] @X2 Naive 19...
spam                                                    True
Name: 1693, dtype: object)
(1694, user                                            Fablefr

Name: 1879, dtype: object)
(1880, user       JoeBigCalves
message         Rate it
spam              False
Name: 1880, dtype: object)
(1881, user                                            Fablefrog169
message    R s m a l l s, cih 0m   se11  g01d & acc  1000...
spam                                                    True
Name: 1881, dtype: object)
(1882, user                                               RNG 5006
message    Trade accepted for Thinkr amount:100K [20:09:06]
spam                                                   True
Name: 1882, dtype: object)
(1883, user                                             RNG 5006
message    Thinkr has lost with a roll of [12] [20:09:10]
spam                                                 True
Name: 1883, dtype: object)
(1884, user       Thinkr
message       Uck
spam        False
Name: 1884, dtype: object)
(1885, user          Thinkr
message    I give uo
spam           False
Name: 1885, dtype: object)
(1886, user                                

Name: 1934, dtype: object)
(1935, user                                            57 Fire 1542
message     Join  Sr Swap , RUNESCAPE's most TR...
spam                                                    True
Name: 1935, dtype: object)
(1936, user                                   Thinkr
message    Dont clean yourself this time lool
spam                                    False
Name: 1936, dtype: object)
(1937, user                   AlexDreams
message    Lol gambles it away...
spam                        False
Name: 1937, dtype: object)
(1938, user                                            57 Fire 1542
message     Want to TRANSFER your RS3/07/DMM gp...
spam                                                    True
Name: 1938, dtype: object)
(1939, user       Thinkr
message    Ahhaha
spam        False
Name: 1939, dtype: object)
(1940, user       AlexDreams
message     Ima alch 
spam            False
Name: 1940, dtype: object)
(1941, user             Thinkr
message    G

Name: 2130, dtype: object)
(2131, user                                                X2 Radar
message    [Automated][54-100]X2 [100K-250M] @X2 Radar 21...
spam                                                    True
Name: 2131, dtype: object)
(2132, user                                                X2 Radar
message    [Automated][54-100]X2 [100K-250M] @X2 Radar 21...
spam                                                    True
Name: 2132, dtype: object)
(2133, user                                                X2 Radar
message    [Automated][54-100]X2 [100K-250M] @X2 Radar 21...
spam                                                    True
Name: 2133, dtype: object)
(2134, user                                                X2 Radar
message    [Automated][54-100]X2 [100K-250M] @X2 Radar 21...
spam                                                    True
Name: 2134, dtype: object)
(2135, user                                                X2 Radar
message    [Automated][54-100]X2 [100

Name: 2369, dtype: object)
(2370, user                                            Walktheif538
message     Want to TRANSFER your RS3/07/DMM gp...
spam                                                    True
Name: 2370, dtype: object)
(2371, user                      Mathiiasbk
message    Buying 5k ruby bolts 1,7m
spam                           False
Name: 2371, dtype: object)
(2372, user                                                naomi551
message    Biggest Drop Parties Daily/Giveaways & cracker...
spam                                                    True
Name: 2372, dtype: object)
(2373, user                                           Zyra LolOp
message    Join -Swaps Best- Cc To Swap Your 07/Rs3 Gold!
spam                                                 True
Name: 2373, dtype: object)
(2374, user                        Die2374
message    I dont see a message tho
spam                          False
Name: 2374, dtype: object)
(2375, user                      Mathiiasbk


KeyboardInterrupt: 