## Process and Label Dataset

Here we take a collected `messagesnew.csv` as input, load it into a DataFrame, and iterate through the users and their messages, displaying them grouped by user so that each user can be manually labelled as spam or not spam. The labelled result is written to `datasetnew.csv`.

In [1]:
import pandas as pd
import numpy as np
from ipywidgets import Button, HBox, VBox
from IPython.display import display, clear_output
from copy import copy

In [2]:
# Read in original CSV (no header row, so the column names are supplied here)
messages_raw = pd.read_csv('messagesnew.csv', encoding='latin-1', names=['user', 'message', 'logintime', 'time'])
# Remove rows with empty user, indicating non-public message, just in case.
# read_csv parses empty fields as NaN rather than '', so a plain `!= ''` test
# never removes anything; check for both.
# NOTE(review): read_csv's default NA parsing also turns literal names such as
# 'NA' or 'null' into NaN -- confirm no real user is called that.
has_user = messages_raw['user'].notna() & (messages_raw['user'] != '')
# Remove all messages that contain nbsp, which are duplicates from re-logging.
# Match the character itself: testing for the raw byte b'\xa0' in UTF-8 data
# also hits unrelated characters (e.g. 'à' encodes to b'\xc3\xa0') and raises
# on missing messages.
has_nbsp = messages_raw['message'].str.contains('\xa0', regex=False, na=False)
# Add boolean spam field as a scalar so it cannot misalign with the filtered
# index, then renumber the surviving rows from 0.
df = messages_raw[has_user & ~has_nbsp].assign(spam=False).reset_index(drop=True)
df.head(10)

Unnamed: 0,user,message,logintime,time,spam
0,Rangemage11,Can u look for some junk?,0,10,False
1,Deadman12314,Im tryin to find something for them to double,0,10,False
2,Deadman12314,I gave one a cannonball earlier instead of gold,0,30,False
3,Rangemage11,Deadman,0,38,False
4,Deadman12314,Yeah,0,60,False
5,Rangemage11,Give nme some junk?,0,65,False
6,Deadman12314,No sorry,0,72,False
7,Rangemage11,Aww,0,82,False
8,Rangemage11,Can someone give me 100k?,0,110,False
9,Deadman12314,Lol he took my spotted cape,0,111,False


In [3]:
# Tag users by spam/no spam

def spambutton_cb(user, spam):
    """Build a click handler that labels every message sent by one user.

    Args:
        user: Value compared against the 'user' column of the global `df`.
        spam: Value written to the 'spam' column of the matching rows.

    Returns:
        A one-argument callable suitable for `Button.on_click`; the argument
        it receives is ignored.
    """
    def on_click(_button):
        # The row mask is computed when the button is clicked, not when the
        # handler is built.
        rows_for_user = df['user'] == user
        df.loc[rows_for_user, 'spam'] = spam

    return on_click

def display_user():
    """Create a generator that renders one user per step for manual labelling.

    Each step displays up to five distinct messages of the next user in the
    global `df` (grouped by the 'user' column), followed by a row with
    'Not Spam' and 'Spam' buttons wired to that user, and then yields None.

    The callback factory is captured when the generator is created rather
    than looked up on every step: a later cell defines another function named
    `spambutton_cb` that takes a row index instead of a user name, and a
    still-running generator would otherwise silently switch to it.

    Returns:
        A generator yielding None once per rendered user.
    """
    make_callback = spambutton_cb  # bound now, while it is the per-user version

    def _render_users():
        for user, messages in df.groupby('user'):
            notspam = Button(
                description='Not Spam',
                disabled=False,
                button_style='success',
                icon='check'
            )
            spam = Button(
                description='Spam',
                disabled=False,
                button_style='danger',
                icon='times'
            )
            notspam.on_click(make_callback(user, False))
            spam.on_click(make_callback(user, True))

            # Show a short, de-duplicated sample of what this user said.
            display(messages.drop_duplicates('message').head(5)[['user', 'message']])
            display(HBox([copy(notspam), copy(spam)]))
            yield

    return _render_users()

gen = display_user()

def display_user_cluster(_, users=gen):
    """Clear the output and render the next batch of up to 10 users.

    Used both for the initial render and as the click handler of the 'Next'
    button it displays.

    Args:
        _: Ignored (the clicked button, or None for the initial call).
        users: Generator to advance. It defaults to the generator created
            just above and is bound at definition time, because a later cell
            rebinds the module-level name `gen` to a different generator;
            reading `gen` at click time would make this pager step through
            that one instead.
    """
    clear_output()
    next_cluster = Button(
        description='Next',
        disabled=False,
        button_style='info',
    )
    next_cluster.on_click(display_user_cluster)

    for _step in range(10):
        try:
            next(users)
        except StopIteration:
            # Nothing left to show: return without offering a 'Next' button.
            return
    display(next_cluster)

display_user_cluster(None)

Unnamed: 0,user,message
1729,zanabu,Lol
1772,zanabu,Selling 4k iron ore


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
5320,zenyzs,Giving away iron mace free


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

In [4]:
# Persist the labelled frame. With to_csv's defaults the DataFrame index is
# written as the first, unnamed column.
# NOTE(review): this cell sits before the per-message verification cell, so
# corrections made there are only saved if this cell is run again afterwards.
df.to_csv('datasetnew.csv')

In [52]:
# Further look at individual spam messages to verify they are actually spam

def spambutton_cb(index, spam):
    """Build a click handler that sets the spam label of a single row.

    Args:
        index: Index label of the row in the global `df` (as produced by
            `iterrows`).
        spam: Value written to that row's 'spam' field.

    Returns:
        A one-argument callable suitable for `Button.on_click`; the argument
        it receives is ignored.
    """
    def f(_):
        # Write through a single .loc call. The previous chained form
        # `df.iloc[index]['spam'] = spam` assigned into a temporary copy of
        # the row, so `df` was never updated. Selecting with a boolean mask
        # on the index (instead of a bare label) also means an unknown label
        # is a no-op rather than appending a new row.
        df.loc[df.index == index, 'spam'] = spam

    return f

def display_msg():
    """Create a generator that renders one spam-labelled message per step.

    Each step displays the 'user' and 'message' fields of the next row of the
    global `df` whose 'spam' value is true (one row per distinct message
    text), followed by a row with 'Not Spam' and 'Spam' buttons wired to that
    row's index label, and then yields None.

    The callback factory is captured when the generator is created rather
    than looked up on every step: the user-labelling cell defines another
    function named `spambutton_cb` that takes a user name, and re-running
    that cell would otherwise silently change what these buttons do.

    NOTE(review): only the displayed row is relabelled; other rows with the
    same message text keep their label -- confirm that is intended.

    Returns:
        A generator yielding None once per rendered message.
    """
    make_callback = spambutton_cb  # bound now, while it is the per-row version

    def _render_messages():
        for index, row in df[df['spam']].drop_duplicates('message').iterrows():
            notspam = Button(
                description='Not Spam',
                disabled=False,
                button_style='success',
                icon='check'
            )
            spam = Button(
                description='Spam',
                disabled=False,
                button_style='danger',
                icon='times'
            )
            notspam.on_click(make_callback(index, False))
            spam.on_click(make_callback(index, True))

            display(row[['user', 'message']])
            display(HBox([copy(notspam), copy(spam)]))
            yield

    return _render_messages()

gen = display_msg()

def display_msg_cluster(_, msgs=gen):
    """Clear the output and render the next batch of up to 50 messages.

    Used both for the initial render and as the click handler of the 'Next'
    button it displays.

    Args:
        _: Ignored (the clicked button, or None for the initial call).
        msgs: Generator to advance. It defaults to the generator created just
            above and is bound at definition time, because the module-level
            name `gen` is also assigned by the user-labelling cell; reading
            `gen` at click time would follow whichever cell ran last.
    """
    clear_output()
    next_cluster = Button(
        description='Next',
        disabled=False,
        button_style='info',
    )
    next_cluster.on_click(display_msg_cluster)

    for _step in range(50):
        try:
            next(msgs)
        except StopIteration:
            # Nothing left to show: return without offering a 'Next' button.
            return
    display(next_cluster)

display_msg_cluster(None)

user                                        0gravecover
message    150K paid! Try 500K, now you can trust me :)
Name: 36194, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                        0gravecover
message    225K paid! Try 500K, now you can trust me :)
Name: 36258, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                            Soonpown2808
message    [Soonpown2808] | 52 to 100 | 100K to 246M | Ea...
Name: 36292, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             0gravecover
message    Tripling money | Legit | 50K min 500K max | 2x...
Name: 36927, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             0gravecover
message    Tripling money | Legit | 50K min 500K max | 2x...
Name: 36932, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                                RNG 1953
message    Trade accepted for Phorcyz amount:201K [20:11:07]
Name: 36972, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

user                                             RNG 1953
message    Phorcyz has lost with a roll of [6] [20:11:11]
Name: 36974, dtype: object

HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

In [54]:
# Spot-check every row recorded for a single user, including its spam label
df.loc[df['user'] == 'littlespice2']

Unnamed: 0,user,message,spam
5124,littlespice2,Doubling gp or items 50mil left,True
5147,littlespice2,Doubling gp or items 50mil left,True
5168,littlespice2,Doubling gp or items 50mil left,True
16779,littlespice2,Doubling gp or items ~567mil left,True
16800,littlespice2,Doubling gp or items ~567mil left,True
16841,littlespice2,Doubling gp or items,True
16943,littlespice2,Doubling gp or items ~367mil left,True
16962,littlespice2,Doubling gp or items ~367mil left,True
27857,littlespice2,Buying ss 400k,True
27873,littlespice2,Buying ss 400k,True
