## Process and Label Dataset

Here, we take a collected `messages.csv` as input and load it into a DataFrame. We then iterate through the messages, grouped by user, and display them so that each user can be manually labelled as spam or not spam. The labelled result is written to `dataset.csv`.

In [15]:
import unicodedata
from copy import copy

import numpy as np
import pandas as pd
from IPython.display import display, clear_output
from ipywidgets import Button, HBox, VBox

In [14]:
# Read the collected chat log. The file has no header row, so the column names
# are supplied here. `dtype=str` + `keep_default_na=False` keep every field as
# literal text: by default pandas parses empty fields (and text such as "NA" or
# "null") as NaN, which would make the `!= ''` filter below a no-op.
messages_raw = pd.read_csv(
    'messages.csv',
    encoding='latin-1',
    names=['user', 'message'],
    dtype=str,
    keep_default_na=False,
).fillna('')  # defensive: covers rows that lack a trailing field altogether

# Rows with an empty user indicate a non-public message; drop them, just in case.
has_user = messages_raw['user'] != ''
# Messages that contain a non-breaking space are duplicates from re-logging.
# Look for the character itself rather than the byte 0xA0 in the UTF-8 encoding,
# which would also match other characters (e.g. U+00E0 encodes as c3 a0).
has_nbsp = messages_raw['message'].str.contains('\xa0', regex=False)

df = messages_raw[has_user & ~has_nbsp].reset_index(drop=True)
# Every message starts out as "not spam". Assigning a scalar after the filtering
# avoids index alignment against rows that have already been dropped.
df['spam'] = False
df.head(10)

Unnamed: 0,user,message,spam
0,im kurupted,G11111,False
1,Arbuckleee,Hurry up before all of the gold is given away !!!,False
2,Marbellizer,"Well hard to say for exact lvls, its just a bi...",False
3,Arbuckleee,"Massive 2.1b giveaway happening right now, sea...",False
4,Marbellizer,Dps usually but if you wanna see exact numers ...,False
5,Arbuckleee,Trade me after posting your nickname on the fo...,False
6,Marbellizer,Spreadsheets,False
7,Arbuckleee,Search Munk23 on Faceb00k to participate in a ...,False
8,Marbellizer,And existing metas for bosses,False
9,xLouis,...,False


In [32]:
# Sanity check: show the first (user, messages) pair produced by the grouping.
list(df.groupby('user'))[:1]

[('007KingDude',               user         message   spam
  24467  007KingDude  Bond give away  False
  24561  007KingDude  Bond give away  False)]

In [39]:
# This part of the code is disgusting and throws errors at the end but it works well enough without crashing my browser

def spambutton_cb(user, spam):
    """Build a click handler that labels every message from `user`.

    The returned callable accepts the clicked widget (ignored) and writes
    `spam` into the `spam` column of the module-level `df` for all rows whose
    `user` column equals `user`.
    """
    def apply_label(_button):
        from_user = df['user'] == user
        df.loc[from_user, 'spam'] = spam

    return apply_label

def display_user():
    """Generator that renders one labelling row per user, one user per `next()`.

    For each user in the module-level `df` (grouped by the `user` column) it
    displays up to three distinct messages followed by "Not Spam" / "Spam"
    buttons whose click handlers come from `spambutton_cb`. It yields `None`
    after each user, so the caller decides how many users to render at a time.
    """
    for user, messages in df.groupby('user'):
        notspam = Button(
            description='Not Spam',
            disabled=False,
            button_style='success',
            icon='check'
        )
        spam = Button(
            description='Spam',
            disabled=False,
            button_style='danger',
            icon='times'
        )
        notspam.on_click(spambutton_cb(user, False))
        spam.on_click(spambutton_cb(user, True))

        # A few distinct messages give enough context to judge the user.
        display(messages.drop_duplicates('message').head(3)[['user', 'message']])
        # Display the buttons themselves: the click handlers above are
        # registered on these instances, so there is no reason to copy them.
        display(HBox([notspam, spam]))
        yield

# One shared generator: every `next(gen)` renders the next user's labelling row.
gen = display_user()

def display_user_cluster(_, batch_size=10):
    """Clear the cell output and render the next batch of users to label.

    Also used as the click handler of the "Next" button, so the first
    positional argument is the clicked widget (unused).

    Parameters
    ----------
    _ : object
        Ignored; present so the function can be passed to `Button.on_click`.
    batch_size : int, default 10
        Number of users rendered per batch. Kept small so the page stays
        responsive.
    """
    clear_output()

    for _i in range(batch_size):
        try:
            next(gen)
        except StopIteration:
            # Every user has been shown: say so instead of ending silently (or
            # leaving an empty cell after the final "Next" click), and do not
            # offer another "Next" button.
            print('All users have been displayed.')
            return

    next_cluster = Button(
        description='Next',
        disabled=False,
        button_style='info',
    )
    next_cluster.on_click(display_user_cluster)
    display(next_cluster)

display_user_cluster(None)

Unnamed: 0,user,message
28815,zac1,Cispy
28836,zac1,Ok
28901,zac1,Your fake exacly my point


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
11509,zamrkfiesta,Yee
11535,zamrkfiesta,Blue sucks
11639,zamrkfiesta,Lol Xd


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
24478,zanabu,Noty


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
20704,zeuhs1980,Kkkkk


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
21024,zez30,Can any1 spare some gold please so i can train...
21061,zez30,Can any1 spare some gold please


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

Unnamed: 0,user,message
6129,z 1 n k,110k
6133,z 1 n k,110k addy tip


HBox(children=(Button(button_style='success', description='Not Spam', icon='check', style=ButtonStyle()), Butt…

In [40]:
# Persist the labelled messages. The row index is written as the first
# (unnamed) column of the CSV.
df.to_csv('dataset.csv', index=True)

In [7]:
# Scratch check: two names that render identically but differ by a non-breaking
# space (U+00A0). The character is written as an escape so that it stays
# visible in the source and cannot be lost by an editor.
a = "A furyk"
b = "A\xa0furyk"
print(a.encode('utf-8'))
print(b.encode('utf-8'))
# NFKD folds the non-breaking space into a plain space, so `b` normalises to
# the same ASCII bytes as `a`.
unicodedata.normalize('NFKD', b).encode('ascii', 'ignore')

b'A furyk'
b'A\xc2\xa0furyk'


b'A furyk'

In [27]:
# Preview the first ten rows of the working frame.
df.head(n=10)

Unnamed: 0,user,message,spam
0,nadiak5,2.5m,False
1,Knight Askos,A furyk,False
2,HeyBeeeeta,Looking for any generous players kind enough t...,False
3,Begfor11Buns,No,False
4,Granular,Allisun come trouble brewing i need med mory d...,False
5,Knight Askos,Lol,False
6,Begfor11Buns,Lol,False
7,nadiak5,2.5m,False
10,Begfor11Buns,No,False
12,Knight Askos,Lol,False
