# Classifying the Core Developers of Shogun
*Moiz Sajid*

In [144]:
%pylab inline
import scipy.stats
from collections import defaultdict
import copy
import numpy as np
import pandas as pd
import tools as t
import re
import bs4

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [145]:
def parse_string(string): 
    """"
        Parse the input string and tokenize it using regular expressisons:
        First clean the string such that it does not have any punctuation or number, it must only have a-z and A-Z.
        Please note that while doing this, the spaces much not get disturbed, but in case of multiple spaces convert 
        them to one space.
        Then convert the string to lower case and return its words as a list of strings.
        
        Example:
        --------
        Input :  computer scien_tist-s are,,,  the  rock__stars of tomorrow_ <cool>  ????
        Output:  ['computer', 'scientists', 'are', 'the', 'rockstars', 'of', 'tomorrow']
        
        Parameters:
        ----------
        string: string to be parsed...
        re: regular expression to be used for the tokenization.
        
        Returns:
        ---------
        list of tokens extracted from the string...
    """
    #string = string.decode('utf8')
    #new_string = re.sub(u"(?:_|-)", u"", string)
    #output = re.findall(u"[a-zA-Z_*-]+", new_string)
    new_string = re.sub("(?:_|-)", "", string)
    output = re.findall("[a-zA-Z_*-]+", new_string)
    
    return output

In [146]:
def parse_file(filename): # Parse a given file
    """"
        Parameters:
        ----------
        filename: name of text file to be read
   
        
        Returns:
        ---------
        read file as raw string (with \n, \t, \r, etc included)
    """
    
    username = []
    text= []
    
    with open(filename, 'r') as f:
        for line in f:
            #line = line.lstrip()
            if line[0] == '@' and line[:6] != "@sukey":
                line_split = line.split('\t')
                username.append(line_split[0][1:])
                text.append(line_split[1])
                
    return username, text
    
def files_to_strings(X):
    """
        Read an array (or list) of files where each file content is read in a string...
        Input:
        -------
        X an array (or list) of file names
        
        Returns:
        --------
        X as a numpy array with each row containing a read string from the file...
    """ 
    usernames = [] 
    texts = []
    
    for i in X:
        username, text = parse_file(i)
        usernames.extend(username)
        texts.extend(text)
        
    return np.array(usernames), np.array(texts)

In [147]:
class NaiveBayes:
    ''' Implements the Naive Bayes For Text Classification... '''
    def __init__(self, classes):
        self.classes=classes
        self.setv = set()
                    
    def train(self, X, Y):
        ''' Train the multiclass (or Binary) Bayes Rule using the given 
            X [m x d] data matrix and Y labels matrix
            
            Input:
            ------
            X: [m x d] a data matrix of m d-dimensional examples.
            Y: [m x 1] a label vector.
            
            Returns:
            -----------
            Nothing
        '''
        #vocabdict = defaultdict(int)
        setv = set()
        
        self.clabel = np.unique(Y)
        
        self.dictlist = [ defaultdict(int) for x in range(len(self.classes)) ]
        self.prior = np.array( [ 0.0 for x in range(len(self.classes)) ] )
        self.count = np.array( [ 0.0 for x in range(len(self.classes)) ] )
        
        for cidx, class_label in enumerate(self.classes):
            
            class_examples = np.count_nonzero(Y == class_label)
            
            class_prior = class_examples / float( Y.shape[0] )
            
            class_data = X[Y == class_label]
            
            mydict = defaultdict(int)
            
            counter = 0
            
            for doc in class_data:        
                myarr = parse_string(doc[0])
                
                for word in myarr:
                    if word not in setv:
                        setv.add(word)
                    
                    mydict[word] += 1
                    
                counter += len(myarr)
        
            self.prior[cidx] = class_prior
            self.dictlist[cidx] = copy.deepcopy(mydict)
            self.count[cidx] = counter
        
        self.setv = copy.deepcopy(setv)
        self.prior = self.prior[:, np.newaxis]
        self.count = self.count[:, np.newaxis]
            
        
    def test(self, X):
        
        ''' Test the trained classifiers on the given set of examples 
        
                   
            Input:
            ------
            X: [m x d] a data matrix of m d-dimensional test examples.
           
            Returns:
            -----------
                pclass: the predicted class for each example, i.e. to which it belongs
        '''
        nexamples, nfeatures=X.shape
                              
        i = 0
        result = [None] * nexamples
        
        while i < nexamples:
            
            words = parse_string(X[i][0])
            
            tprob = np.zeros((self.clabel.shape[0], 1))
            
            for word in words:
                
                class_prob = np.zeros((self.clabel.shape[0], 1))
                
                for cidx, clabel in enumerate(self.clabel):
                    class_prob[cidx] += np.log10((self.dictlist[cidx][word]+1.0)/(self.count[cidx]+len(self.setv)))
                
                tprob += class_prob
            
            tprob = tprob * self.prior
            result[i] = self.clabel[ tprob.argmax() ]
            #print result[i]
            
            i += 1
            
        return np.array(result)
        
        
    def predict(self, x):
        '''
            Predict the label of given input example...
            
            Input
            ---------
            x: example (list of words)
            
        '''    
        class_prob = np.zeros((self.class_label.shape[0], 1))
        
        for word in x:
            for cidx, clabel in enumerate(self.clabel):
                class_prob[cidx] += np.log10( ( self.dictlist[cidx][word] + 1.0 ) / ( self.count[cidx] + len( self.setv ) ) )

        class_prob = class_prob * np.array(self.prior)

        result = self.clabel[ class_prob.argmax() ]
        
        return np.array(result)

In [148]:
tdir= "./data/"
files=t.get_files(tdir,'*',withpath=True)

In [149]:
usernames, texts = files_to_strings(files)

#usernames = usernames.reshape((usernames.shape[0], 1))
texts = texts.reshape((texts.shape[0], 1))

traindata, trainlabels, testdata, testlabels = t.split_data(texts, usernames)
classes = np.unique(usernames)

trainlabels = trainlabels[:, np.newaxis]
testlabels = testlabels[:, np.newaxis]

In [150]:
print classes

['CoolMcCool' 'Dendemann' 'EinsZwo' 'HeikoS' 'Heikowankenobi' 'STDROCCurve'
 'SWAGGRegation' 'afaikidk' 'besser82' 'besser82_' 'bettyboo' 'bitch'
 'blackburn' 'blyad' 'drwiking' 'ebat' 'gausskern' 'iglesiasg'
 'iglesiasg|afk' 'iglesiasg|shogun' 'iloveponies' 'k8D9' 'knrrrd' 'lambday'
 'lambday_' 'lambdday' 'lisitsyn' 'mlsec' 'more_advertising'
 'natasharomanov' 'no_sgd_regrets_y' 'rieck' 'rock_curve' 'shogun|sonney2k'
 'sonimperator' 'sonne3000' 'sonney2k' 'sonney2k_' 'sonne|fallengod'
 'sonne|god' 'swagg_minimizer' 'swagg_noregrets' 'swagg_regressor'
 'thoralf' 'vowpalkardashian' 'what_heise' 'what_lisitsyn' 'what_switch'
 'wiking']


In [151]:
print traindata.shape, trainlabels.shape 
print testdata.shape, testlabels.shape

(80286, 1) (80286, 1)
(34404, 1) (34404, 1)


In [152]:
print '[Info] training a classifier for following classes {}'.format(classes)
nb=NaiveBayes(classes)
nb.train(traindata,trainlabels)
pclasses=nb.test(testdata)
acc=np.sum(pclasses==testlabels)/float(testlabels.shape[0])
print "[Info] Accuracy = {}".format(acc)

[Info] training a classifier for following classes ['CoolMcCool' 'Dendemann' 'EinsZwo' 'HeikoS' 'Heikowankenobi' 'STDROCCurve'
 'SWAGGRegation' 'afaikidk' 'besser82' 'besser82_' 'bettyboo' 'bitch'
 'blackburn' 'blyad' 'drwiking' 'ebat' 'gausskern' 'iglesiasg'
 'iglesiasg|afk' 'iglesiasg|shogun' 'iloveponies' 'k8D9' 'knrrrd' 'lambday'
 'lambday_' 'lambdday' 'lisitsyn' 'mlsec' 'more_advertising'
 'natasharomanov' 'no_sgd_regrets_y' 'rieck' 'rock_curve' 'shogun|sonney2k'
 'sonimperator' 'sonne3000' 'sonney2k' 'sonney2k_' 'sonne|fallengod'
 'sonne|god' 'swagg_minimizer' 'swagg_noregrets' 'swagg_regressor'
 'thoralf' 'vowpalkardashian' 'what_heise' 'what_lisitsyn' 'what_switch'
 'wiking']
[Info] Accuracy = 0.887106150448
