In [1]:
import nltk
import string
import os
import sys
import numpy as np

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn import svm
from multiprocessing import Pool, cpu_count, active_children
from time import sleep, time

In [2]:
train_path = 'GeneralData/A/train'
test_path = 'GeneralData/A/test'
train_files = []
test_files = []
new_dict = dict()
stemmer = PorterStemmer()
#chat_corpus = [['uid','msg']]

In [3]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    #stems = stem_tokens(tokens, stemmer)    ## <-- (optional ?)
    return tokens

In [4]:
def data_extractor(file):
    chat_corpus = []
    tmp_dict = dict()

    f = open(file)
    soup = BeautifulSoup(f, 'xml')

    # data extraction
    all_post = soup.findAll('POST')
    for post in all_post:
        tmp = []
        #if post.USERNAME.string:
        #    tmp.append(post.USERNAME.string)
        if post.BODY.string:
            tmp.append(post.BODY.string)
            chat_corpus.append(tmp)
            
    # data to string 
    text = ''
            
    for chats in chat_corpus:
        #print("chats : ----")
        #print(chats)
        for logs in chats:
            tmp = ''
            #print("logs : ----")
            #print(logs)
            for pcs in logs:
                tmp = tmp + pcs
                tmp = tmp + " : "
            text = text + tmp + " \n "
                    
    lowers = text.lower()
    no_punctuation = lowers.translate(string.punctuation)
    tmp_dict[file] = no_punctuation
    return tmp_dict

def get_filenames(path):
    list_of_files = []
    for subdir, dirs, files in os.walk(path):
        for file in files:
            file_path = subdir + os.path.sep + file
            list_of_files.append(file_path)
    return list_of_files

def cb(data):
    if data:
        global new_dict
        new_dict.update(data)

In [5]:
#if __name__ == '__main__':

In [6]:
pool = Pool(processes=cpu_count()*4, maxtasksperchild=25)
    
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
 
train_files = get_filenames(train_path)
test_files = get_filenames(test_path)

### Block 1

print("Processing training data...")
then = time()
for file in train_files:
    pool.apply_async(data_extractor, (file,), callback=cb)
pool.close()
while len(active_children()) > 1:
    sleep(0.5)
pool.join()
now = time()
print(" [*] Finished processing training data. Time taken : %0.2f sec" % (now - then))

### Block 2

print(" [*] Feature Enginnering...")
then = time()
train_tfidf = tfidf.fit_transform(new_dict.values())
now = time()
print(" [*] Completed. Time taken : %0.2f sec" % (now - then))
new_dict.clear()

  
pool = Pool(processes=cpu_count()*4, maxtasksperchild=25)

### Block 3

print("Processing testing data...")
then = time()
for file in test_files:
    pool.apply_async(data_extractor, (file,), callback=cb)
pool.close()
while len(active_children()) > 1:
    sleep(0.5)
pool.join()
now = time()
print(" [*] Finished processing testing data. Time taken : %0.2f sec" % (now - then))
    
### Block 4

print(" [*] Feature Engineering...")
then = time()
test_tfidf = tfidf.transform(new_dict.values())
now = time()
print(" [*] Completed. Time taken : %0.2f sec" % (now - then))
new_dict.clear()


Processing training data...
 [*] Finished processing training data. Time taken : 2.53 sec
 [*] Feature Enginnering...
 [*] Completed. Time taken : 3.72 sec
Processing testing data...
 [*] Finished processing testing data. Time taken : 3.02 sec
 [*] Feature Engineering...
 [*] Completed. Time taken : 4.22 sec


In [7]:
### Block 5

print("Starting classifier...")
print("Using OneClassSVM")
clf = svm.OneClassSVM(kernel='poly', degree=5, random_state=0)
print(" [*] Training the classifier...")
then = time()
clf.fit(train_tfidf)
now = time()
print(" [*] Training completed. Time taken : %0.6f sec" % (now - then))
print("Prediciting results for training dataset...")
y_pred_train = clf.predict(train_tfidf)
print("Prediciting results for test dataset...")
y_pred_test = clf.predict(test_tfidf)
print("Completed.")
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size

Starting classifier...
Using OneClassSVM
 [*] Training the classifier...
 [*] Training completed. Time taken : 0.003159 sec
Prediciting results for training dataset...
Prediciting results for test dataset...
Completed.


In [8]:
print("Train accuracy: %0.2f" % (100-((n_error_train/y_pred_train.size)*100)), "%")
print("Test accuracy : %0.2f" % (100-((n_error_test/y_pred_test.size)*100)), "%")

Train accuracy: 54.05 %
Test accuracy : 57.89 %


### Accuracy Results
----
#### Kernel : poly
 
  | Deg | Tr | Te |
  |:---:|:--:|:--:|
  | 1 | 27.0% | 31.5% |
  | 2 | 37.8% | 36.8% |
  | 3 | " | " |
  | 4 | 35.1% | " |
  | 5 | " | " |
  | 6 | " | " |
  | 7 | 37.8% | 36.8% |
  | 8 | " | " |
  | 9 | " | " |
  | 10 | 35.1 | 36.8 |
  | 11 | " | " |
  | 12 | 37.8 | 36.8 |
  | 24 | 18.9 | 21.0 |
  | 25 | 94.5 | 94.7 |
  | 26 | 100 | 100 |

<1x62 sparse matrix of type '<class 'numpy.float64'>'
	with 49 stored elements in Compressed Sparse Row format>

In [63]:
from sklearn.cross_validation import StratifiedKFold

def class_report(conf_mat):
    tp, fp, fn, tn = conf_mat.flatten()
    measures = {}
    measures['accuracy'] = (tp + tn) / (tp + fp + fn + tn)
    measures['specificity'] = tn / (tn + fp)        # (true negative rate)
    measures['sensitivity'] = tp / (tp + fn)        # (recall, true pos rate)
    measures['precision'] = tp / (tp + fp)
    measures['f1score'] = 2*tp / (2*tp + fp + fn)
    return measures

def analyze_model(model=None, folds=10):
    ''' Run x-validation and return scores, averaged confusion matrix,
        and df with false positives and negatives '''

    X, y, X_test = load()
    y = y.values   # to numpy
    X = X.values
    if not model:
        model = load_model()

    # Manual x-validation to accumulate actual
    cv_skf = StratifiedKFold(y, n_folds=folds, shuffle=False, random_state=0)
    scores = []
    conf_mat = np.zeros((2, 2))      # Binary classification
    false_pos = Set()
    false_neg = Set()

    for train_i, val_i in cv_skf:
        X_train, X_val = X[train_i], X[val_i]
        y_train, y_val = y[train_i], y[val_i]

        print "Fitting fold..."
        model.fit(X_train, y_train)

        print "Predicting fold..."
        y_pprobs = model.predict_proba(X_val)       # Predicted probabilities
        y_plabs = np.squeeze(model.predict(X_val))  # Predicted class labels

        scores.append(roc_auc_score(y_val, y_pprobs[:, 1]))
        confusion = confusion_matrix(y_val, y_plabs)
        conf_mat += confusion

        # Collect indices of false positive and negatives
        fp_i = np.where((y_plabs==1) & (y_val==0))[0]
        fn_i = np.where((y_plabs==0) & (y_val==1))[0]
        false_pos.update(val_i[fp_i])
        false_neg.update(val_i[fn_i])

        print "Fold score: ", scores[-1]
        print "Fold CM: \n", confusion

    print "\nMean score: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2)
    conf_mat /= folds
    print "Mean CM: \n", conf_mat
    print "\nMean classification measures: \n"
    print(class_report(conf_mat))
    return scores, conf_mat, {'fp': sorted(false_pos), 'fn': sorted(false_neg)}

<bound method csr_matrix.tocsr of <37x62 sparse matrix of type '<class 'numpy.float64'>'
	with 1667 stored elements in Compressed Sparse Row format>>

## Accuracy Results [Testing dataset = 25% Training dataset]
-----

### *  (w/o stemming)

 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 83.9% | 71.4% | **<-- Best**
 | 2 | 75.0% | 64.2% |
 | 3 | 75.0% | 64.2% |
 | 4 | 64.2% | 64.2% |
 | 5 | 66.1% | 64.2% |
 | 6 | 75.0% | 71.4% |
 | **Mean** | **73.2%** | **66.6%** |

### *  (w/ stemming)
 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 76.78% | 71.42% |
 | 2 | 62.50% | 57.14% |
 | 3 | 58.93% | 64.28% | 
 | 4 | 60.71% | 57.14% |
 | 5 | 64.28% | 64.28% |
 | 6 | 80.35% | 71.43% | **<-- Best**
 | **Mean** | **67.25%** | **64.28%** |
 
 ### *  (w/o stemming & training with only predator chats)
 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 67.8% | 85.7% |
 | 2 | 60.7% | 78.5% |
 | 3 | 71.4% | 92.8% |
 | 4 | 76.7% | 92.8% | **<-- Best**
 | 5 | 75.0% | 92.8% |
 | 6 | 71.4% | 92.8% |
 | **Mean** | **70.5%** | **89.23%**|


## Accuracy Results [3 fold cross-validation]
-----

 ### *  (w/o stemming & training with only predator chats)
 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 13.5% | 36.8% |
 | 2 | 24.3% | 47.3% |
 | 3 | 59.4% | 78.9% | **<-- Best**
 | **Mean** | **32.4%** | **54.3%** |
 
 
 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 48.6% | 57.8% |
 | 2 | 67.5% | 89.4% | **<-- Best**
 | 3 | 56.7% | 63.1% |
 | **Mean** | **57.6%** | **70.1%** |
 
 
 | # | Training | Testing |
 |:-:|:--------:|:-------:|
 | 1 | 63.2% | 77.7% |
 | 2 | 78.9% | 83.3% | **<--Best**
 | 3 | 73.6% | 77.7% |
 | **Mean** | **71.9%** | **79.5%**|
