In [1]:
import os
import joblib
from utils import pretty_trim, simple_split, score_top_preds, get_cmap
from collections import Counter
from scipy.sparse import vstack
import numpy as np
import chardet
from sklearn.metrics import accuracy_score
%matplotlib inline

**Loading trained models**

In [2]:
%%time
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files from a trusted source.
filename = 'models_persistence/pickle_models'
# The bundle holds five objects; note that unpacking `pretty_trim` here rebinds
# the name that was imported from `utils` in the first cell.
(pretty_trim, counter, tfidf, rfe, clfs) = joblib.load(filename)

Wall time: 13.4 s


In [3]:
# Human-readable faculty names; position i in the list is the name of class label i
# (each string also carries its own index as a prefix).
# English glosses in the trailing comments are approximate translations.
str_labels = [u'0 บริหารธุรกิจ',  # Business Administration
    u'1 ประมง',  # Fisheries
    u'2 มนุษยศาสตร์',  # Humanities
    u'3 วนศาสตร์',  # Forestry
    u'4 วิทยาการจัดการ',  # Management Sciences
    u'5 วิทยาศาสตร์',  # Science
    u'6 วิทยาศาสตร์การกีฬา',  # Sports Science
    u'7 วิศวกรรมศาสตร์',  # Engineering
    u'8 ศิลปศาสตร์และวิทยาศาสตร์',  # Liberal Arts and Science
    u'9 ศึกษาศาสตร์',  # Education
    u'10 ศึกษาศาสตร์และพัฒนศาสตร์',  # Education and Development Sciences
    u'11 สถาปัตยกรรมศาสตร์',  # Architecture
    u'12 สังคมศาสตร์',  # Social Sciences
    u'13 สัตวแพทยศาสตร์',  # Veterinary Medicine
    u'14 สิ่งแวดล้อม',  # Environment
    u'15 อุตสาหกรรมเกษตร',  # Agro-Industry
    u'16 เกษตร',  # Agriculture
    u'17 เศรษฐศาสตร์',  # Economics
    u'18 โครงการจัดตั้งวิทยาเขตสุพรรณบุรี',  # Suphan Buri campus establishment project
    u'19 โครงการสหวิทยาการระดับบัณฑิตศึกษา']  # Interdisciplinary graduate programme

In [4]:
# Use the first of the persisted classifiers for every prediction in this notebook.
clf = clfs[0]

**Read segmented docs**

In [5]:
%%time
doc_path = u'./corpus/segmented-journal' # must be a segmented doc path
dataset_contents = []
filename2index = dict()
for i, filename in enumerate(os.listdir(doc_path)):
    path = os.path.join(doc_path, filename)
    filename2index[filename] = i
    with open(path) as f:
        content = f.read()
#         if chardet.detect(content)['encoding'] == 'ascii':
#             continue
        content = content.decode('utf8')
        dataset_contents.append(content)
print 'total files:', len(dataset_contents)

total files: 2165
Wall time: 1min 8s


# Apply learning pipeline to all the docs
First trim

In [6]:
%%time
for i in xrange(len(dataset_contents)):
    dataset_contents[i] = pretty_trim(dataset_contents[i])

Wall time: 5.96 s


Then count words and apply Tf-idf

In [7]:
# Word counts first, then tf-idf weighting, both with the transformers restored
# from the persisted bundle (only .transform is called here, never .fit).
%time X_new_count = counter.transform(dataset_contents)
%time X_new_tfidf = tfidf.transform(X_new_count)
print X_new_tfidf.shape

Wall time: 15.4 s
Wall time: 181 ms
(2165, 250000)


Remove some features

In [8]:
# Reduce the tf-idf matrix to the columns kept by the persisted `rfe` transformer
# (the recorded shapes show 250000 -> 20000 columns).
%time X_new_rfe = rfe.transform(X_new_tfidf)
print X_new_rfe.shape

Wall time: 61 ms
(2165, 20000)


Predict using trained models

In [9]:
# Predicted class index for every document, followed by a per-class tally.
y_pred = clf.predict(X_new_rfe)
Counter(y_pred)

Counter({0: 1,
         2: 1,
         3: 1,
         5: 1322,
         7: 15,
         9: 32,
         12: 2,
         15: 192,
         16: 575,
         17: 17,
         19: 7})

In [85]:
# Keep a second name for the feature matrix of the original (unmixed) documents;
# the clustering section calls km.predict(X_orig).
X_orig = X_new_rfe

## Synthesize approximated labels using heuristic

In [10]:
# [word -> class_label] mapping dictionary.
# Keys are matched verbatim against the words of each document; they look like
# stemmed English words (presumably produced by the segmentation step -- confirm).
# Values are class indices, i.e. positions in `str_labels`.
# Commented-out entries are currently disabled.
approx_label = {
#     "liber": 8,
#     "art": 8,
    "agricultur": 16,
    "agro": 16,
    "educ": 9,
#     "social": 12,
#     "fisheri": 1,
#     "manag": 4,
    "scienc": 5,
    "technolog": 5,
#     "medicin": 13,
#     "pharmaci": 13,
#     "forestri": 3,
#     "forest": 3,
    "engin": 7,
    "econom": 17,
    "architectur": 11,
#     "human": 2,
    "biotechnolog": 5,
#     "environment": 14,
#     "environ": 14,
#     "veterinari": 13,
#     "busi": 0,
#     u"ธุรกิจ": 0,
}

Find approximated labels by searching for the faculty name

In [47]:
def find_heuristic_y(approx_label, contents=None):
    """Approximate a class label for each document from the faculty name it mentions.

    Every document is split on whitespace. For each word containing ``faculti``
    or the Thai word for "faculty", the window of words from three before to
    four after it is scanned; the first window word found in ``approx_label``
    decides the document's label. Scanning stops at the first keyword whose
    window yields a label.

    Args:
        approx_label: dict mapping a word to its class label.
        contents: optional sequence of document strings. Defaults to the
            notebook-level ``dataset_contents``.

    Returns:
        A tuple ``(heuristic_y, valid_mask)``. ``heuristic_y`` is an int32
        array holding one label per document, with -1 where no label was found.
        ``valid_mask`` is the boolean array ``heuristic_y != -1``.
    """
    if contents is None:
        contents = dataset_contents
    heuristic_y = np.full(len(contents), -1, dtype=np.int32)  # -1 means "no label found"
    for ci, content in enumerate(contents):
        words = content.split()
        for wi, word in enumerate(words):
            if u'faculti' not in word and u'คณะ' not in word:
                continue
            # Clamp the window start at 0. A negative start would be read as an
            # offset from the END of the list and silently empty the window when
            # the keyword is one of the first three words.
            context = words[max(0, wi - 3):wi + 5]
            for w in context:
                if w in approx_label:
                    heuristic_y[ci] = approx_label[w]
                    break
            if heuristic_y[ci] != -1:
                break
    # documents without an approximated label are flagged invalid in the mask
    return heuristic_y, heuristic_y != -1

In [48]:
# heuristic_y holds one approximated label per document (-1 when none was found);
# valid_mask is True exactly where a label was found.
heuristic_y, valid_mask = find_heuristic_y(approx_label)
print 'Total Label Approximations:', np.count_nonzero(valid_mask)

Total Label Approximations: 1592


### Evaluate accuracy score on approximated labels

In [13]:
# Score only the documents that received a heuristic label, then show the label
# distribution of the heuristic and of the classifier on that same subset.
print 'Accuracy:', accuracy_score(heuristic_y[valid_mask], y_pred[valid_mask])
print Counter(heuristic_y[valid_mask])
print Counter(y_pred[valid_mask])

Accuracy: 0.564070351759
Counter({5: 845, 16: 462, 7: 133, 9: 89, 17: 56, 11: 7})
Counter({5: 955, 16: 408, 15: 169, 9: 25, 17: 14, 7: 12, 19: 6, 0: 1, 3: 1, 12: 1})


## Mixing documents

In [74]:
np.random.seed(42)  # seed NumPy's global RNG, which the sampling helpers below draw from
n = len(dataset_contents)  # number of loaded documents; random document ids are drawn from range(n)
print n
# choose only among documents that have a heuristic label
def choose_random_doc():
    """Return the index of a random document that has a heuristic label.

    Indices are drawn from ``range(n)`` with ``np.random.choice`` and rejected
    until one is flagged in ``valid_mask``.

    Raises:
        ValueError: if no document is flagged in ``valid_mask``. Without this
            check the rejection loop below would never terminate.
    """
    if not np.any(valid_mask[:n]):
        raise ValueError('no document has a heuristic label to sample from')
    while True:
        doc_id = np.random.choice(n)
        if valid_mask[doc_id]:
            return doc_id

# return 2 documents whose heuristic labels are different
def choose_distinct_docs():
    """Return the indices of two random documents with different heuristic labels.

    Pairs are drawn with ``choose_random_doc`` and rejected until the two
    heuristic labels differ.

    Returns:
        A tuple ``(doc1, doc2)`` of document indices.

    Raises:
        ValueError: if the labelled documents carry fewer than two different
            labels. Without this check the rejection loop below would never
            terminate.
    """
    if np.unique(heuristic_y[valid_mask]).size < 2:
        raise ValueError('need at least two different heuristic labels to pick distinct documents')
    while True:
        doc1 = choose_random_doc()
        doc2 = choose_random_doc()
        if heuristic_y[doc1] != heuristic_y[doc2]:
            return doc1, doc2

def segment(doc, wordbegin, wordend):
    """Return words ``wordbegin`` up to (excluding) ``wordend`` of document ``doc``.

    ``doc`` is an index into ``dataset_contents``. The document is split on
    whitespace and ordinary list slicing is applied, so a range that runs past
    the end of the document yields a shorter (possibly empty) list.
    """
    words = dataset_contents[doc].split()
    return words[wordbegin:wordend]
        
# input doc1 and doc2 are indices
def mix_docs(doc1, doc2, offsets=(200, 600), length=50, jitter=50):
    """Build a synthetic mixed document from interleaved excerpts of two documents.

    For every base offset, one excerpt is taken from ``doc1`` and then one from
    ``doc2``. An excerpt runs from ``offset + r1`` to ``offset + length + r2``
    (word positions), where ``r1`` and ``r2`` are independent draws from
    ``np.random.randint(jitter)``. With the defaults this reproduces the former
    hard-coded windows starting near word 200 and word 600.

    Args:
        doc1, doc2: indices into ``dataset_contents``.
        offsets: base word positions at which excerpts start.
        length: nominal excerpt length in words before jitter.
        jitter: exclusive upper bound of the random shift applied to both
            excerpt boundaries.

    Returns:
        The excerpts joined by single spaces. Excerpts that fall past the end
        of a short document contribute nothing.
    """
    # Tokenise each source document once instead of once per excerpt.
    source_words = [dataset_contents[doc1].split(), dataset_contents[doc2].split()]
    mixed = []
    for offset in offsets:
        for words in source_words:
            # Draw order (begin, then end; doc1 before doc2) is kept so a seeded
            # RNG yields the same mixtures as before.
            begin = offset + np.random.randint(jitter)
            end = offset + length + np.random.randint(jitter)
            mixed.extend(words[begin:end])
    return ' '.join(mixed)

def gen_plagiarized_contents(total):
    """Generate ``total`` synthetic documents, each mixed from two source documents.

    Returns:
        A tuple of three parallel lists of length ``total``:
        the mixed texts, the ``[label1, label2]`` heuristic labels of the two
        sources, and the ``[doc1, doc2]`` indices of the two sources.
    """
    contents, ys, source_docs = [], [], []
    for _ in range(total):
        first, second = choose_distinct_docs()
        contents.append(mix_docs(first, second))
        ys.append([heuristic_y[first], heuristic_y[second]])
        source_docs.append([first, second])
    return contents, ys, source_docs

def vectorize_contents(contents):
    """Run raw texts through the same count -> tf-idf -> rfe chain used for the corpus.

    Returns whatever ``rfe.transform`` produces for the tf-idf weighted counts
    of ``contents``.
    """
    counts = counter.transform(contents)
    weighted = tfidf.transform(counts)
    return rfe.transform(weighted)

2165


In [75]:
total_gen = 1000  # number of synthetic mixed documents to generate
# plagiarized_labels[i] and source_docs[i] describe the two sources of plagiarized_docs[i]
plagiarized_docs, plagiarized_labels, source_docs = gen_plagiarized_contents(total_gen)
X_plagiarized = vectorize_contents(plagiarized_docs)

In [80]:
# Show the label pairs and source-document pairs of the first ten mixed documents,
# then look up the heuristic labels of the first pair as a consistency check.
plagiarized_labels[:10], source_docs[:10], heuristic_y[source_docs[0][0]], heuristic_y[source_docs[0][1]]

([[7, 16],
  [7, 16],
  [5, 17],
  [7, 5],
  [5, 17],
  [5, 16],
  [5, 16],
  [16, 5],
  [5, 16],
  [5, 16]],
 [[1095, 466],
  [1515, 1215],
  [1082, 2047],
  [2068, 600],
  [1129, 1500],
  [815, 455],
  [1076, 791],
  [1367, 1152],
  [200, 1863],
  [1895, 1570]],
 7,
 16)

### Testing accuracy of plagiarism detection
A prediction counts as correct when at least one of the top-k predicted classes is among the true labels of the two source documents.

In [17]:
# Class probabilities for every mixed document (rows: documents, columns: classes).
probs = clf.predict_proba(X_plagiarized)
# k is the number of predicted labels you want
def get_predicted_labels(probs, k):
    """Return the indices of the ``k`` most probable classes for every row.

    Args:
        probs: 2-D array-like of scores, one row per sample and one column per class.
        k: number of labels to return per row. Values larger than the number
            of classes return all classes.

    Returns:
        Integer array of shape ``(n_rows, min(k, n_classes))``. Within a row the
        labels are ordered by ascending probability, so the most probable class
        is the LAST entry.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError('k must be non-negative, got %r' % (k,))
    probs = np.asarray(probs)
    n_classes = probs.shape[1]
    # Sort ascending, then keep the trailing k columns. An explicit start index
    # is used because the slice [-k:] with k == 0 would return every column.
    ranked = probs.argsort(axis=1)
    return ranked[:, n_classes - min(k, n_classes):]

In [18]:
# testing the mechanics of how we interpret a valid match:
# a match means at least one of the top-3 predicted labels is one of the true labels
pred_labels = get_predicted_labels(probs, 3)
i = 0 # index of the content we want to test
print plagiarized_labels[i], pred_labels[i], any(y in plagiarized_labels[i] for y in pred_labels[i])

[7, 16] [16 15  5] True


In [19]:
# accuracy as a function of the number of predicted labels per document
def evaluate_accuracy(k):
    """Return the fraction of mixed documents hit by their top-``k`` predictions.

    A document counts as a hit when at least one of its ``k`` predicted labels
    appears in its pair of true labels. Reads the notebook-level ``probs``,
    ``plagiarized_labels`` and ``total_gen``.
    """
    top_k = get_predicted_labels(probs, k)
    hits = []  # one boolean per generated document
    for doc_idx in range(total_gen):
        true_labels = plagiarized_labels[doc_idx]
        hits.append(any(candidate in true_labels for candidate in top_k[doc_idx]))
    return np.mean(hits)

In [20]:
print 'Total plagiarized docs:', total_gen
# k can be at most 20 because there are 20 classes; only k = 1..10 is evaluated here
for k in range(1, 11):
    print 'Accuracy given k=%d:' % k, evaluate_accuracy(k)

Total plagiarized docs: 1000
Accuracy given k=1: 0.807
Accuracy given k=2: 0.957
Accuracy given k=3: 0.989
Accuracy given k=4: 0.996
Accuracy given k=5: 1.0
Accuracy given k=6: 1.0
Accuracy given k=7: 1.0
Accuracy given k=8: 1.0
Accuracy given k=9: 1.0
Accuracy given k=10: 1.0


# Clustering

In [38]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import homogeneity_completeness_v_measure
from collections import Counter
import joblib

In [32]:
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files from a trusted source.
# `filename` is rebound here; it previously held the model bundle path and a loop variable.
filename = 'models_persistence/final_dataset'
X_train_final, y_train, X_test_final, y_test = joblib.load(filename)

In [105]:
%%time
n_clusters = len(str_labels)
km = KMeans(n_clusters=n_clusters, max_iter=10, n_init=10)
pred = km.fit_predict(X_train_final)

Wall time: 44.3 s


In [106]:
#%% Clustering Model Evaluation Metrics
# Compare the cluster assignments with the training labels, then list how many
# training documents fell into each cluster.
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y_train, pred)
print '==== Model Evaluation Metrics ===='
print 'Homogeneity:', homogeneity
print 'Completeness:', completeness
print 'V-measure:', v_measure
print 'Predictions:'
for key, value in Counter(pred).iteritems():
    print key, value

==== Model Evaluation Metrics ====
Homogeneity: 0.341512505578
Completeness: 0.350862947214
V-measure: 0.346124588139
Predictions:
0 6
1 81
2 53
3 84
4 258
5 42
6 52
7 39
8 597
9 19
10 186
11 40
12 22
13 63
14 131
15 70
16 170
17 7
18 52
19 67


In [107]:
# Assign both the mixed documents and the original documents to the fitted clusters,
# then tally the cluster sizes of each set.
y_plagiarized_cluster = km.predict(X_plagiarized)
y_orig_cluster = km.predict(X_orig)
Counter(y_plagiarized_cluster), Counter(y_orig_cluster)

(Counter({4: 735, 8: 265}),
 Counter({1: 2, 4: 1941, 7: 1, 8: 198, 10: 7, 12: 5, 13: 3, 14: 4, 19: 4}))

In [116]:
# example: check if the original docs are in the same cluster or not
equal_counts = []
show_num = 20
for i in range(total_gen):
    doc1_cluster = y_orig_cluster[source_docs[i][0]]
    doc2_cluster = y_orig_cluster[source_docs[i][1]]
    equal_count = 0
    equal_count += int(y_plagiarized_cluster[i] == doc1_cluster)
    equal_count += int(y_plagiarized_cluster[i] == doc2_cluster)
    equal_counts.append(equal_count)
    if i < show_num:
        print y_plagiarized_cluster[i], doc1_cluster, doc2_cluster, equal_count
equal_counts = np.array(equal_counts)
print 'Average Equal Counts:', equal_counts.mean()
for i in range(3):
    print 'equal_count=%d counts:' % i, (equal_counts==i).sum()

4 4 4 2
4 4 4 2
8 4 14 0
4 4 4 2
4 4 4 2
4 4 4 2
8 4 4 0
4 4 4 2
4 4 4 2
4 4 4 2
4 4 4 2
8 4 4 0
8 4 8 1
8 4 8 1
4 4 4 2
8 4 4 0
8 4 4 0
4 4 4 2
4 4 4 2
4 4 4 2
Average Equal Counts: 1.558
equal_count=0 counts: 123
equal_count=1 counts: 196
equal_count=2 counts: 681
