In [15]:
from collections import Counter
from sklearn import naive_bayes
import sst
import os
from random import shuffle
import numpy as np
import vsm
import pandas as pd
import tensorflow as tf
import utils
from nltk.util import bigrams
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [11]:
data_home = 'process_results'


def _read_paragraphs(path):
    """Read a text file into paragraphs delimited by blank lines.

    Every line is stripped of surrounding whitespace. Each blank line closes
    the paragraph collected so far, so consecutive blank lines still yield
    empty paragraphs. A final paragraph that is not followed by a blank line
    is kept rather than dropped.

    Args:
        path: location of the text file to read.

    Returns:
        A list of paragraphs, each paragraph being a list of non-empty,
        stripped lines.
    """
    paragraphs = []
    current = []
    # The corpora contain Chinese text; decode explicitly instead of relying
    # on the platform default encoding.
    # NOTE(review): assumes the files are UTF-8 encoded -- confirm.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == '':
                paragraphs.append(current)
                current = []
            else:
                current.append(line)
    # Flush the trailing paragraph when the file does not end with a blank line.
    if current:
        paragraphs.append(current)
    return paragraphs


# Each corpus gets its own fresh accumulator, so leftover lines from the
# first file can no longer leak into the first paragraph of the second.
ming = _read_paragraphs(os.path.join(data_home, 'results_mingchao'))
qin = _read_paragraphs(os.path.join(data_home, 'results_preqin'))

## Dataset Splitting

In [12]:
# Number of paragraphs (blank-line-separated groups of lines) loaded into `ming`.
print(len(ming))

14045


In [13]:
# Number of paragraphs (blank-line-separated groups of lines) loaded into `qin`.
print(len(qin))

7388


In [16]:
from random import Random

# Balance the two classes by keeping a random subset of `ming` that is the
# same size as `qin`. A dedicated, seeded generator makes the subset (and
# therefore the train/test split built from it) reproducible across kernel
# restarts without touching the global random state.
SHUFFLE_SEED = 0
Random(SHUFFLE_SEED).shuffle(ming)
# Derive the target size from the data instead of hard-coding 7388.
ming = ming[:len(qin)]

In [18]:
# The first 6000 paragraphs of each corpus are used for training;
# whatever follows them is held out for testing.
ming_train, ming_test = ming[:6000], ming[6000:]
qin_train, qin_test = qin[:6000], qin[6000:]

## Unigram Word Count Stats

In [20]:
from collections import defaultdict

# Character frequencies over both training sets (Ming first, then pre-Qin).
# Space characters are not counted.
chars = defaultdict(int)
for corpus in (ming_train, qin_train):
    for paragraph in corpus:
        for sentence in paragraph:
            for symbol in sentence:
                if symbol == ' ':
                    continue
                chars[symbol] += 1

In [40]:
# Characters ordered from least to most frequent; the sort is stable, so
# characters with equal counts stay in first-seen order.
charslist = sorted(chars, key=chars.get)
# Vocabulary: the 1000 most frequent characters. The cutoff can be changed,
# but any cell that hard-codes a width of 1000 must be changed with it.
common_chars = charslist[-1000:] #can change this

In [41]:
# Show the ten most frequent characters (the list is sorted by ascending count).
common_chars[-10:]

['道', '曰', '一', '了', '而', '人', '子', '也', '不', '之']

In [44]:
def unigrams(para, vocab=None):
    """Return the character-count feature vector for one paragraph.

    Args:
        para: iterable of sentences, each an iterable of characters
            (for example a list of strings).
        vocab: optional sequence of characters that defines the feature
            positions. Defaults to the module-level ``common_chars``.

    Returns:
        A 1-D float numpy array with one entry per vocabulary character;
        entry ``i`` is the number of occurrences of ``vocab[i]`` in ``para``.
        Characters that are not in the vocabulary are ignored.
    """
    if vocab is None:
        vocab = common_chars
    # Build a character -> position table once per call so each character is
    # resolved by a hash lookup instead of two linear scans of the list.
    position = {}
    for idx, char in enumerate(vocab):
        position.setdefault(char, idx)  # first occurrence wins, like list.index
    # Size the vector from the vocabulary so a changed cutoff cannot overflow it.
    x = np.zeros((len(vocab),))
    for sent in para:
        for char in sent:
            idx = position.get(char)
            if idx is not None:
                x[idx] += 1
    return x

In [47]:
from sklearn.naive_bayes import MultinomialNB
#Label preQin as 0s
# Rows are laid out as all pre-Qin training paragraphs (label 0) followed by
# all Ming training paragraphs (label 1). Shapes are derived from the data
# rather than hard-coded, so changing the split or the vocabulary cutoff does
# not require editing this cell.
n_qin_train = len(qin_train)
train_paras = qin_train + ming_train
X = np.zeros((len(train_paras), len(common_chars)))
y = np.zeros((len(train_paras),))
y[n_qin_train:] = 1
for row, para in enumerate(train_paras):
    X[row] = unigrams(para)
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
from sklearn.metrics import confusion_matrix
# Held-out evaluation with the same row layout as training: pre-Qin paragraphs
# first (label 0), then Ming paragraphs (label 1). Shapes come from the data
# instead of the hard-coded 2776 / 1388 / 1000.
n_qin_test = len(qin_test)
test_paras = qin_test + ming_test
X_test = np.zeros((len(test_paras), len(common_chars)))
y_test = np.zeros((len(test_paras),))
y_test[n_qin_test:] = 1
for row, para in enumerate(test_paras):
    X_test[row] = unigrams(para)
y_pred = clf.predict(X_test)
# Rows of the matrix are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[1388,    0],
       [ 113, 1275]])

## Bigram Word Count Stats

In [51]:
def bigrams(para):
    """Return the list of character bigrams for one paragraph.

    For every sentence the adjacent character pairs are emitted first,
    followed by a start pair ``('<S>', first_char)`` and an end pair
    ``(last_char, '</S>')``. Empty sentences contribute nothing instead of
    raising ``IndexError``.

    NOTE: this definition shadows the ``bigrams`` name imported from
    ``nltk.util`` earlier in the notebook.

    Args:
        para: iterable of sentences, each an indexable sequence of
            characters (for example a list of strings).

    Returns:
        A list of 2-tuples in the order described above.
    """
    pairs = []
    for sent in para:
        if len(sent) == 0:
            # Nothing to pair up, and sent[0] / sent[-1] would fail.
            continue
        # Emission order is kept as-is: it determines how Counter.most_common
        # breaks ties between equally frequent pairs.
        pairs.extend(zip(sent, sent[1:]))
        pairs.append(('<S>', sent[0]))
        pairs.append((sent[-1], '</S>'))
    return pairs

In [58]:
# Gather every bigram from the training data (Ming paragraphs first, then
# pre-Qin) and keep the 2000 most frequent pairs as the feature vocabulary.
all_pairs = []
for corpus in (ming_train, qin_train):
    for paragraph in corpus:
        all_pairs += bigrams(paragraph)
c = Counter(all_pairs)
common_pairs = [pair for pair, _count in c.most_common(2000)]

In [64]:
# Show the 40 most frequent bigrams (the list is ordered by descending count).
common_pairs[:40]

[('也', '</S>'),
 ('曰', ' '),
 (' ', '不'),
 ('道', ' '),
 ('也', ' '),
 ('之', ' '),
 ('者', ' '),
 ('來', ' '),
 ('了', '</S>'),
 ('之', '</S>'),
 ('子', ' '),
 ('人', ' '),
 ('矣', '</S>'),
 ('來', '</S>'),
 (' ', '我'),
 (' ', '一'),
 (' ', '你'),
 (' ', '只'),
 (' ', '以'),
 ('了', ' '),
 (' ', '而'),
 ('<S>', '不'),
 (' ', '有'),
 (' ', '則'),
 (' ', '大'),
 (' ', '又'),
 ('不', '知'),
 ('下', ' '),
 ('<S>', '那'),
 (' ', '是'),
 ('子', '曰'),
 (' ', '其'),
 ('<S>', '今'),
 ('時', ' '),
 ('說', ' '),
 ('西', '門'),
 ('去', ' '),
 ('而', '不'),
 ('去', '</S>'),
 ('天', '下')]

In [65]:
def bigrams_wrap(para, vocab=None):
    """Return the bigram-count feature vector for one paragraph.

    Args:
        para: paragraph passed straight to ``bigrams`` to obtain its pairs.
        vocab: optional sequence of bigram tuples that defines the feature
            positions. Defaults to the module-level ``common_pairs``.

    Returns:
        A 1-D float numpy array with one entry per vocabulary bigram;
        entry ``i`` is the number of occurrences of ``vocab[i]`` among the
        paragraph's bigrams. Pairs outside the vocabulary are ignored.
    """
    if vocab is None:
        vocab = common_pairs
    # Build a pair -> position table once per call so each pair is resolved
    # by a hash lookup instead of two linear scans of the list.
    position = {}
    for idx, pair in enumerate(vocab):
        position.setdefault(pair, idx)  # first occurrence wins, like list.index
    # Size the vector from the vocabulary instead of a hard-coded 2000.
    x = np.zeros((len(vocab),))
    for pair in bigrams(para):
        idx = position.get(pair)
        if idx is not None:
            x[idx] += 1
    return x

In [66]:
# Bigram training matrix: all pre-Qin training paragraphs (label 0) followed
# by all Ming training paragraphs (label 1). Shapes are derived from the data
# instead of the hard-coded 12000 / 2000 / 6000.
n_qin_train = len(qin_train)
train_paras = qin_train + ming_train
X = np.zeros((len(train_paras), len(common_pairs)))
y = np.zeros((len(train_paras),))
y[n_qin_train:] = 1
for row, para in enumerate(train_paras):
    X[row] = bigrams_wrap(para)
clf = MultinomialNB()
clf.fit(X, y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [67]:
# Held-out evaluation of the bigram model with the same row layout as
# training: pre-Qin paragraphs first (label 0), then Ming paragraphs (label 1).
# Shapes come from the data instead of the hard-coded 2776 / 1388 / 2000.
n_qin_test = len(qin_test)
test_paras = qin_test + ming_test
X_test = np.zeros((len(test_paras), len(common_pairs)))
y_test = np.zeros((len(test_paras),))
y_test[n_qin_test:] = 1
for row, para in enumerate(test_paras):
    X_test[row] = bigrams_wrap(para)
y_pred = clf.predict(X_test)
# Rows of the matrix are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[1387,    1],
       [ 113, 1275]])