In [6]:
import numpy as np
import os
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import precision_score, f1_score, recall_score
from gensim.models.word2vec import Word2Vec
import gensim
def load():
    train_root = "../data/Train_Textual/"
    test_root = "../data/Test_Intuitive/"
    texts, labels = [], []
    for filename in os.listdir(train_root):
        text = open(train_root+filename).read()
        texts.append(preprocess(text))
        labels.append(filename[0])
    for filename in os.listdir(test_root):
        text = open(test_root+filename).read()
        texts.append(preprocess(text))
        if filename[0] == 'N': labels.append('U')
        else: labels.append(filename[0])
    return texts, labels

def preprocess(text):
    # tokenize
    tokens = word_tokenize(text.lower())
    # get tag_map
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    final_tokens = []
    word_Lemmatized = WordNetLemmatizer()
    for token, tag in pos_tag(tokens):
        if token not in stopwords.words('english') and token.isalpha():
            token = word_Lemmatized.lemmatize(token, tag_map[tag[0]])
            final_tokens.append(token)
    return str(final_tokens)

def NB(train_data_Tfidf, train_label, test_data_Tfidf, test_label):
    naive = naive_bayes.MultinomialNB()
    naive.fit(train_data_Tfidf, train_label)

    pred = naive.predict(test_data_Tfidf)
    print("NB accuracy:", precision_score(pred, test_label))

    print("NB f1 score:", f1_score(pred, test_label))

def SVM(train_data_Tfidf, train_label, test_data_Tfidf, test_label):
    SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(train_data_Tfidf, train_label)

    pred = SVM.predict(test_data_Tfidf)
    print("SVM precision:", precision_score(pred, test_label))
    print("SVM recall:", recall_score(pred, test_label))
    print("SVM f1 score:", f1_score(pred, test_label))

def get_word_vector(model, content):

    
    # vec = np.zeros(2).reshape((1, 2))
    vec = np.zeros(50).reshape((1, 50))
    count = 0
    #words = remove_some(words)
    for word in content[1:]:
        try:
            count += 1
            # vec += model[word].reshape((1, 2))
            vec += model.wv[word].reshape((1, 50))
            # print(vec)
        except KeyError:
            continue
    vec /= count
    return vec

def training(texts, labels):
    train_data, test_data, train_label, test_label = texts[:400], texts[400:800], labels[:400], labels[400:800]
    Encoder = LabelEncoder()
    train_label = Encoder.fit_transform(train_label)
    test_label =  Encoder.fit_transform(test_label)
    """
    Tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
    Tfidf_vect.fit(texts)
    train_data_Tfidf = Tfidf_vect.transform(train_data)
    test_data_Tfidf = Tfidf_vect.transform(test_data)
    """
    x_train = []
    x_test = []
    if os.path.exists("./model"):
        model = Word2Vec.load('./model')
    else:
        model = Word2Vec(texts, min_count=1, vector_size=50)  # 訓練skip-gram模型
        model.save("./model")
    for doc in train_data:
        x_train.append(get_word_vector(model, doc))
    for doc in test_data:
        x_test.append(get_word_vector(model, doc))
    #NB(x_train, train_label, x_test, test_label)
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    x_train = x_train.squeeze()
    x_test = x_test.squeeze()
    SVM(x_train, train_label, x_test, test_label)




In [2]:

np.random.seed(500)
texts, labels = load()


In [7]:
training(texts, labels)

SVM precision: 0.78
SVM recall: 0.5652173913043478
SVM f1 score: 0.6554621848739495
