In [5]:
import numpy as np
import os
import csv
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import gensim
import multiprocessing
# Number of CPU cores reported by the OS; used as `workers` for the Doc2Vec models.
cores = multiprocessing.cpu_count()

# Hyper-parameter grid for the GridSearchCV over svm.SVC (the search itself is
# currently disabled inside SVM()).
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001, 'auto'],
    kernel=['poly', 'linear', 'rbf', 'sigmoid'],
)
def load(train_root="../data/Train_Textual/",
         val_root="../data/Validation/",
         test_root="../data/Test_Intuitive/"):
    """Read and preprocess every document of the three corpora.

    Each file is read in full and passed through ``preprocess``. Documents
    from ``train_root`` and ``test_root`` are both collected as labelled
    training data; documents from ``val_root`` are collected without labels.

    A document's label is the first character of its file name. For files
    under ``test_root`` a leading ``'N'`` is recorded as ``'U'``.

    Parameters
    ----------
    train_root, val_root, test_root : str
        Directories to read. The defaults are the paths the notebook has
        always used, resolved relative to the current working directory.

    Returns
    -------
    texts : list of str
        Every preprocessed document, in the order train, test, validation.
    train_texts : list of str
        The train and test documents (the first ``len(train_texts)`` items
        of ``texts``).
    val_texts : list of str
        The validation documents (the remaining items of ``texts``).
    train_labels : list of str
        One label per entry of ``train_texts``.
    """
    def _read(root, filename):
        # `with` closes the handle deterministically instead of leaving it
        # to the garbage collector.
        with open(os.path.join(root, filename)) as handle:
            return preprocess(handle.read())

    texts, train_texts, val_texts, train_labels = [], [], [], []

    # NOTE(review): os.listdir() returns entries in arbitrary order. SVM()
    # lists the validation directory again when writing its CSV and relies on
    # getting the same order as here, so the order is deliberately not changed.
    for filename in os.listdir(train_root):
        text = _read(train_root, filename)
        texts.append(text)
        train_texts.append(text)
        train_labels.append(filename[0])

    for filename in os.listdir(test_root):
        text = _read(test_root, filename)
        texts.append(text)
        train_texts.append(text)
        # Files whose name starts with 'N' are folded into the 'U' class.
        train_labels.append('U' if filename[0] == 'N' else filename[0])

    for filename in os.listdir(val_root):
        text = _read(val_root, filename)
        texts.append(text)
        val_texts.append(text)

    return texts, train_texts, val_texts, train_labels

def preprocess(text):
    """Lower-case, tokenize, filter and lemmatize ``text``.

    Steps, in order:
      1. lower-case the text and split it with ``word_tokenize``;
      2. POS-tag the tokens with ``pos_tag``;
      3. drop tokens that are English stop words or not purely alphabetic;
      4. lemmatize the rest with ``WordNetLemmatizer``, choosing the WordNet
         part of speech from the first letter of the tag
         (``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb, else noun).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        ``str()`` of the list of lemmas, e.g. ``"['cat', 'sit']"``. Note that
        this is the printed form of the list, not the list itself.
    """
    # Build the stop-word set once per call: the original asked the corpus
    # reader for the full list again for every token and scanned it linearly.
    stop_words = set(stopwords.words('english'))

    # Any tag whose first letter is not J/V/R falls back to NOUN.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map.update({'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

    lemmatizer = WordNetLemmatizer()
    final_tokens = [
        lemmatizer.lemmatize(token, tag_map[tag[0]])
        for token, tag in pos_tag(word_tokenize(text.lower()))
        if token.isalpha() and token not in stop_words
    ]
    return str(final_tokens)

# Disabled Naive Bayes baseline. The definition below sits inside a bare
# triple-quoted string, so it is evaluated as a string expression and NB() is
# never actually defined.
"""
def NB(train_data_Tfidf, val_data_Tfidf, train_labels):
    naive = naive_bayes.MultinomialNB()
    naive.fit(train_data_Tfidf, train_labels)
    pred = naive.predict(val_data_Tfidf)
    count = 0
    with open('nb.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Obesity'])
        val_root = "../data/Validation/" 
        for filename in os.listdir(val_root):
            writer.writerow([filename, pred[count]])
            count += 1
"""     
def get_word_vector(model, content, size=50):
    """Average the word vectors of ``content`` into a single row vector.

    Parameters
    ----------
    model :
        Object exposing word vectors through ``model.wv[word]`` and raising
        ``KeyError`` for unknown words.
    content : sequence of str
        Tokens of one document. The first element is skipped.
        NOTE(review): presumably element 0 is a label or header rather than a
        word - confirm against the caller.
    size : int, optional
        Dimensionality of the word vectors (default 50, the value that was
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the vectors of the
        words found in the model, or all zeros when none was found.
    """
    vec = np.zeros((1, size))
    count = 0
    for word in content[1:]:
        try:
            word_vec = model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum, so it
            # must not be counted in the denominator either.
            continue
        vec += word_vec.reshape((1, size))
        count += 1
    # Guard against 0/0 (NaN) when no word was found.
    if count:
        vec /= count
    return vec
def SVM(train_data_Tfidf, val_data_Tfidf, train_labels):
    """Fit an SVC, print training-set metrics and write validation predictions.

    Parameters
    ----------
    train_data_Tfidf : array-like
        Feature matrix of the training documents. Despite the name, the
        caller in this file (``training``) passes Doc2Vec document vectors.
    val_data_Tfidf : array-like
        Feature matrix of the validation documents, one row per file.
    train_labels : array-like
        Encoded class labels aligned with ``train_data_Tfidf``.
        NOTE(review): the metric calls use scikit-learn's default averaging,
        which presumably requires binary labels - confirm.

    Side effects
    ------------
    Prints precision, recall and F1 measured on the *training* data, and
    writes ``svm_doc2vec.csv`` (columns ``Filename``, ``Obesity``) to the
    current working directory.
    """
    # Hyper-parameters below were presumably chosen with this (disabled)
    # cross-validated grid search:
    #   SVM = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2, scoring='f1')
    #   SVM.fit(train_data_Tfidf, train_labels)
    #   print(SVM.best_estimator_)
    #   print(SVM.best_score_)
    classifier = svm.SVC(C=0.1, kernel='poly', gamma=0.1)
    classifier.fit(train_data_Tfidf, train_labels)

    # Evaluate on the training data itself. These numbers only show how well
    # the model fits the data it has seen; they are not a generalization
    # estimate.
    train_pred = classifier.predict(train_data_Tfidf)
    # scikit-learn metrics take (y_true, y_pred); with the arguments swapped
    # the printed precision and recall are exchanged.
    print("SVM precision:", precision_score(train_labels, train_pred))
    print("SVM recall:", recall_score(train_labels, train_pred))
    print("SVM f1 score:", f1_score(train_labels, train_pred))

    # Write the validation predictions.
    val_pred = classifier.predict(val_data_Tfidf)
    val_root = "../data/Validation/"
    # newline='' is what the csv module requires so that no blank rows appear
    # on platforms that translate line endings.
    with open('svm_doc2vec.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Obesity'])
        # NOTE(review): assumes os.listdir() yields the validation files in
        # the same order as when the feature rows were built - confirm.
        for index, filename in enumerate(os.listdir(val_root)):
            writer.writerow([filename, val_pred[index]])

def _as_tokens(doc):
    """Return ``doc`` as a list of word tokens.

    ``preprocess`` returns the printed form of a token list
    (``"['a', 'b']"``); that form is parsed back into the list. Any other
    string is split on whitespace, and non-string input is copied to a list.
    """
    import ast  # local import keeps this block self-contained

    if not isinstance(doc, str):
        return list(doc)
    try:
        parsed = ast.literal_eval(doc)
    except (ValueError, SyntaxError):
        return doc.split()
    return list(parsed) if isinstance(parsed, (list, tuple)) else doc.split()


def training(texts, train_texts, val_texts, train_labels):
    """Build Doc2Vec document vectors and hand them to ``SVM``.

    Two Doc2Vec models (distributed memory with mean, and DBOW) are loaded
    from ``./doc2vec_model_dmm`` / ``./doc2vec_model_dbow`` when both files
    exist, otherwise trained on ``texts`` and saved there. Their document
    vectors are concatenated, split into training and validation rows and
    passed to ``SVM`` together with the label-encoded ``train_labels``.

    Parameters
    ----------
    texts : list of str
        All documents; must be ``train_texts`` followed by ``val_texts``,
        which is the order ``load`` produces.
    train_texts, val_texts : list of str
        Only their lengths are used, to locate the split inside ``texts``.
    train_labels : list
        One label per training document.
    """
    train_labels = LabelEncoder().fit_transform(train_labels)

    # An earlier TF-IDF feature variant, kept for reference:
    #   Tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
    #   Tfidf_vect.fit(texts)
    #   train_data_Tfidf = Tfidf_vect.transform(train_texts)
    #   val_data_Tfidf = Tfidf_vect.transform(val_texts)

    if os.path.exists("./doc2vec_model_dmm") and os.path.exists("./doc2vec_model_dbow"):
        # NOTE(review): cached models are reused without checking that they
        # were trained on the current `texts`. Delete both files to retrain,
        # in particular models saved before documents were tokenized into
        # words.
        model_dmm = gensim.models.Doc2Vec.load('./doc2vec_model_dmm')
        model_dbow = gensim.models.Doc2Vec.load('./doc2vec_model_dbow')
    else:
        # TaggedDocument expects a list of word tokens. Passing the raw
        # string makes Doc2Vec iterate it character by character.
        tagged_docs = [TaggedDocument(_as_tokens(doc), [i]) for i, doc in enumerate(texts)]

        # Let gensim run all passes and decay the learning rate itself.
        # Calling train() in a loop with a hand-decremented alpha is
        # error-prone: for the DBOW model (default alpha 0.025) thirty steps
        # of 0.002 drove the rate negative.
        # min_alpha=0.007 is the rate the former 30th manual pass used.
        model_dmm = gensim.models.Doc2Vec(
            tagged_docs, dm=1, dm_mean=1, vector_size=300, window=10,
            min_count=1, negative=5, workers=cores,
            alpha=0.065, min_alpha=0.007, epochs=30)
        model_dmm.save("./doc2vec_model_dmm")

        model_dbow = gensim.models.Doc2Vec(
            tagged_docs, dm=0, vector_size=300, min_count=1, negative=5,
            hs=0, sample=0, workers=cores, epochs=30)
        model_dbow.save("./doc2vec_model_dbow")

    # ConcatenatedDoc2Vec comes from gensim's test helpers; its `dv[i]`
    # joins the vectors of document tag `i` from both models.
    model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

    # Documents were tagged 0..len(texts)-1 in `texts` order, so the first
    # len(train_texts) tags are training rows and the rest validation rows
    # (this replaces the hard-coded 800 / 849 boundaries).
    n_train = len(train_texts)
    n_total = n_train + len(val_texts)
    x_train = np.array([model.dv[idx] for idx in range(n_train)])
    val = np.array([model.dv[idx] for idx in range(n_train, n_total)])

    SVM(x_train, val, train_labels)



In [3]:

# Read and preprocess all corpora. load() reads the data directories through
# paths relative to the current working directory.
texts, train_texts, val_texts, train_labels = load()
    


In [6]:
# Build the Doc2Vec features and run the SVM. The metrics printed by SVM() are
# computed on the same training data the classifier was fitted on.
training(texts, train_texts, val_texts, train_labels)

SVM precision: 1.0
SVM recall: 1.0
SVM f1 score: 1.0
