In [8]:
import numpy as np
import os
import csv
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV
from pprint import pprint
param_grid = {'C': [0.1,1, 10, 100], 
                'gamma': [1,0.1,0.01,0.001, 'auto'],
                'kernel': ['poly', 'linear']}
def load():
    texts, train_texts, val_texts, train_labels = [], [], [], []
    train_root = "../data/Train_Textual/"
    val_root = "../data/Validation/"
    test_root = "../data/Test_Intuitive/"
    for filename in os.listdir(train_root):
        text = open(train_root+filename).read()
        text = preprocess(text)
        texts.append(text)
        train_texts.append(text)
        if filename[0] == 'U' or filename[0] == 'N':
            train_labels.append(0)
        else:
            train_labels.append(1)
    for filename in os.listdir(test_root):
        text = open(test_root+filename).read()
        text = preprocess(text)
        texts.append(text)
        train_texts.append(text) 
        if filename[0] == 'U' or filename[0] == 'N':
            train_labels.append(0)
        else:
            train_labels.append(1)
    for filename in os.listdir(val_root):
        text = open(val_root+filename).read()
        text = preprocess(text)
        texts.append(text)
        val_texts.append(text)
    #print(train_labels)
    return texts, train_texts, val_texts, train_labels

def preprocess(text):
    # tokenize
    tokens = word_tokenize(text.lower())
    # get tag_map
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    final_tokens = []
    word_Lemmatized = WordNetLemmatizer()
    for token, tag in pos_tag(tokens):
        if token not in stopwords.words('english') and token.isalpha():
            token = word_Lemmatized.lemmatize(token, tag_map[tag[0]])
            final_tokens.append(token)
    return str(final_tokens)

"""
def NB(train_data_Tfidf, val_data_Tfidf, train_labels):
    naive = naive_bayes.MultinomialNB()
    naive.fit(train_data_Tfidf, train_labels)
    pred = naive.predict(val_data_Tfidf)
    count = 0
    with open('nb.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Obesity'])
        val_root = "../data/Validation/" 
        for filename in os.listdir(val_root):
            writer.writerow([filename, pred[count]])
            count += 1
"""     

def SVM(train_data_Tfidf, val_data_Tfidf, train_labels):
    # Cross validation
    
    SVM = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2, scoring='f1')
    SVM.fit(train_data_Tfidf, train_labels)
    print(SVM.best_estimator_)
    print(SVM.best_score_)
    
    #SVM = svm.SVC(C=0.1, kernel='poly', degree=3, gamma=1)
    #SVM = svm.SVC(C=1, kernel='poly', degree=3, gamma='auto')
    #SVM.fit(train_data_Tfidf, train_labels)
    # 用自己test
    pred = SVM.predict(train_data_Tfidf)
    print("SVM precision:", precision_score(pred, train_labels))
    print("SVM recall:", recall_score(pred, train_labels))
    print("SVM f1 score:", f1_score(pred, train_labels))
    # val
    pred = SVM.predict(val_data_Tfidf)
    count = 0
    with open('svm_TfidfVectorizertuning.csv', 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Obesity'])
        val_root = "../data/Validation/" 
        for filename in os.listdir(val_root):
            writer.writerow([filename, pred[count]])
            count += 1
      
def training(texts, train_texts, val_texts, train_labels):
    #Encoder = LabelEncoder()
    #train_labels = Encoder.fit_transform(train_labels)
    Tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True)
    Tfidf_vect.fit(texts)
    train_data_Tfidf = Tfidf_vect.transform(train_texts)
    val_data_Tfidf = Tfidf_vect.transform(val_texts)
    print(Tfidf_vect.get_feature_names_out())
    #NB(train_data_Tfidf, val_data_Tfidf, train_labels)
    SVM(train_data_Tfidf, val_data_Tfidf, train_labels)




In [9]:

texts, train_texts, val_texts, train_labels = load()


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [10]:
training(texts, train_texts, val_texts, train_labels)

['aa' 'aaa' 'aaf' ... 'zydis' 'zyprexa' 'zyrtec']
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.2s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.3s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   1.3s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.1s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.2s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   1.1s
[CV] END ......................C=0.1, gamma=0.1, kernel=poly; total time=   1.