In [None]:
import os
import sys

# Must run before the beta_nlp imports: makes the in-repo package importable.
sys.path.append("../")

from sklearn.base import clone
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from beta_nlp.utils.data_util import biocaster2df
from beta_nlp.eval_method import CrossValidation,TrainDevTest
from beta_nlp.experiment import Experiment
from beta_nlp.metrics import Accuracy,Precision,Recall,F1Score
from beta_nlp.preprocess import BasicTextPreprocess,BasicTextFeatureExtraction

In [None]:
# Location of the BioCaster corpus XML. The default is the path used by the
# original author; set the BIOCASTER_XML environment variable to point at a
# local copy when running on another machine.
data_file = os.environ.get(
    "BIOCASTER_XML",
    "/home/zm324/workspace/doc_cls/datasets/biocaster/BioCaster.3.xml",
)

# Parse the corpus with the project helper (imported in the first cell).
data_df = biocaster2df(data_file)

# Record the corpus of origin in a constant "source" column.
data_df["source"] = "Biocaster"

In [None]:
# --- Text preprocessing and feature extraction -------------------------------
bp = BasicTextPreprocess()

# Only TF-IDF features are used in this run. Feature sets tried previously:
# bfe = BasicTextFeatureExtraction(["tfidf","bow","bigram",["tfidf","bigram"],["bow","bigram"],["tfidf","trigram"],["bow","trigram"],["tfidf","bigram","trigram"]])
bfe = BasicTextFeatureExtraction(["tfidf"])

# --- Evaluation protocol -----------------------------------------------------
# Repeated cross-validation: 10 splits, 10 repeats, fixed random_state.
cv = CrossValidation(n_splits=10, n_repeats=10, random_state=12345)
# Alternative protocol (unused here):
# tdt = TrainDevTest()

# --- Metrics -----------------------------------------------------------------
# NOTE(review): the meaning of k=4 is defined by beta_nlp.metrics -- confirm.
acc = Accuracy(k=4)
pre = Precision(k=4)
rec = Recall(k=4)
f1 = F1Score(k=4)

# --- Base classifiers --------------------------------------------------------
# Each estimator carries a `.name` attribute that is set here for the
# experiment code to use (presumably for labelling results -- confirm).
svm = LinearSVC(random_state=0, tol=1e-5, max_iter=5000)
svm.name = "svm"
dtree = DecisionTreeClassifier(random_state=0, criterion='entropy')
dtree.name = "dtree"
bnb = BernoulliNB()
bnb.name = "BernoulliNB"
# NOTE(review): scikit-learn's GaussianNB needs dense input; if the extractor
# yields sparse matrices this model will fail at fit time -- confirm.
gnb = GaussianNB()
gnb.name = "GaussianNB"
mnb = MultinomialNB()
mnb.name = "MultinomialNB"
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.name = "RandomForest"

# --- chi2 feature selection variants -----------------------------------------
# Number of top-scoring features kept by the chi2 selector.
CHI2_TOP_K = 9000

# Unfitted template selector; every pipeline below gets its own clone of it.
ch2 = SelectKBest(chi2, k=CHI2_TOP_K)


def _with_chi2(classifier):
    """Build a chi2-selection + classifier pipeline for ``classifier``.

    Pipeline.fit fits its steps in place, so sharing one selector (or one
    classifier) object between several models lets the fit of one model
    overwrite the fitted state of another. Both steps are therefore cloned:
    same hyper-parameters, independent fitted state.

    Args:
        classifier: a scikit-learn estimator with a ``name`` attribute.

    Returns:
        A Pipeline with steps ``sel`` and ``cls`` whose ``name`` attribute is
        ``classifier.name + "+chi2"``.
    """
    classifier_copy = clone(classifier)
    # clone() copies constructor parameters only, so re-attach the label.
    classifier_copy.name = classifier.name
    pipeline = Pipeline([('sel', clone(ch2)), ('cls', classifier_copy)])
    pipeline.name = classifier.name + "+chi2"
    return pipeline


svm_ch2 = _with_chi2(svm)      # "svm+chi2"
dtree_ch2 = _with_chi2(dtree)  # "dtree+chi2"
bnb_ch2 = _with_chi2(bnb)      # "BernoulliNB+chi2"
gnb_ch2 = _with_chi2(gnb)      # "GaussianNB+chi2"
mnb_ch2 = _with_chi2(mnb)      # "MultinomialNB+chi2"
rf_ch2 = _with_chi2(rf)        # "RandomForest+chi2"

# Backward-compatible alias: this pipeline was originally bound to the
# misleading name `mnb_rf` although it wraps the random forest.
mnb_rf = rf_ch2

In [None]:
# Gather the experiment settings first so they can be inspected before running.
run_settings = {
    "data_df": data_df,
    "preprocessor": bp,
    "extractor": bfe,
    "eval_method": cv,
    # Only MultinomialNB is evaluated in this run. Full sweep used previously:
    # models=[svm,dtree,bnb,gnb,mnb,svm_ch2,dtree_ch2,bnb_ch2,gnb_ch2,mnb_ch2]
    "models": [mnb],
    "metrics": [acc, pre, rec, f1],
    # NOTE(review): the file name mentions seed 2021, but no seed of 2021 is
    # set in this cell -- confirm which seed these results correspond to.
    "result_file": "../results/traditional_biocaster_result_seed_2021.csv",
    "verbose": True,
}

# `exp` is bound to whatever run() returns, not to the Experiment instance.
exp = Experiment(**run_settings).run()