In [6]:
import sys

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# The parent directory must be on sys.path before the project-local imports below.
sys.path.append("../")
from src.eval_method import CrossValidation
from src.experiment import Experiment
from src.metrics import Accuracy,Precision,Recall,F1Score
from src.preprocess import BasicTextPreprocess,BasicTextFeatureExtraction

In [11]:
# Load the extended promed dataset. The loader is expected to return a DataFrame
# whose first column holds the documents and whose second column holds the labels.
from utils.data_util import get_raw_extended_promed_df

data_df_extended = get_raw_extended_promed_df()
# Tag the frame with a dataset name (plain Python attribute on the object).
# NOTE(review): presumably read downstream to label results — confirm against Experiment.
data_df_extended.name = "extended_promed"

# Row count of the loaded frame, displayed as the cell output.
len(data_df_extended)

found 3430 files
Remain 3377 files after filter_out with alerting
found files: 3377
found 3430 files
Remain 53 files after filter_in with alerting
found files: 53
found 3862 files
found files: 3862


7292

In [12]:
# Class distribution of the `labels` column (number of documents per label).
data_df_extended["labels"].value_counts()

0    3862
1    3377
2      53
Name: labels, dtype: int64

In [None]:
# --- Preprocessing, feature extraction and evaluation protocol ---------------
bp = BasicTextPreprocess()

# Feature configurations to evaluate.
# NOTE(review): nested lists presumably request a combination of the listed
# features — confirm against BasicTextFeatureExtraction.
bfe = BasicTextFeatureExtraction(["tfidf","bow","bigram",["tfidf","bigram"],["bow","bigram"],["tfidf","trigram"],["bow","trigram"]])

# Repeated k-fold settings; the fixed random_state keeps the splits reproducible.
cv = CrossValidation(n_splits=10, n_repeats=10, random_state=12345)

# --- Metrics -----------------------------------------------------------------
acc = Accuracy()
pre = Precision()
rec = Recall()
f1 = F1Score()

# --- Base classifiers --------------------------------------------------------
# Each estimator carries a `.name` attribute used as its display name.
svm = LinearSVC(random_state=0, tol=1e-5,max_iter=5000)
svm.name = "svm"
dtree = DecisionTreeClassifier(random_state=0,criterion='entropy')
dtree.name="dtree"
bnb = BernoulliNB()
bnb.name = "BernoulliNB"
gnb = GaussianNB()
gnb.name = "GaussianNB"
mnb = MultinomialNB()
mnb.name = "MultinomialNB"
rf = RandomForestClassifier(max_depth=2, random_state=0)
rf.name = "RandomForest"

# --- chi2 feature-selection variants -----------------------------------------
# Template selector: keeps the 9000 features with the highest chi2 score.
ch2 = SelectKBest(chi2, k=9000)


def _with_chi2(classifier, selector=ch2):
    """Build a `selector -> classifier` pipeline named "<classifier.name>+chi2".

    Pipeline does not clone its steps, so passing the same estimator object to
    several pipelines (or using it both standalone and inside a pipeline) makes
    them share fitted state: fitting one silently overwrites what the others
    see. Both steps are therefore cloned, giving every pipeline its own
    unfitted copies with identical hyper-parameters.

    Parameters
    ----------
    classifier : estimator with a ``name`` attribute
        Base classifier to wrap; it is cloned, not modified.
    selector : transformer, default ``ch2``
        Feature selector template; it is cloned, not modified.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``'sel'`` and ``'cls'`` and a ``name``
        attribute set to ``classifier.name + "+chi2"``.
    """
    pipeline = Pipeline([('sel', clone(selector)), ('cls', clone(classifier))])
    pipeline.name = classifier.name + "+chi2"
    return pipeline


svm_ch2 = _with_chi2(svm)      # "svm+chi2"
dtree_ch2 = _with_chi2(dtree)  # "dtree+chi2"
bnb_ch2 = _with_chi2(bnb)      # "BernoulliNB+chi2"
gnb_ch2 = _with_chi2(gnb)      # "GaussianNB+chi2"
mnb_ch2 = _with_chi2(mnb)      # "MultinomialNB+chi2"
rf_ch2 = _with_chi2(rf)        # "RandomForest+chi2"

# Backward-compatible alias: this pipeline wraps the random forest, not
# MultinomialNB, but was historically bound to the misleading name `mnb_rf`.
mnb_rf = rf_ch2

In [None]:
# Collect the experiment inputs first so the constructor call stays readable.
# NOTE(review): `mnb_ch2`, `rf` and `mnb_rf` are defined in the setup cell but are
# not part of this run — confirm that leaving them out is intentional.
candidate_models = [svm,dtree,bnb,gnb,mnb,svm_ch2,dtree_ch2,bnb_ch2,gnb_ch2]
reported_metrics = [acc,pre,rec,f1]

# NOTE(review): the recorded label counts include a class 2, while `labels`
# lists only 0 and 1 — confirm how Experiment treats the remaining class.
exp = Experiment(
    data_df=data_df_extended,
    preprocessor=bp,
    extractor=bfe,
    eval_method=cv,
    models=candidate_models,
    metrics=reported_metrics,
    result_file="../promed_extended_result.csv",
    verbose=True,
    labels=[0,1],
)
exp.run()

In [10]:
# Sanity check on the frame's index: print labels in order and stop right
# after the first one greater than 10 (that label is still printed).
for row_label in data_df_extended.index:
    print(row_label)
    if row_label > 10:
        break

0
1
2
3
4
5
6
7
8
9
10
11
