In [1]:
import os
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# Directory holding the PubMed 20k RCT files, given relative to the notebook's working directory.
base_path = '../data/PubMed_20k_RCT'
# NOTE(review): the variable is called "dev" but the joined file name is train.txt — confirm intent.
dev_path = os.path.join(base_path, "train.txt")

In [3]:
# Read the whole split into memory, one string per line (trailing newlines kept).
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding.
# NOTE(review): assumes the corpus file is UTF-8 (plain ASCII is a subset) — confirm.
with open(dev_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
# Last expression of the cell: preview the first ten raw lines.
lines[:10]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n',
 'METHODS\tA total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .\n',
 'METHODS\tOutcome measures included pain reduction and improvement in function scores and systemic inflammation markers .\n',
 'METHODS\tPain was assessed using the visual analog pain scale ( 0-100 mm ) .\n',
 'METHODS\tSecondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .\n',
 'METHODS\tSerum levels of interleukin 1 ( IL-1 ) , IL-6 , tumor necrosis factor ( TNF )

In [4]:
# preprocess dataset
def process_text(text):
    """Normalise a sentence: lower-case it and delete ASCII punctuation.

    Every character listed in ``string.punctuation`` is removed (not
    replaced by a space), so tokens joined by punctuation are fused,
    e.g. ``"mg/day"`` becomes ``"mgday"``. Whitespace, including any
    trailing newline, is left untouched.
    """
    # TODO maybe add stemming and lemmatisation?
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return text.lower().translate(strip_punctuation)

labels = []
texts = []
tokens = []  # never filled in this cell; kept in case other cells reference the name
for line in lines:
    # Skip abstract-id header lines ("###<id>") and whitespace-only separator lines.
    if line.startswith("###") or line.isspace():
        continue
    # Split on the FIRST tab only: the label is everything before it and the
    # sentence is everything after it. An unbounded split would silently drop
    # whatever follows a second tab inside the sentence.
    fields = line.split("\t", 1)
    labels.append(fields[0])
    texts.append(process_text(fields[1]))


# One row per sentence: the raw section label and the normalised sentence text.
df = pd.DataFrame({"label": labels, "text": texts})
print(df.shape)

(180040, 2)


In [6]:
# Learn the label vocabulary, then store the encoded labels next to the original strings.
label_encoder = LabelEncoder().fit(df["label"])
df["encoded_label"] = label_encoder.transform(df["label"])

# Hold out 10% of the sentences for validation; the fixed seed makes the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df["text"],
    df["encoded_label"],
    test_size=0.1,
    random_state=42,
)

In [7]:
def print_metrics(pred_test, y_test, pred_train, y_train, model_name=None):
    """Print accuracy, a confusion matrix and a classification report.

    Parameters
    ----------
    pred_test, y_test : array-like
        Predicted and true labels for the held-out data.
    pred_train, y_train : array-like
        Predicted and true labels for the training data.
    model_name : str, optional
        Name shown in the section header. When omitted the header names no
        model, so the output is never labelled with the wrong classifier.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Coerce to arrays so the comparison is element-wise even for plain
    # Python lists (list == list yields a single bool, not a per-item mask).
    test_accuracy = np.mean(np.asarray(pred_test) == np.asarray(y_test))
    train_accuracy = np.mean(np.asarray(pred_train) == np.asarray(y_train))
    print("test accuracy", str(test_accuracy))
    print("train accuracy", str(train_accuracy))

    if model_name is None:
        header = "\n Metrics and Confusion \n"
    else:
        header = "\n Metrics and Confusion for " + str(model_name) + " \n"
    print(header)
    # The matrix and report describe the held-out data only.
    print(metrics.confusion_matrix(y_test, pred_test))
    print(metrics.classification_report(y_test, pred_test))

In [8]:
# TF-IDF features -> keep the 2000 terms with the highest chi2 score -> random forest.
pipeline = Pipeline([
    ("vect", TfidfVectorizer(stop_words="english")),
    ("chi", SelectKBest(chi2, k=2000)),
    # Seeded so that re-running the notebook rebuilds the same forest and
    # reports the same metrics (same seed value as the train/validation split).
    ("clf", RandomForestClassifier(random_state=42)),
])

pipeline.fit(x_train, y_train)
pred_train = pipeline.predict(x_train)
pred_val = pipeline.predict(x_val)
# The validation predictions are passed in the "test" slot of print_metrics.
print_metrics(pred_val, y_val, pred_train, y_train)

test accuracy 0.7416685181070873
train accuracy 0.9985805623441705

 Metrics and Confusion for SVM 

[[1017  424  381  218  163]
 [ 203 1578  304   36  603]
 [  68   84 5345   36  398]
 [ 257  173  219  646   60]
 [  55  219  746    4 4767]]
              precision    recall  f1-score   support

           0       0.64      0.46      0.53      2203
           1       0.64      0.58      0.61      2724
           2       0.76      0.90      0.83      5931
           3       0.69      0.48      0.56      1355
           4       0.80      0.82      0.81      5791

    accuracy                           0.74     18004
   macro avg       0.70      0.65      0.67     18004
weighted avg       0.73      0.74      0.73     18004

