## Install libraries

In [None]:
!pip install spacy==2.3.8
!python -m spacy download en
!pip install en_core_web_sm==2.3.1
!pip install seaborn==0.11.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==2.3.8
  Downloading spacy-2.3.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<7.5.0,>=7.4.1 (from spacy==2.3.8)
  Downloading thinc-7.4.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
Collecting wasabi<1.1.0,>=0.4.0 (from spacy==2.3.8)
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting srsly<1.1.0,>=1.0.2 (from spacy==2.3.8)
  Downloading srsly-1.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.8/209.8 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catalogue<1.

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import base64
import string
import re
from collections import Counter
from nltk.corpus import stopwords
import spacy

import nltk
# Fetch the NLTK stop-word corpus so that stopwords.words('english') can be read below.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the imported corpus reader to a plain list of English
# stop words; cleanup_text() tests token membership against this list. A later cell
# re-imports `stopwords` from nltk.corpus, which turns the name back into the corpus reader.
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Read data

In [None]:
# Load both tab-separated tables; row 0 of each file supplies the column names.
df_p = pd.read_csv('DILIPositive.tsv', sep='\t', header=0)
df_n = pd.read_csv('DILINegative.tsv', sep='\t', header=0)

# Only the last expression of a cell is rendered, so the preview shown is the one of df_n.
df_p.head()
df_n.head()

Unnamed: 0,PubMedID,Title,Abstract
0,4733,Huntington's chorea. Changes in neurotransmitt...,Neurotransmitter-receptor binding sites for ap...
1,19702,Natural history of lactic acidosis after grand...,To define the time course of the metabolic aci...
2,25385,A manpower policy for primary health care,A National Academy of Sciences study of policy...
3,61558,Multiple sclerosis cerebrospinal fluid produce...,To investigate the myelinotoxicity of cerebrop...
4,61560,Need for alpha-fetoprotein assays,An alpha fetoprotein assay is useful in diagno...


In [None]:
# Combine title and abstract into the text column used for modelling, label each table
# with its class, and stack both tables into `df`.
DATA = 2     # 0: Title, 1: Abstract, 2: Title+Abstract

if DATA == 0:
    df_p['Title_Abstract'] = df_p['Title'].fillna('')
    df_n['Title_Abstract'] = df_n['Title'].fillna('')
elif DATA == 1:
    # Abstract-only mode drops rows without an abstract. .copy() detaches the filtered
    # frame so the column assignment does not write into a slice of the original frame.
    df_p = df_p[df_p['Abstract'].notnull()].copy()
    df_p['Title_Abstract'] = df_p['Abstract']

    df_n = df_n[df_n['Abstract'].notnull()].copy()
    df_n['Title_Abstract'] = df_n['Abstract']
else:
    # fillna('') on both halves: a missing title or a missing abstract contributes an empty
    # string instead of turning the whole concatenation into NaN (which would later fail in
    # cleanText(), since NaN is a float and has no .strip()).
    df_p['Title_Abstract'] = df_p['Title'].fillna('') + (' ' + df_p['Abstract']).fillna('')
    df_n['Title_Abstract'] = df_n['Title'].fillna('') + (' ' + df_n['Abstract']).fillna('')

df_p['Class'] = 'Positive'
df_n['Class'] = 'Negative'

# Select the two modelling columns by name rather than by position, so the result does not
# depend on how many columns the TSV files happen to carry.
df_p_ = df_p[['Title_Abstract', 'Class']]
df_n_ = df_n[['Title_Abstract', 'Class']]

frames = [df_p_, df_n_]
df = pd.concat(frames)

In [None]:
# spaCy model 'en_core_web_sm'; cleanup_text() runs it on every document.
nlp = spacy.load('en_core_web_sm')
# The ASCII punctuation characters as one string; cleanup_text() filters tokens against it.
punctuations = string.punctuation

# Define function to clean up text by removing personal pronouns, stopwords, and punctuation
def cleanup_text(docs, logging=False):
    """Lemmatise, lower-case and filter a collection of raw text documents.

    Each document is run through the module-level `nlp` pipeline with the 'parser' and
    'ner' components disabled. Tokens whose lemma is the '-PRON-' placeholder are dropped;
    the remaining lemmas are lower-cased and stripped, then filtered against the stop-word
    list and the `punctuations` string, and finally joined with single spaces.

    Args:
        docs: iterable of strings (list, Series, generator, ...).
        logging: when True, print a progress line at every 1000th document.

    Returns:
        pandas.Series with one cleaned string per input document, in input order.
    """
    # `stopwords` is rebound more than once in this notebook: one cell turns it into a plain
    # list, a later re-import turns it back into the NLTK corpus reader. Accept both so the
    # function works no matter which of those cells ran last.
    stop_words = stopwords.words('english') if hasattr(stopwords, 'words') else stopwords
    stop_words = set(stop_words)

    docs = list(docs)  # materialise once so len() also works for generators
    total = len(docs)

    texts = []
    for counter, doc in enumerate(docs, start=1):
        if logging and counter % 1000 == 0:
            print("Processed %d out of %d documents." % (counter, total))
        parsed = nlp(doc, disable=['parser', 'ner'])
        lemmas = (tok.lemma_.lower().strip() for tok in parsed if tok.lemma_ != '-PRON-')
        # `punctuations` is a string, so this is a substring test: it removes single
        # punctuation characters and also the empty strings left behind by strip().
        kept = [lemma for lemma in lemmas
                if lemma not in stop_words and lemma not in punctuations]
        texts.append(' '.join(kept))
    return pd.Series(texts)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS 
from nltk.corpus import stopwords
from spacy.lang.en import English

# NOTE(review): the object returned by this call is discarded, so apart from failing early
# when the 'en' model is not installed it looks redundant — TODO confirm and remove.
spacy.load('en')
# Pipeline built directly from the English language class; tokenizeText() calls it on
# every sample.
parser = English()

In [None]:
# Union of the NLTK English stop words and scikit-learn's ENGLISH_STOP_WORDS.
STOPLIST = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
# Tokens treated as pure punctuation by tokenizeText(): every ASCII punctuation character
# plus a few multi-character / typographic marks. Both the opening (“) and the closing (”)
# curly double quote are listed.
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]

In [None]:
# Tokenization
class CleanTextTransformer(TransformerMixin):
    """Stateless pipeline step that passes every raw document through cleanText()."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding cleanText(text) for each text in X."""
        return list(map(cleanText, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict: the step has nothing to tune."""
        return {}
    
def cleanText(text):
    """Trim surrounding whitespace, turn line breaks into spaces, and lower-case the text."""
    line_breaks_to_spaces = str.maketrans({"\n": " ", "\r": " "})
    return text.strip().translate(line_breaks_to_spaces).lower()

def tokenizeText(sample):
    """Tokenise `sample` with the module-level `parser` and return the filtered lemmas.

    Every token contributes its lower-cased, stripped lemma; a token whose lemma is the
    '-PRON-' placeholder contributes its lower-cased surface form instead. Entries found in
    STOPLIST or SYMBOLS are removed, and so are empty strings.

    Args:
        sample: text to tokenise.

    Returns:
        list of str, in token order.
    """
    lemmas = (
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in parser(sample)
    )
    # strip() turns a whitespace-only token into ''. Such an entry carries no content and
    # would otherwise become a vocabulary item of its own in the downstream vectorisers.
    return [
        lemma for lemma in lemmas
        if lemma and lemma not in STOPLIST and lemma not in SYMBOLS
    ]

## Word2vec

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec

class GensimWord2VecVectorizer(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that embeds tokenised documents with gensim Word2Vec.

    fit() trains a Word2Vec model on the token lists; transform() represents each document
    by the mean of the vectors of its in-vocabulary tokens (an all-zero vector when none of
    its tokens is in the vocabulary).

    The constructor arguments are stored unchanged and forwarded to gensim's Word2Vec in
    fit(). `size` and `iter` are forwarded under the names `vector_size` and `epochs`.
    """

    def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None,
                 sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5,
                 ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                 trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False,
                 callbacks=(), max_final_vocab=None):
        self.size = size
        self.alpha = alpha
        self.window = window
        self.min_count = min_count
        self.max_vocab_size = max_vocab_size
        self.sample = sample
        self.seed = seed
        self.workers = workers
        self.min_alpha = min_alpha
        self.sg = sg
        self.hs = hs
        self.negative = negative
        self.ns_exponent = ns_exponent
        self.cbow_mean = cbow_mean
        self.hashfxn = hashfxn
        self.iter = iter
        self.null_word = null_word
        self.trim_rule = trim_rule
        self.sorted_vocab = sorted_vocab
        self.batch_words = batch_words
        self.compute_loss = compute_loss
        self.callbacks = callbacks
        self.max_final_vocab = max_final_vocab

    def fit(self, X, y=None):
        """Train the Word2Vec model on X, an iterable of token lists; `y` is ignored.

        Returns:
            self, with the trained model stored in `self.model_`.
        """
        self.model_ = Word2Vec(
            sentences=X, corpus_file=None,
            vector_size=self.size, alpha=self.alpha, window=self.window, min_count=self.min_count,
            max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed,
            workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs,
            negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
            hashfxn=self.hashfxn, epochs=self.iter, null_word=self.null_word,
            trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words,
            compute_loss=self.compute_loss, callbacks=self.callbacks,
            max_final_vocab=self.max_final_vocab)
        return self

    def transform(self, X):
        """Return one mean embedding per document in X.

        Args:
            X: iterable of token lists.

        Returns:
            float32 array of shape (number of documents, self.size).
        """
        embeddings = [self._get_embedding(words) for words in X]
        if not embeddings:
            # Keep the column dimension for empty input so the result can still be stacked
            # next to other feature blocks.
            return np.empty((0, self.size), dtype=np.float32)
        return np.array(embeddings)

    def _get_embedding(self, words):
        """Average the vectors of the tokens in `words` that the trained model knows."""
        vocab = self.model_.wv.key_to_index  # gensim >= 4.0.0
        valid_words = [word for word in words if word in vocab]
        if not valid_words:
            # float32 like the mean vectors below, so the dtype of transform()'s output does
            # not depend on whether some document has no in-vocabulary token.
            return np.zeros(self.size, dtype=np.float32)
        # Indexing the keyed vectors with a list of keys yields one row per key.
        return np.mean(self.model_.wv[valid_words], axis=0)

## Training and test

In [None]:
# Data split
from sklearn.model_selection import train_test_split
from sklearn import feature_selection
from sklearn.decomposition import PCA
from scipy import sparse

# Word2Vec embedder (sg=1 selects skip-gram in gensim); its per-document mean vectors are
# appended to the TF-IDF features below.
gensim_word2vec_tr = GensimWord2VecVectorizer(size=200, min_count=5, sg=1, alpha=0.025, iter=10)

x_df = df['Title_Abstract'].tolist()
y_df = df['Class'].tolist()

# The token lists fed to Word2Vec do not depend on the split, so build them once here
# instead of re-tokenising every document in each repetition.
tokens_df = [tokenizeText(cleanText(text)) for text in x_df]

# Tokenization
vectorizer = TfidfVectorizer(tokenizer=tokenizeText, ngram_range=(1,1)) # TfidfVectorizer > CountVectorizer

# Pipeline
clf = LinearSVC(C=100)

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])
pipe_FS = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
pipe_FS_clf = Pipeline([('clf', clf)])
pipe_w2v = Pipeline([('w2v', gensim_word2vec_tr)])

REPEAT = 30  # number of stratified random splits; repetition i uses random_state=i
acc_iter = []
f1_iter = []
recall_iter = []
precision_iter = []

for i in range(REPEAT):
    # Splitting the token lists in the same call keeps them aligned with the raw texts and
    # the labels; the partition itself is determined by random_state and stratify only.
    x_train, x_test, clean_train, clean_test, y_train, y_test = train_test_split(
        x_df, tokens_df, y_df, test_size=0.33, random_state=i, stratify=y_df)

    # TF-IDF: vocabulary and idf weights come from the training texts only.
    # fit_transform avoids tokenising the training texts a second time.
    x_train_FS = pipe_FS.fit_transform(x_train)
    x_test_FS = pipe_FS.transform(x_test)

    # word2vec: trained on the training token lists only
    pipe_w2v.fit(clean_train)
    clean_train_w2v = pipe_w2v.transform(clean_train)
    clean_test_w2v = pipe_w2v.transform(clean_test)

    # combined: append the embedding columns to the sparse TF-IDF matrix directly, without
    # materialising a dense documents-by-vocabulary array first
    combined_train_sparse = sparse.hstack(
        [x_train_FS, sparse.csr_matrix(clean_train_w2v)], format='csr')
    combined_test_sparse = sparse.hstack(
        [x_test_FS, sparse.csr_matrix(clean_test_w2v)], format='csr')

    pipe_FS_clf.fit(combined_train_sparse, y_train)
    preds = pipe_FS_clf.predict(combined_test_sparse)

    accuracy = accuracy_score(y_test, preds)
    print("accuracy:", accuracy)
    acc_iter.append(accuracy)

    # 'Positive' is the class of interest. The scikit-learn scorers compute the same ratios
    # as tp/(tp+fp), tp/(tp+fn) and their harmonic mean, but return 0 with a warning instead
    # of NaN when a denominator is zero.
    precision = precision_score(y_test, preds, pos_label='Positive')
    recall = recall_score(y_test, preds, pos_label='Positive')
    f1 = f1_score(y_test, preds, pos_label='Positive')

    precision_iter.append(precision)
    recall_iter.append(recall)
    f1_iter.append(f1)

In [None]:
# Per-repetition scores, one list per output line: accuracy, precision, recall, F1.
for scores in (acc_iter, precision_iter, recall_iter, f1_iter):
    print(scores)