In [1]:
import pandas as pd
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
import os
import re

datadir = './abstrct/AbstRCT_corpus/data/train/neoplasm_train'
testdir = './abstrct/AbstRCT_corpus/data/test/neoplasm_test'

train_raw_files = [ f for f in os.listdir(datadir) if f.endswith('.txt') ]
train_ann_files = [ f for f in os.listdir(datadir) if f.endswith('.ann') ]

test_raw_files = [ f for f in os.listdir(testdir) if f.endswith('.txt') ]
test_ann_files = [ f for f in os.listdir(testdir) if f.endswith('.ann') ]


In [4]:
def extract_annotated(fpath):
    res = []
    with open(fpath, 'r') as infile:
        for row in infile.readlines():
            row = row.strip()
            if re.match('^T\d', row):
                name, annotation, text = row.split('\t')
                fname = os.path.basename(fpath).replace('.ann', '')    
                name = f'{fname}-{name}'
                is_arg, start, end = annotation.split()
                start = int(start)
                end = int(end)
                res.append((name, is_arg, start, end, text))
    return res

In [5]:
from collections import defaultdict

train_ann_dict = defaultdict(list)
test_ann_dict = defaultdict(list)

for f in train_ann_files:
    fname = f.replace('.ann', '')
    filepath = os.path.join(datadir, f)
    train_ann_dict[fname] = extract_annotated(filepath)
    
for f in test_ann_files:
    fname = f.replace('.ann', '')
    filepath = os.path.join(testdir, f)
    test_ann_dict[fname] = extract_annotated(filepath)

In [44]:
len(test_ann_dict)

100

In [45]:
len(train_ann_dict)

350

In [6]:
raw_dict = {}
for f in train_raw_files:
    with open(os.path.join(datadir, f), 'r') as infile:
        raw_dict[f.replace('.txt', '')] = infile.read().strip()
        
test_raw_dict = {}
for f in test_raw_files:
    with open(os.path.join(testdir, f), 'r') as infile:
        test_raw_dict[f.replace('.txt', '')] = infile.read().strip()

In [7]:
len(test_raw_dict)

100

In [8]:
len(raw_dict)

350

In [10]:
raw_dict['16294343']

'The majority of prostate carcinoma survivors experience enduring sexual difficulties and associated distress in the years after definitive treatment. A counseling intervention aimed at improving levels of sexual satisfaction and increasing successful utilization of medical treatment for erectile dysfunction (ED) was developed and pilot-tested for both the survivor of prostate carcinoma and his partner. All male participants were 3-month to 5-year survivors of localized prostate carcinoma who had been treated with radical prostatectomy or radiation therapy, and were married or in a committed relationship. Couples were randomized to attend four sessions of counseling together or to have the man attend alone. In both groups, partners completed behavioral homework. The sessions included education on prostate carcinoma and sexual function and options to treat ED as well as sexual communication and stimulation skills. Standardized questionnaires at baseline, posttreatment, and at 3-month an

In [12]:
def cosine_similarity(sent1, sent2):
    first = nltk.word_tokenize(sent1)  # from "Hello world" to ["Hello", "world"]
    second = nltk.word_tokenize(sent2)
    # print(first)
    l1 = []
    l2 = []
    # { } is a set in python: unique values
    x = {w for w in first if w not in stopwords.words('english')}
    y = {w for w in second if w not in stopwords.words('english')}
    # print(x)

    rvector = x.union(y)  # put everything together

    l1 = [1 if w in x else 0 for w in rvector]  # [1,1,0,0,1]
    l2 = [1 if w in y else 0 for w in rvector]  # [0,1,1,0,1]

    c = 0
    for i in range(len(rvector)):
        c += l1[i] * l2[i]

    cosine = c / float((sum(l1) * sum(l2))**0.5)
    return cosine

In [13]:
def extract_sentences(raw_dict, ann_dict):
    data = []
    for fname, raw_text in raw_dict.items():
        sentences = nltk.sent_tokenize(raw_text.strip())
        arguments = [x[1] for x in ann_dict[fname]]
        annotated = [x[4] for x in ann_dict[fname]]
        for sent1 in sentences:
            match = ''
            similarity = 0
            annotated_argument = ''
            for arg, sent2 in zip(arguments, annotated):
                cs = cosine_similarity(sent1, sent2)
                if cs > similarity and cs > 0.8:
                    similarity = cs
                    match = sent2
                    annotated_argument = arg
                    break
            
            if not match:
                is_argumentative = 0
            else:
                is_argumentative = 1
                
            data.append((sent1, is_argumentative, annotated_argument))
    return data
            
                
    
    
    
    
    
    

In [14]:
train_data = extract_sentences(raw_dict, train_ann_dict)

In [15]:
test_data = extract_sentences(test_raw_dict, test_ann_dict)

In [16]:
print(len(train_data))
print(len(test_data))

4418
1258


In [18]:
len(list(filter(lambda x: x[1] == 0, train_data)))

2471

In [19]:
train_data[0]

('The majority of prostate carcinoma survivors experience enduring sexual difficulties and associated distress in the years after definitive treatment.',
 0,
 '')

In [20]:
train = pd.DataFrame(train_data, columns=['text', 'label', 'arg'])

In [21]:
train.head()

Unnamed: 0,text,label,arg
0,The majority of prostate carcinoma survivors e...,0,
1,A counseling intervention aimed at improving l...,0,
2,All male participants were 3-month to 5-year s...,0,
3,Couples were randomized to attend four session...,0,
4,"In both groups, partners completed behavioral ...",0,


In [22]:
train['arg'].value_counts()

              2471
Premise       1318
Claim          572
MajorClaim      57
Name: arg, dtype: int64

In [23]:
test = pd.DataFrame(test_data, columns=['text', 'label', 'arg'])

In [24]:
test.head()

Unnamed: 0,text,label,arg
0,There are few reports about the course of vest...,0,
1,"In this study, we present prospectively collec...",0,
2,The aim was to measure the effect of GKRS comp...,0,
3,Secondary end points were postinclusion additi...,0,
4,The patients underwent magnetic resonance imag...,0,


In [39]:
def preprocess_text(text):
    tok = RegexpTokenizer(r'\w+')
    tokens = tok.tokenize(text)
    lemmatiser = WordNetLemmatizer()
    lemmas = [lemmatiser.lemmatize(t.lower(), pos='v') for t in tokens]
    return [k for k in lemmas if k not in stopwords.words('english')]

In [27]:
x_train, y_train = train['text'], train['label']
x_test, y_test = test['text'], test['label']

In [32]:
y_train.value_counts()

0    2471
1    1947
Name: label, dtype: int64

In [33]:
y_test.value_counts()

0    670
1    588
Name: label, dtype: int64

In [40]:
vectoriser = TfidfVectorizer(analyzer=preprocess_text)

In [41]:
x_train_tfidf = vectoriser.fit_transform(x_train)

In [42]:
x_train_tfidf.shape

(4418, 5336)

In [44]:
x_train_tfidf[0]

<1x5336 sparse matrix of type '<class 'numpy.float64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [45]:
sgd_clf = SGDClassifier(random_state=42)

In [46]:
sgd_clf_scores = cross_val_score(sgd_clf, x_train_tfidf, y_train, cv=5)

In [47]:
sgd_clf_scores

array([0.80656109, 0.7918552 , 0.8020362 , 0.82672707, 0.80747452])

In [48]:
print(f'{sgd_clf_scores.mean()} +/- {sgd_clf_scores.std() * 2}')

0.8069308148383494 +/- 0.022690178009372564


In [49]:
sgd_clf_pred = cross_val_predict(sgd_clf, x_train_tfidf, y_train, cv=5)

In [50]:
confusion_matrix(y_train, sgd_clf_pred)

array([[2077,  394],
       [ 459, 1488]])

In [52]:
grid = {'fit_intercept': [True, False],
        'early_stopping': [True, False],
        'loss': ['hinge', 'log', 'squared_hinge'],
        'penalty': ['l2', 'l1', 'none']
}

search = GridSearchCV(estimator=sgd_clf, param_grid=grid, cv=5)
search.fit(x_train_tfidf, y_train)



GridSearchCV(cv=5, estimator=SGDClassifier(random_state=42),
             param_grid={'early_stopping': [True, False],
                         'fit_intercept': [True, False],
                         'loss': ['hinge', 'log', 'squared_hinge'],
                         'penalty': ['l2', 'l1', 'none']})

In [53]:
search.best_params_

{'early_stopping': True,
 'fit_intercept': False,
 'loss': 'log',
 'penalty': 'l2'}

In [54]:
grid_sgd_clf_scores = cross_val_score(search.best_estimator_, x_train_tfidf, y_train, cv=5)

In [55]:
grid_sgd_clf_scores.mean()

0.8239052899668449