In [1]:
import pandas as pd
import numpy as np
import ast
import spacy



In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
data_path = '../data/'
input_data = f'{data_path}/df_lab__extended_features.csv'


In [4]:
df_lab = pd.read_csv(input_data)

In [5]:
df_lab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 36 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Unnamed: 0                       6040 non-null   int64 
 1   Unnamed: 0.1                     6040 non-null   int64 
 2   Unnamed: 0.1.1                   6040 non-null   int64 
 3   Unnamed: 0.1.1.1                 6040 non-null   int64 
 4   drugName                         6040 non-null   object
 5   condition                        6040 non-null   object
 6   date                             6040 non-null   object
 7   use_case                         6040 non-null   object
 8   sentences                        6040 non-null   object
 9   sentences_length                 6040 non-null   int64 
 10  n_sentences                      6040 non-null   int64 
 11  sents                            6040 non-null   object
 12  doc_id                           6

In [6]:
sample = df_lab.sample(n=1)

In [7]:

sample

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,drugName,condition,date,use_case,sentences,sentences_length,...,feat_synt_and_indic,feat_freq_unigrams,feat_freq_dep_tupl,merged_features,metamap_feats,w2v_400,merged_features_with_embeddings,pos_repr,feat_pos_rep,merged_features_extended
2129,2151,2151,2151,5187,Liraglutide,"Diabetes, Type 2","July 15, 2015","Liraglutide::Diabetes,-Type-2",The reason I'm succeeding is because I decided...,107,...,[0 0 2 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 4...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[-0.077275 -0.429988 0.079353 0.037125 -0.03...,"[0, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ...",DT NN PRP VBP VBG VBZ IN PRP VBD TO VB PRP$ NN...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 2, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, ..."


In [8]:
_ = sample.iloc[0]

In [9]:
def get_number_feats(row, column_name):
    _ = sample.iloc[0]
    n_feats = -1
    try:
        n_feats = len(ast.literal_eval(_[column_name]))
    except SyntaxError:
        n_feats = len(ast.literal_eval(_[column_name].replace(' ', ',').replace('\n', '')))
    return f'{column_name}::{n_feats}'
    

In [10]:
feature_columns = ['feat_synt', 
                   'feat_synt_and_indic', 
                   'feat_freq_unigrams', 
                   'feat_freq_dep_tupl', 
                   'merged_features', 
                   'metamap_feats',
                   #'w2v_400',
                   'merged_features_with_embeddings',
                   'feat_pos_rep',
                   'merged_features_extended']




In [11]:
for feat in feature_columns:
    print(get_number_feats(_, feat))

feat_synt::57
feat_synt_and_indic::181
feat_freq_unigrams::91
feat_freq_dep_tupl::2000
merged_features::2272
metamap_feats::127
merged_features_with_embeddings::2799
feat_pos_rep::5000
merged_features_extended::7799


In [12]:
181 - 57

124

In [13]:
# indicators


# spacy NER labels
# CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART

In [14]:
# metamap

tags_metamap = {elem.split('|')[0]:elem.split('|')[2].replace('\n', '') for elem in open('../../SemanticTypes_2018AB.txt', 'r').readlines()}
print(len(tags_metamap))
tags_metamap = pd.DataFrame(tags_metamap, index=['descript']).T
tags_metamap

127


Unnamed: 0,descript
aapp,"Amino Acid, Peptide, or Protein"
acab,Acquired Abnormality
acty,Activity
aggp,Age Group
amas,Amino Acid Sequence
...,...
tmco,Temporal Concept
topp,Therapeutic or Preventive Procedure
virs,Virus
vita,Vitamin


In [15]:
tags_metamap.to_excel('metamap_tags.xlsx')

In [16]:
def count_ents(text):
    """
    Counts 'quantities', 'time', 'person' references
    """
    doc = nlp(text)
    n_quantities = len([token for token in doc if token.ent_type_ in ["QUANTITY", "CARDINAL", "ORDINAL", "PERCENT"]])
    n_time = len([token for token in doc if token.ent_type_ in ["DATE", "TIME"]])
    n_person = len([token for token in doc if token.ent_type_ in ["PERSON"]])
    return [n_quantities, n_time, n_person]

In [17]:
df_lab['reference_count'] = df_lab['sents'].apply(count_ents)

In [18]:
isinstance(3, list)

False

In [19]:
def concat_feats(row, list_cols):
    output = []
    for col in list_cols:
        if isinstance(row[col], int) or isinstance(row[col], float):
            output += [row[col]]
        elif isinstance(row[col], list):
            output += row[col]
        elif isinstance(row[col], str):
            output += list(ast.literal_eval(row[col]))
        else:
            output += list(row[col])
    return output
    

In [20]:
cols = ['merged_features_with_embeddings', 'sentences_length', 'n_sentences', 'reference_count']

df_lab['new_features'] = df_lab.apply(lambda row: concat_feats(row, cols), axis=1)

In [21]:
len(df_lab['new_features'].iloc[0])

2804

In [22]:
a = np.zeros(4)
np.append(a, a)

array([0., 0., 0., 0., 0., 0., 0., 0.])

In [84]:
def get_context_features__sum(df, row, window: int=2):
    doc_id = row['doc_id']
    n_sentences = row['n_sentences']
    df_ = df.query(f'doc_id == "{doc_id}"')
    pos_in_comment = list(df_['Unnamed: 0.1.1.1']).index(row['Unnamed: 0.1.1.1'])
    
    
    start = max(0, pos_in_comment - window)
    end = min(n_sentences - 2, pos_in_comment + window)
    
    #indeces = list(df_.index)
    
#    print(f"for pos: {pos_in_comment}::{list(range(start, end))}")
    try:
        df_ = df_.iloc[list(range(start, end))]
    except IndexError as e:
        print("error")
    
#    context = df_['new_features'].sum()
    added_context = [sum(x) for x in zip(*df_['new_features'])]
    
    return added_context
    

def get_context_features(df, row, window: int = 2):
    doc_id = row['doc_id']
    n_sentences = row['n_sentences']
    df_ = df.query(f'doc_id == "{doc_id}"')
    pos_in_comment = list(df_['Unnamed: 0.1.1.1']).index(row['Unnamed: 0.1.1.1'])
    n_features = len(df_['new_features'].iloc[0])
    


    before = pos_in_comment - window
    if before < 0:
        before = abs(before)
        previous = np.zeros(before * n_features)
        if window - before != 0:
            previous = np.append(previous, np.array(df.iloc[pos_in_comment - (window - before) :pos_in_comment]['new_features'].sum()))
    else:
        previous = np.array(df.iloc[pos_in_comment-window :pos_in_comment]['new_features'].sum())
    
    
    output = np.append(previous, row['new_features'])


    after = (len(df_) - 1) - (pos_in_comment + 2)
    if after < 0:
        after = abs(after)
        next = np.zeros(after * n_features)
        if window - after != 0:
            next = np.append(next, np.array(df.iloc[pos_in_comment:pos_in_comment + window - after]['new_features'].sum()))
    else:
        next = np.array(df.iloc[pos_in_comment :pos_in_comment + window]['new_features'].sum())
    
    

    output = np.append(output, next)

    try:
        assert len(output) == (2*window+1) * n_features
    except:
        print(f"Position of the sentence: {pos_in_comment} of {len(df_) - 1} => number of vectors added {len(output) / n_features}")
        print("prev ",len(previous))
        print("next ", len(next))
        print()

    return output
    
        



In [85]:
df_lab['new_features'].apply(len).unique()

array([2804])

In [83]:
df_lab.apply(lambda row: get_context_features(df_lab, row, 2), axis=1)

Position of the sentence: 0 of 7 => number of vectors added 5.000356633380885
prev  5609
next  5608

Position of the sentence: 7 of 7 => number of vectors added 5.000356633380885
prev  5608
next  5609

Position of the sentence: 0 of 4 => number of vectors added 5.000356633380885
prev  5609
next  5608

Position of the sentence: 4 of 4 => number of vectors added 5.000356633380885
prev  5608
next  5609

Position of the sentence: 0 of 9 => number of vectors added 5.000356633380885
prev  5609
next  5608

Position of the sentence: 9 of 9 => number of vectors added 5.000356633380885
prev  5608
next  5609

Position of the sentence: 0 of 12 => number of vectors added 5.000356633380885
prev  5609
next  5608

Position of the sentence: 12 of 12 => number of vectors added 5.000356633380885
prev  5608
next  5609

Position of the sentence: 0 of 8 => number of vectors added 5.000356633380885
prev  5609
next  5608

Position of the sentence: 8 of 8 => number of vectors added 5.000356633380885
prev  5608

KeyboardInterrupt: 

In [None]:
#row = df_lab.sample(n=1).iloc[0]
#si = get_context_features(df_lab, row)



In [202]:
df_copy = df_lab.copy()

In [217]:
df_lab['merged_context_feats'] = df_lab.apply(lambda row: get_context_features(df_copy, row), axis=1)

error


In [219]:
df_lab = df_lab.drop(columns=['merged_in_context_feats'])

In [222]:
# combining specific + context
df_lab['final_feats'] = df_lab['new_features'] + df_lab['merged_context_feats']

In [227]:
feat_matrix = df_lab[['final_feats']]['final_feats'].apply(pd.Series)

In [226]:
# apply the maximum absolute scaling in Pandas using the .abs() and .max() methods
def maximum_absolute_scaling(df):
    # copy the dataframe
    df_scaled = df.copy()
    # apply maximum absolute scaling
    for column in df_scaled.columns:
        df_scaled[column] = df_scaled[column]  / df_scaled[column].abs().max()
    return df_scaled


In [228]:
feat_matrix = maximum_absolute_scaling(feat_matrix)

In [231]:
feat_matrix = feat_matrix.fillna(0)

In [241]:
feat_matrix['agreed_labels'] = df_lab['agreed_labels']

In [265]:
si

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,Unnamed: 0.1.1.1,drugName,condition,date,use_case,sentences,sentences_length,...,merged_features,metamap_feats,w2v_400,merged_features_with_embeddings,pos_repr,feat_pos_rep,merged_features_extended,new_features,reference_count,merged_in_context_feats
5102,5148,5148,5148,2091,Oseltamivir,Influenza,"March 28, 2016",Oseltamivir::Influenza,"When my 3 year old was diagnosed with the flu,...",133,...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[ 0.053664 0.151317 -0.056288 0.007972 0.03...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","WRB PRP$ CD NN JJ VBD VBN IN DT NN , PRP$ NN V...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 3, 0]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5103,5149,5149,5149,2092,Oseltamivir,Influenza,"March 28, 2016",Oseltamivir::Influenza,"Within 48 hours, my 2 year old and 8 month old...",61,...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[-0.350981 -0.313750 0.183841 0.200549 -0.12...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, ...","IN CD NNS , PRP$ CD NN JJ CC CD NN JJ JJ NNS .","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, ...","[0, 8, 0]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5104,5150,5150,5150,2093,Oseltamivir,Influenza,"March 28, 2016",Oseltamivir::Influenza,I treated their flu with tamiflu within hours ...,63,...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[ 0.015818 0.047406 0.015052 0.172526 -0.06...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",PRP VBD PRP$ NN IN NN IN NNS IN NN NN .,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
5105,5151,5151,5151,2094,Oseltamivir,Influenza,"March 28, 2016",Oseltamivir::Influenza,"Less then 24 hours later, both boys have no fe...",71,...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[ 0.089381 -0.288136 -0.174792 -0.050514 -0.04...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, ...","JJR RB CD NNS RB , DT NNS VBP DT NN CC JJ NNS .","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 0, 0, 0, ...","[0, 3, 0]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0, ..."


In [None]:
# sentence_features + context_features(sum)
## justificación vector suma?


# concat + fill zeros

## 

# context

## different window sizes
## delante + detrás

In [243]:
feat_matrix.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5599,5600,5601,5602,5603,5604,5605,5606,5607,agreed_labels
0,0.0,0.0,0.1,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,...,0.216978,0.086742,0.200071,0.256325,0.215359,0.210526,0.0,0.222222,0.0,NON_RELATED
1,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.166667,...,-0.042054,-0.114757,0.14268,0.356702,0.375626,0.315789,0.0,0.555556,0.0,NON_RELATED
2,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.166667,...,0.064091,-0.090625,0.089804,0.397516,0.447412,0.421053,0.0,0.555556,0.0,NON_RELATED


In [259]:
feats = pd.DataFrame()
feats['feats'] = feat_matrix.drop(columns=['agreed_labels']).values.tolist()

In [266]:
df_lab['clean']

0       39 year old male, 1st colonoscopy.  I had alre...
1       39 year old male, 1st colonoscopy.  I had alre...
2       39 year old male, 1st colonoscopy.  I had alre...
3       39 year old male, 1st colonoscopy.  I had alre...
4       39 year old male, 1st colonoscopy.  I had alre...
                              ...                        
6035    Flu came on quickly on Saturday.  Docs office ...
6036    Flu came on quickly on Saturday.  Docs office ...
6037    I have had the flu twice now in the last 9 yea...
6038    I have had the flu twice now in the last 9 yea...
6039    I have had the flu twice now in the last 9 yea...
Name: clean, Length: 6040, dtype: object

In [279]:
feats['agreed_labels'] = df_lab['agreed_labels']
feats['sentence'] = df_lab['sentences']
feats['review'] = df_lab['clean']


In [280]:
feats.head(10)

Unnamed: 0,feats,agreed_labels,sentence,review
0,"[0.0, 0.0, 0.1, 0.0, 0.0, 0.05263157894736842,...",NON_RELATED,"39 year old male, 1st colonoscopy.","39 year old male, 1st colonoscopy. I had alre..."
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842,...",NON_RELATED,I had already read several of the reviews of t...,"39 year old male, 1st colonoscopy. I had alre..."
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842,...",NON_RELATED,Started 1st dose at 6pm night before procedur...,"39 year old male, 1st colonoscopy. I had alre..."
3,"[0.0, 0.0, 0.1, 0.0, 0.0, 0.05263157894736842,...",supporting,"No cramping, pain or discomfort whatsoever.","39 year old male, 1st colonoscopy. I had alre..."
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842,...",NON_RELATED,Took the second dose at midnight with similar ...,"39 year old male, 1st colonoscopy. I had alre..."
5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842,...",supporting,Finished getting ready of all the liquids an ...,"39 year old male, 1st colonoscopy. I had alre..."
6,"[0.0, 0.0, 0.1, 0.0, 0.0, 0.05263157894736842,...",supporting,"Aside from the obvious bad taste, the doses wo...","39 year old male, 1st colonoscopy. I had alre..."
7,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.05263157894736842,...",supporting,No side effects other than lost sleep prepping...,"39 year old male, 1st colonoscopy. I had alre..."
8,"[0.0, 0.0, 0.1, 0.0, 0.0, 0.05263157894736842,...",NON_RELATED,"Very effective, but.....\r\n1st dose at 6pm da...","Very effective, but.....\r\n1st dose at 6pm da..."
9,"[0.0, 0.0, 0.1, 0.0, 0.0, 0.05263157894736842,...",attacking,"Uncomfortable abdominal distension by 7pm, Nau...","Very effective, but.....\r\n1st dose at 6pm da..."


In [281]:
feats.to_csv('df_lab__feat_matrix_1.csv')