# Remarks
* The names of the tests and development set are inverted, i.e., we named the development set as 'test_df'.

# Install packages and Imports

In [144]:
!pip install sklearn-crfsuite



You should consider upgrading via the 'c:\users\martin\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.
ERROR: Could not find a version that satisfies the requirement sklearn-grid_search (from versions: none)
ERROR: No matching distribution found for sklearn-grid_search
You should consider upgrading via the 'c:\users\martin\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [169]:
pip install --user -U scikit-learn==0.23.2

Collecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp38-cp38-win_amd64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Martin\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import spacy
import sklearn_crfsuite
import scipy.stats

from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Functions and Helpers

In [3]:
nlp = spacy.load("en_core_web_sm")
WINDOW = 9

def build_dataframe(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(line.strip().split('\t'))
    df = pd.DataFrame(data)

    df = df.rename(columns={0: 'file', 1: 'sentence', 2: 'position', 3: 'token', 4: 'negCue'})

    #drop empty rows
    df = df.drop(df[(df.file == '')].index)

    df.insert(len(df.columns)-1, 'lemma', None)
    df.insert(len(df.columns)-1, 'postag', None)

    for i in range(1,(WINDOW+1)):
        df.insert(len(df.columns)-1, f'lemma_a{i}', None)  #lemma of token 1-position AFTER  token
        df.insert(len(df.columns)-1, f'lemma_b{i}', None)  #lemma of token 1-position BEFORE  token

        df.insert(len(df.columns)-1, f'pos_a{i}', None)    #postag of token 1-position AFTER  token
        df.insert(len(df.columns)-1, f'pos_b{i}', None)    #postag of token 1-position BEFORE  token

    df.insert(len(df.columns)-1, 'known_cue', None)        #if token was seen as a cue during training

    df = df.astype({'sentence': 'int32'})
    df = df.astype({'position': 'int32'})

    #Encode golden label
    negCue_dict = {"negCue":     {"O": '0', "B-NEG": '1', "I-NEG": '2'}}
    df = df.replace(negCue_dict)    

    return df

def feature_extraction(dataframe,known_cues=[]):

    column_names = dataframe.columns
    new_df = pd.DataFrame(columns=column_names)

    for _file in dataframe.file.unique():
        file_df = dataframe[dataframe['file']==_file]

        file_df = file_df.groupby(['sentence']).apply(applySentenceGroupBy)

        new_df = new_df.append(file_df)

    dataframe = new_df

    dataframe = set_known_cue_feature(dataframe,known_cues)

    return dataframe

def set_known_cue_feature(df,cues):
    new_df = pd.DataFrame(columns=df.columns)
    
    for i,row in df.iterrows():
        row['known_cue'] = True if row['token'] in cues else False
        new_df = new_df.append(row,ignore_index=True)

    return new_df

def set_baseline_pred_feature(df,baseline_cues):
    new_df = pd.DataFrame(columns=df.columns)
    
    for i,row in df.iterrows():
        if row['token'] in baseline_cues['b_negs']:
            row['baseline_pred'] = 1 
        elif row['token'] in baseline_cues['i_negs']:
            row['baseline_pred'] = 2
        else:
            row['baseline_pred'] = 0

        new_df = new_df.append(row,ignore_index=True)

    return new_df
            
def applySentenceGroupBy(sentence_df):
    
    tokens = []
    for i,row in sentence_df.iterrows():
        tokens.append(row['token'])
    
    #reconstruct sentence from original df
    string = ' '.join([token for token in tokens])
    
    #use spacy to tokenize and extract info from sentence
    spacy_tokenized = nlp(string)
    
    sentence_df = add_token_features(sentence_df,spacy_tokenized,offset=1)    
    
    return sentence_df    

def add_token_features(sentence_df,spacy_tokenized,offset=1):
    i=0
    # tokens = [token for token in spacy_tokenized]
    
    sentence_ = pd.DataFrame(columns=sentence_df.columns)
    for idx,row in sentence_df.iterrows():
        
        # Both tokenizarions match:
        if row['token'] == spacy_tokenized[i].text:

            row = set_lemma_and_pos(row,spacy_tokenized,i)
        else:
            
            if row['token'] == spacy_tokenized[i-offset].text:
                row = set_lemma_and_pos(row,spacy_tokenized,i-offset)
            elif row['token'] == spacy_tokenized[i+offset].text:
                row = set_lemma_and_pos(row,spacy_tokenized,i+offset)
            else:
                #try one position more
                
                if row['token'] == spacy_tokenized[i-offset+1].text:
                    row = set_lemma_and_pos(row,spacy_tokenized,i-offset+1)
                elif row['token'] == spacy_tokenized[i+offset+1].text:
                    row = set_lemma_and_pos(row,spacy_tokenized,i+offset+1)                

        sentence_ = sentence_.append(row,ignore_index=True)
        i += 1
    
    return sentence_

def set_lemma_and_pos(row,spacy_tokenized,idx):
    #Lemma
    row['lemma']          = spacy_tokenized[idx].lemma_
    row['postag']         = spacy_tokenized[idx].tag_

    for i in range (1,(WINDOW+1)):
        row[f'lemma_b{i}']       = spacy_tokenized[idx-i].lemma_ if (idx + (1-i)) > 0 else None 
        row[f'lemma_a{i}']       = spacy_tokenized[idx+i].lemma_ if (idx+i) < len(spacy_tokenized) else None

        row[f'pos_b{i}']       = spacy_tokenized[idx-i].tag_ if (idx + (1-i)) > 0 else None 
        row[f'pos_a{i}']       = spacy_tokenized[idx+i].tag_ if (idx+i) < len(spacy_tokenized) else None

    return row

def get_training_cues(train_df):
    # print(train_df.head())
    cues= train_df.loc[(train_df['negCue']=='1')| (train_df['negCue']==1)]['token'].tolist()

    cues = list(set(cues)) #remove duplicates

    return cues

def dataframe2features(dataframe,known_cues=True,window=9):
    tokens = []
    labels = []
    for sentence_id, group in dataframe.groupby(['file','sentence']): #TODO arreglar esto para cuando saque el BREAK de files
        token = []
        label = []
        for _, row in group.iterrows():
            token_dict = {'token': row['token'],
                            'file': row['file'],
                            'sentence': row['sentence'],
                            'position': row['position']
                        }
            if known_cues:
                token_dict['known_cue'] = row['known_cue']
            
            if row['lemma'] != None:
                token_dict['lemma'] = row['lemma']
            if row['postag'] != None:
                token_dict['postag'] = row['postag']
            
            for i in range(1,window+1):
                if str(row[f'lemma_a{i}']) != 'nan':
                    token_dict[f'lemma_a{i}'] = row[f'lemma_a{i}']
                if str(row[f'lemma_b{i}']) != 'nan':
                    token_dict[f'lemma_b{i}'] = row[f'lemma_b{i}']                    
                if str(row[f'pos_b{i}']) != 'nan':
                    token_dict[f'pos_b{i}'] = row[f'pos_b{i}']                                        
                if str(row[f'pos_a{i}']) != 'nan':
                    token_dict[f'pos_a{i}'] = row[f'pos_a{i}']                                                            

            # if row['negCue'] != None:
            #     token_dict['negCue'] = row['negCue']


            token.append(token_dict)
            label.append(str(row['negCue']))
        tokens.append(token)
        labels.append(label)

    return tokens,labels

def train_crf_and_predict(X_train,y_train,X_test,y_test,c1=0.1,c2=0.1,verbose=True):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=c1,
        c2=c2,
        max_iterations=100,
        all_possible_transitions=True
    )

    crf.fit(X_train, y_train)

    labels = list(crf.classes_)
    sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))

    y_pred = crf.predict(X_test)
    y_pred_ = []

    for sentence_preds in y_pred:
        y_pred_.extend(sentence_preds)

    y_test = y_test.astype({'negCue': str})

    print(f"len(y_test)={len(y_test)}")
    print(f"len(y_pred_)={len(y_pred_)}")
    report = classification_report(y_test, y_pred_, target_names=sorted_labels,digits=4)
    if verbose: print(report)


    return crf,report

def get_sentences(df,filtered):

    sentences = filtered.sentence.unique()
    # print(sentences)
    sentences_list = []

    for sentence_id in sentences:
        tokens = []
        sentence = ""

        sentence_df = df.loc[(df['sentence']== sentence_id)]
        
        for idx, row in sentence_df.iterrows():
            
            tokens.append(row['token'])
        
        sentence = " ".join(tokens)

        sentences_list.append(sentence)
    
    for sentence in sentences_list:
        print(f"\n-{sentence}")


def get_sentences2(df,filtered):
    sentences_list = []
    
    for idx, filt_row in filtered.iterrows():
        token = filt_row['token']
        
        file_id = filt_row['file']
        sentence_id = filt_row['sentence']
        sentence_tokens = []
        # sentence = ""

        sentence_df = df.loc[(df['sentence']== sentence_id) & (df['file']== file_id)]
        
        for idx, row in sentence_df.iterrows():
            
            sentence_tokens.append(row['token'])
        
        sentence = " ".join(sentence_tokens)

        print(f"[token: {token}]")
        print(f"\t->{sentence}\n")

        # sentences_list.append(sentence)
        
    
    # for sentence in sentences_list:
    #     print(f"\n-{sentence}")        
    


# Pre-Processing and Feature Engineering (TRAIN AND TEST DFs)

## Preprocess & save

In [3]:
train_df = build_dataframe("corpus/SEM-2012-SharedTask-CD-SCO-training-simple.v2.txt")
test_df = build_dataframe("corpus/SEM-2012-SharedTask-CD-SCO-dev-simple.v2.txt")

known_cues = get_training_cues(train_df)

train_df = feature_extraction(train_df,known_cues)
test_df = feature_extraction(test_df,known_cues)

train_df.to_csv('corpus/train_df_preproc_w9.csv', sep='\t',index=False)
test_df.to_csv('corpus/test_df_preproc_w9.csv', sep='\t',index=False)

train_df = pd.read_csv('corpus/train_df_preproc_w9.csv', sep='\t')
test_df = pd.read_csv('corpus/test_df_preproc_w9.csv', sep='\t')

train_df

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
0,baskervilles01,0,0,Chapter,chapter,NN,1,,CD,,...,,,,,,,,,False,0
1,baskervilles01,0,1,1.,,,,,,,...,,,,,,,,,False,0
2,baskervilles01,0,2,Mr.,Mr.,NNP,Sherlock,.,NNP,.,...,,,,,,,,,False,0
3,baskervilles01,0,3,Sherlock,Sherlock,NNP,Holmes,Mr.,NNP,NNP,...,,,,,,,,,False,0
4,baskervilles01,0,4,Holmes,Holmes,NNP,,Sherlock,,NNP,...,,,,,,,,,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65446,baskervilles14,270,58,slopes,slope,NNS,of,russet,IN,NN,...,,stretch,,VBD,,which,,WDT,False,0
65447,baskervilles14,270,59,of,of,IN,the,slope,DT,NNS,...,,away,,RB,,stretch,,VBD,False,0
65448,baskervilles14,270,60,the,the,DT,moor,of,NN,IN,...,,until,,IN,,away,,RB,False,0
65449,baskervilles14,270,61,moor,moor,NN,.,the,.,DT,...,,it,,PRP,,until,,IN,False,0


## Building Baseline

In [53]:
unique_tokens = sorted(test_df['token'].unique())
baseline_dict = {}

for token in unique_tokens:
    filtered = test_df.loc[(test_df['token']== token)]
    most_freq_label = filtered['negCue'].value_counts()[:1].index.tolist()[0]
    baseline_dict[token] = most_freq_label

baseline_cues = {}
b_negs = []
i_negs = []
for key,value in baseline_dict.items():
    if value == 1:
        b_negs.append(key)
    elif value == 2:
        i_negs.append(key)
    
baseline_cues['b_negs'] = b_negs
baseline_cues['i_negs'] = i_negs

test_df = set_baseline_pred_feature(test_df,baseline_cues)

test_df = test_df.astype({'baseline_pred': int})
test_df.to_csv('corpus/test_df_preproc_w9.csv', sep='\t',index=False)


# Read preprocessed DFs

In [9]:
train_df = pd.read_csv('corpus/train_df_preproc_w9.csv', sep='\t')
test_df = pd.read_csv('corpus/test_df_preproc_w9.csv', sep='\t')

test_df

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,baseline_pred
0,wisteria01,0,0,1.,,,,,,,...,,,,,,,,False,0,0
1,wisteria01,0,1,The,the,DT,Singular,.,NNP,.,...,,,,,,,,False,0,0
2,wisteria01,0,2,Singular,Singular,NNP,Experience,the,NNP,DT,...,,,,,,,,False,0,0
3,wisteria01,0,3,Experience,Experience,NNP,of,Singular,IN,NNP,...,,,,,,,,False,0,0
4,wisteria01,0,4,of,of,IN,Mr.,Experience,NNP,NNP,...,,,,,,,,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13562,wisteria02,439,9,orthodox,orthodox,JJ,in,very,IN,RB,...,so,,RB,,`,,``,False,0,0
13563,wisteria02,439,10,in,in,IN,his,orthodox,PRP$,JJ,...,you,,PRP,,so,,RB,False,0,0
13564,wisteria02,439,11,his,his,PRP$,ritual,in,NN,IN,...,see,,VBP,,you,,PRP,False,0,0
13565,wisteria02,439,12,ritual,ritual,NN,.,his,.,PRP$,...,our,,PRP$,,see,,VBP,False,0,0


# Training and Evaluation

In [9]:
print("#######################################################################################")
print("BASELINE PREDICTION")
    
# y_test = y_test.astype({'negCue': str})

print(classification_report(test_df['negCue'], test_df['baseline_pred'], digits=4))

print("#######################################################################################")
print("WINDOW = 9, KNOWN_CUES = TRUE")

train_df_w9_kc = train_df 
test_df_w9_kc  = test_df 

X_train_w9_kc, y_train = dataframe2features(train_df_w9_kc,window=9)
X_test_w9_kc,  y_test  = dataframe2features(test_df_w9_kc,window=9)

crf_w9_kc, report_w9_kc = train_crf_and_predict(X_train_w9_kc,y_train,X_test_w9_kc,test_df_w9_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 9, KNOWN_CUES = FALSE")

train_df_w9 = train_df.drop(columns=['known_cue'])
test_df_w9  = test_df.drop(columns=['known_cue'])

X_train_w9, y_train = dataframe2features(train_df_w9,known_cues=False,window=9)
X_test_w9,  y_test  = dataframe2features(test_df_w9,known_cues=False,window=9)

crf_w9, report_w9 = train_crf_and_predict(X_train_w9,y_train,X_test_w9,test_df_w9['negCue'])


print("#######################################################################################")
print("WINDOW = 8, KNOWN_CUES = TRUE")

train_df_w8_kc = train_df 
test_df_w8_kc  = test_df 

X_train_w8_kc, y_train = dataframe2features(train_df_w8_kc,window=8)
X_test_w8_kc,  y_test  = dataframe2features(test_df_w8_kc,window=8)

crf_w8_kc, report_w8_kc = train_crf_and_predict(X_train_w8_kc,y_train,X_test_w8_kc,test_df_w8_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 8, KNOWN_CUES = FALSE")

train_df_w8 = train_df.drop(columns=['known_cue'])
test_df_w8  = test_df.drop(columns=['known_cue'])

X_train_w8, y_train = dataframe2features(train_df_w8,known_cues=False,window=8)
X_test_w8,  y_test  = dataframe2features(test_df_w8,known_cues=False,window=8)

crf_w8, report_w8 = train_crf_and_predict(X_train_w8,y_train,X_test_w8,test_df_w8['negCue'])

print("#######################################################################################")
print("WINDOW = 7, KNOWN_CUES = TRUE")

train_df_w7_kc = train_df 
test_df_w7_kc  = test_df 

X_train_w7_kc, y_train = dataframe2features(train_df_w7_kc,window=7)
X_test_w7_kc,  y_test  = dataframe2features(test_df_w7_kc,window=7)

crf_w7_kc, report_w7_kc = train_crf_and_predict(X_train_w7_kc,y_train,X_test_w7_kc,test_df_w7_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 7, KNOWN_CUES = FALSE")

train_df_w7 = train_df.drop(columns=['known_cue'])
test_df_w7  = test_df.drop(columns=['known_cue'])

X_train_w7, y_train = dataframe2features(train_df_w7,known_cues=False,window=7)
X_test_w7,  y_test  = dataframe2features(test_df_w7,known_cues=False,window=7)

crf_w7, report_w7 = train_crf_and_predict(X_train_w7,y_train,X_test_w7,test_df_w7['negCue'])

print("#######################################################################################")
print("WINDOW = 6, KNOWN_CUES = TRUE")

train_df_w6_kc = train_df
test_df_w6_kc  = test_df

X_train_w6_kc, y_train = dataframe2features(train_df_w6_kc,window=6)
X_test_w6_kc,  y_test  = dataframe2features(test_df_w6_kc,window=6)

crf_w6_kc, report_w6_kc = train_crf_and_predict(X_train_w6_kc,y_train,X_test_w6_kc,test_df_w6_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 6, KNOWN_CUES = FALSE")

train_df_w6 = train_df.drop(columns=['known_cue'])
test_df_w6  = test_df.drop(columns=['known_cue'])

X_train_w6, y_train = dataframe2features(train_df_w6,known_cues=False,window=6)
X_test_w6,  y_test  = dataframe2features(test_df_w6,known_cues=False,window=6)

crf_w6, report_w6 = train_crf_and_predict(X_train_w6,y_train,X_test_w6,test_df_w6['negCue'])

print("#######################################################################################")
print("WINDOW = 5, KNOWN_CUES = TRUE")

train_df_w5_kc = train_df
test_df_w5_kc  = test_df

X_train_w5_kc, y_train = dataframe2features(train_df_w5_kc,window=5)
X_test_w5_kc,  y_test  = dataframe2features(test_df_w5_kc,window=5)

crf_w5_kc, report_w5_kc = train_crf_and_predict(X_train_w5_kc,y_train,X_test_w5_kc,test_df_w5_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 5, KNOWN_CUES = FALSE")

train_df_w5 = train_df.drop(columns=['known_cue'])
test_df_w5  = test_df.drop(columns=['known_cue'])

X_train_w5, y_train = dataframe2features(train_df_w5,known_cues=False,window=5)
X_test_w5,  y_test  = dataframe2features(test_df_w5,known_cues=False,window=5)

crf_w5, report_w5 = train_crf_and_predict(X_train_w5,y_train,X_test_w5,test_df_w5['negCue'])

print("#######################################################################################")
print("WINDOW = 4, KNOWN_CUES = TRUE")
    
train_df_w4_kc = train_df
test_df_w4_kc  = test_df

X_train_w4_kc, y_train = dataframe2features(train_df_w4_kc,window=4)
X_test_w4_kc,  y_test  = dataframe2features(test_df_w4_kc,window=4)

crf_w4_kc, report_w4_kc = train_crf_and_predict(X_train_w4_kc,y_train,X_test_w4_kc,test_df_w4_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 4, KNOWN_CUES = FALSE")

train_df_w4 = train_df.drop(columns=['known_cue'])
test_df_w4  = test_df.drop(columns=['known_cue'])

X_train_w4, y_train = dataframe2features(train_df_w4,known_cues=False,window=4)
X_test_w4,  y_test  = dataframe2features(test_df_w4,known_cues=False,window=4)

crf_w4, report_w4 = train_crf_and_predict(X_train_w4,y_train,X_test_w4,test_df_w4['negCue'])

print("#######################################################################################")
print("WINDOW = 3, KNOWN_CUES = TRUE")

train_df_w3_kc = train_df
test_df_w3_kc  = test_df

X_train_w3_kc, y_train = dataframe2features(train_df_w3_kc,window=3)
X_test_w3_kc,  y_test  = dataframe2features(test_df_w3_kc,window=3)

crf_w3_kc, report_w3_kc = train_crf_and_predict(X_train_w3_kc,y_train,X_test_w3_kc,test_df_w3_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 3, KNOWN_CUES = FALSE")

train_df_w3 = train_df.drop(columns=['known_cue'])
test_df_w3  = test_df.drop(columns=['known_cue'])

X_train_w3, y_train = dataframe2features(train_df_w3,known_cues=False,window=3)
X_test_w3,  y_test  = dataframe2features(test_df_w3,known_cues=False,window=3)

crf_w3, report_w3 = train_crf_and_predict(X_train_w3,y_train,X_test_w3,test_df_w3['negCue'])

print("#######################################################################################")
print("WINDOW = 2, KNOWN_CUES = TRUE")

train_df_w2_kc = train_df
test_df_w2_kc  = test_df

X_train_w2_kc, y_train = dataframe2features(train_df_w2_kc,window=2)
X_test_w2_kc,  y_test  = dataframe2features(test_df_w2_kc,window=2)

crf_w2_kc, report_w2_kc = train_crf_and_predict(X_train_w2_kc,y_train,X_test_w2_kc,test_df_w2_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 2, KNOWN_CUES = FALSE")

train_df_w2 = train_df.drop(columns=['known_cue'])
test_df_w2  = test_df.drop(columns=['known_cue'])

X_train_w2, y_train = dataframe2features(train_df_w2,known_cues=False,window=2)
X_test_w2,  y_test  = dataframe2features(test_df_w2,known_cues=False,window=2)

crf_w2, report_w2 = train_crf_and_predict(X_train_w2,y_train,X_test_w2,test_df_w2['negCue'])

print("#######################################################################################")
print("WINDOW = 1, KNOWN_CUES = TRUE")

train_df_w1_kc = train_df
test_df_w1_kc  = test_df

X_train_w1_kc, y_train = dataframe2features(train_df_w1_kc,window=1)
X_test_w1_kc,  y_test  = dataframe2features(test_df_w1_kc,window=1)

crf_w1_kc, report_w1_kc = train_crf_and_predict(X_train_w1_kc,y_train,X_test_w1_kc,test_df_w1_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 1, KNOWN_CUES = FALSE")

train_df_w1 = train_df.drop(columns=['known_cue'])
test_df_w1  = test_df.drop(columns=['known_cue'])

X_train_w1, y_train = dataframe2features(train_df_w1,known_cues=False,window=1)
X_test_w1,  y_test  = dataframe2features(test_df_w1,known_cues=False,window=1)

crf_w1, report_w1 = train_crf_and_predict(X_train_w1,y_train,X_test_w1,test_df_w1['negCue'])

print("#######################################################################################")
print("WINDOW = 0, KNOWN_CUES = TRUE")

train_df_w0_kc = train_df
test_df_w0_kc  = test_df

X_train_w0_kc, y_train = dataframe2features(train_df_w0_kc,window=0)
X_test_w0_kc,  y_test  = dataframe2features(test_df_w0_kc,window=0)

crf_w0_kc, report_w0_kc = train_crf_and_predict(X_train_w0_kc,y_train,X_test_w0_kc,test_df_w0_kc['negCue'])

print("#######################################################################################")
print("WINDOW = 0, KNOWN_CUES = FALSE")

train_df_w0 = train_df.drop(columns=['known_cue'])
test_df_w0  = test_df.drop(columns=['known_cue'])

X_train_w0, y_train = dataframe2features(train_df_w0,known_cues=False,window=0)
X_test_w0,  y_test  = dataframe2features(test_df_w0,known_cues=False,window=0)

crf_w0, report_w0 = train_crf_and_predict(X_train_w0,y_train,X_test_w0,test_df_w0['negCue'])


#######################################################################################
BASELINE PREDICTION
              precision    recall  f1-score   support

           0     0.9999    0.9993    0.9996     13388
           1     0.9459    0.9943    0.9695       176
           2     1.0000    0.3333    0.5000         3

    accuracy                         0.9991     13567
   macro avg     0.9819    0.7757    0.8230     13567
weighted avg     0.9992    0.9991    0.9991     13567

#######################################################################################
WINDOW = 9, KNOWN_CUES = TRUE
              precision    recall  f1-score   support

           0     0.9982    0.9994    0.9988     13388
           1     0.9503    0.8693    0.9080       176
           2     1.0000    0.6667    0.8000         3

    accuracy                         0.9976     13567
   macro avg     0.9828    0.8451    0.9023     13567
weighted avg     0.9976    0.9976    0.9976     13567

############

# Hyper parameter optimization

## Finding the best parameters

In [8]:
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.5),
}

labels = [0,1,2]
# use the same metric for evaluation
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='macro', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=100,
                        scoring=f1_scorer)


X_train_w7_kc, y_train = dataframe2features(train_df,window=7)
rs.fit(X_train_w7_kc, y_train)


print('best params:', rs.best_params_)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.9min finished


best params: {'c1': 0.9889626742960147, 'c2': 0.11115969864846051}


### Comparing with the non-optimized training

In [10]:
X_train_w7_kc, y_train = dataframe2features(train_df,window=7)
X_test_w7_kc,  y_test  = dataframe2features(test_df,window=7)

print("#######################################################################################")
print("EVALUATION ON DEVELOPMENT HYPER PARAMS = TRUE")
crf_w7_kc, report_w7_kc = train_crf_and_predict(X_train_w7_kc,y_train,X_test_w7_kc,test_df['negCue'],c1=0.1073,c2=0.0144)
print("#######################################################################################")
print("EVALUATION ON DEVELOPMENT HYPER PARAMS = FALSE")

crf_w7_kc, report_w7_kc = train_crf_and_predict(X_train_w7_kc,y_train,X_test_w7_kc,test_df['negCue'])


#######################################################################################
EVALUATION ON DEVELOPMENT HYPER PARAMS = TRUE
              precision    recall  f1-score   support

           0     0.9983    0.9993    0.9988     13388
           1     0.9448    0.8750    0.9086       176
           2     1.0000    0.6667    0.8000         3

    accuracy                         0.9976     13567
   macro avg     0.9810    0.8470    0.9025     13567
weighted avg     0.9976    0.9976    0.9976     13567

#######################################################################################
EVALUATION ON DEVELOPMENT HYPER PARAMS = FALSE
              precision    recall  f1-score   support

           0     0.9982    0.9995    0.9988     13388
           1     0.9563    0.8693    0.9107       176
           2     1.0000    0.6667    0.8000         3

    accuracy                         0.9977     13567
   macro avg     0.9848    0.8452    0.9032     13567
weighted avg     0.9977 

# Error Analysis

## Pre-process eval datasets

In [None]:
# eval_df_1 = build_dataframe("corpus/SEM-2012-SharedTask-CD-SCO-test-cardboard.txt")
# eval_df_2 = build_dataframe("corpus/SEM-2012-SharedTask-CD-SCO-test-circle.txt")

# known_cues = get_training_cues(train_df)

# eval_df_1 = feature_extraction(eval_df_1,known_cues)
# eval_df_2 = feature_extraction(eval_df_2,known_cues)

# eval_df_1.to_csv('corpus/eval_df1_preproc_w9.csv', sep='\t',index=False)
# eval_df_2.to_csv('corpus/eval_df2_preproc_w9.csv', sep='\t',index=False)

## Read preprocessed eval datasets

In [5]:
eval_df_1 = pd.read_csv('corpus/eval_df1_preproc_w9.csv', sep='\t')
eval_df_2 = pd.read_csv('corpus/eval_df2_preproc_w9.csv', sep='\t')

eval_df_1

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
0,cardboard,0,0,In,in,IN,choose,,VBG,,...,the,,DT,,remarkable,,JJ,,False,0
1,cardboard,0,1,choosing,choose,VBG,a,in,DT,IN,...,remarkable,,JJ,,mental,,JJ,,False,0
2,cardboard,0,2,a,a,DT,few,choose,JJ,VBG,...,mental,,JJ,,quality,,NNS,,False,0
3,cardboard,0,3,few,few,JJ,typical,a,JJ,DT,...,quality,,NNS,,of,,IN,,False,0
4,cardboard,0,4,typical,typical,JJ,case,few,NNS,JJ,...,of,,IN,,my,,PRP$,,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10179,cardboard,495,16,answer,answer,NN,as,an,RB,DT,...,,which,,WDT,,to,,TO,False,0
10180,cardboard,495,17,as,as,RB,ever,answer,RB,NN,...,,human,,JJ,,which,,WDT,False,0
10181,cardboard,495,18,ever,ever,RB,.,as,.,RB,...,,reason,,NN,,human,,JJ,False,0
10182,cardboard,495,19,.,.,.,'',ever,'',RB,...,,be,,VBZ,,reason,,NN,False,0


## Train model WS-7 and make predictions on eval df

In [6]:
X_train_w7, y_train = dataframe2features(train_df,window=7)
X_dev1_w7,  y_dev1  = dataframe2features(eval_df_1,window=7)
X_dev2_w7,  y_dev2  = dataframe2features(eval_df_2,window=7)

model_eval1, _ = train_crf_and_predict(X_train_w7, y_train, X_dev1_w7, eval_df_1['negCue'],verbose=False)
model_eval2, _ = train_crf_and_predict(X_train_w7, y_train, X_dev2_w7, eval_df_2['negCue'],verbose=False)

y_pred1 = model_eval1.predict(X_dev1_w7)
y_pred1_ = []

for sentence_preds in y_pred1:
    y_pred1_.extend(sentence_preds)

eval_df_1['pred'] = y_pred1_
eval_df_1 = eval_df_1.astype({'negCue': str})

""" -------------------------------------------------"""

y_pred2 = model_eval2.predict(X_dev2_w7)
y_pred2_ = []

for sentence_preds in y_pred2:
    y_pred2_.extend(sentence_preds)

eval_df_2['pred'] = y_pred2_
eval_df_2 = eval_df_2.astype({'negCue': str})

len(y_test)=10184
len(y_pred_)=10184


  _warn_prf(average, modifier, msg_start, len(result))


len(y_test)=9032
len(y_pred_)=9032


## ANALYSIS

### FALSE NEGATIVES

In [126]:
""" ###############       EVAL 1        #####################  """
dv1_fn = eval_df_1.loc[ ( (eval_df_1['negCue'] == '1') & ((eval_df_1['pred']== '0') |(eval_df_1['pred']== '2')) ) | (eval_df_1['negCue'] == '2') & ((eval_df_1['pred']== '0') |(eval_df_1['pred']== '1') )]
get_sentences2(eval_df_1, dv1_fn)
dv1_fn


[token: unsolved]
	->He loved to lie in the very center of five millions of people , with his filaments stretching out and running through them , responsive to every little rumour or suspicion of unsolved crime .

[token: incredulity]
	->On my remarking that I was constantly in the habit of doing the same thing you expressed incredulity . ''

[token: far]
	->But I was still far from satisfied .

[token: from]
	->But I was still far from satisfied .

[token: injustice]
	->`` You do yourself an injustice .

[token: unframed]
	->Your eyes flashed across to the unframed portrait of Henry Ward Beecher which stands upon the top of your books .

[token: unacquainted]
	->The parcel was directed , then , by a man -- the printing is distinctly masculine -- of limited education and unacquainted with the town of Croydon .

[token: discoloured]
	->The other is a man 's , sun-burned , discoloured , and also pierced for an earring .

[token: undoubtedly]
	->This quarrel had put a stop to all communic

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
347,cardboard,12,32,unsolved,unsolved,JJ,crime,of,NN,IN,...,responsive,,JJ,,",",,",",False,1,0
589,cardboard,23,17,incredulity,incredulity,NN,.,express,.,VBD,...,habit,,NN,,the,,DT,False,1,0
669,cardboard,27,4,far,far,RB,from,still,IN,RB,...,,,,,,,,False,1,0
670,cardboard,27,5,from,from,IN,satisfied,far,JJ,RB,...,,,,,,,,False,2,0
751,cardboard,31,5,injustice,injustice,NN,.,an,.,DT,...,,,,,,,,False,1,0
904,cardboard,41,6,unframed,unframed,JJ,portrait,the,NN,DT,...,,IN,,the,,DT,,False,1,0
2765,cardboard,138,21,unacquainted,unacquainted,JJ,with,and,IN,CC,...,be,,VBZ,,printing,,NN,False,1,0
3317,cardboard,167,9,discoloured,discolour,VBN,",",",",",",",",...,a,.,DT,,be,,VBZ,False,1,0
6238,cardboard,322,29,undoubtedly,undoubtedly,RB,have,would,VB,MD,...,a,.,DT,,address,,VB,False,1,0
6480,cardboard,331,1,unsuccessful,unsuccessful,JJ,lover,an,NN,DT,...,,NNP,,",",,",",,False,1,0


In [129]:
eval_df_1.loc[(eval_df_1['sentence']== 27)]

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
665,cardboard,27,0,But,but,CC,I,,PRP,,...,,,,,,,,False,0,0
666,cardboard,27,1,I,I,PRP,be,but,VBD,CC,...,,,,,,,,False,0,0
667,cardboard,27,2,was,be,VBD,still,I,RB,PRP,...,,,,,,,,False,0,0
668,cardboard,27,3,still,still,RB,far,be,RB,VBD,...,,,,,,,,False,0,0
669,cardboard,27,4,far,far,RB,from,still,IN,RB,...,,,,,,,,False,1,0
670,cardboard,27,5,from,from,IN,satisfied,far,JJ,RB,...,,,,,,,,False,2,0
671,cardboard,27,6,satisfied,satisfied,JJ,.,from,.,IN,...,,,,,,,,False,0,0
672,cardboard,27,7,.,.,.,,satisfied,,JJ,...,,,,,,,,False,0,0


In [127]:
""" ###############       EVAL 2        #####################  """
dv2_fn = eval_df_2.loc[ ( (eval_df_2['negCue'] == '1') & ((eval_df_2['pred']== '0') |(eval_df_2['pred']== '2')) ) | (eval_df_2['negCue'] == '2') & ((eval_df_2['pred']== '0') |(eval_df_2['pred']== '1') )]
get_sentences2(eval_df_2, dv2_fn)
dv2_fn

[token: more]
	->` If not , I 'll have no more to do with you . '

[token: absolutely]
	->Do you say nothing has come out of that room -- absolutely nothing ? ''

[token: nothing]
	->Do you say nothing has come out of that room -- absolutely nothing ? ''

[token: more]
	->`` I 'll have no more of it !

[token: carpetless]
	->On the deal boards of the carpetless floor there was outlined a fresh track of blood .

[token: unconventional]
	->She spoke in rapid and fluent but very unconventional English , which , for the sake of clearness , I will make grammatical .

[token: dislike]
	->At first I thought that it was dislike .

[token: dislike]
	->And then , gradually , I understood that it was more than dislike .

[token: senseless]
	->He struck Gennaro senseless and fled from the house which he was never more to enter .

[token: more]
	->He struck Gennaro senseless and fled from the house which he was never more to enter .



Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
676,circle01,41,8,more,more,JJR,to,no,TO,DT,...,`,,``,,,,,False,2,0
1452,circle01,104,11,absolutely,absolutely,RB,nothing,--,NN,:,...,nothing,,NN,,say,,VB,False,1,0
1453,circle01,104,12,nothing,nothing,NN,?,absolutely,.,RB,...,have,,VBZ,,nothing,,NN,True,2,1
2634,circle01,196,5,more,more,JJR,of,no,IN,DT,...,,,,,,,,False,2,0
5939,circle02,65,6,carpetless,carpetless,JJ,floor,the,NN,DT,...,,IN,,blood,,NN,,False,1,0
7135,circle02,136,8,unconventional,unconventional,JJ,English,very,NNP,RB,...,she,IN,PRP,clearness,,NN,,False,1,0
7711,circle02,164,7,dislike,dislike,NN,.,be,.,VBD,...,,,,,,,,False,1,0
7725,circle02,165,12,dislike,dislike,NN,.,than,.,IN,...,",",,",",,gradually,,RB,False,1,0
8175,circle02,182,3,senseless,senseless,NN,and,Gennaro,CC,NNP,...,,VBD,,never,,RB,,False,1,0
8185,circle02,182,13,more,more,JJR,to,never,TO,RB,...,flee,,VBD,,and,,CC,False,2,0


### FALSE POSITIVES 

In [7]:
""" ###############       EVAL 1        #####################  """
dv1_fp = eval_df_1.loc[ ( (eval_df_1['negCue'] == '0') & ((eval_df_1['pred']== '1') |(eval_df_1['pred']== '2')) )]
get_sentences2(eval_df_1, dv1_fp)
dv1_fp


[token: unfortunately]
	->It is , however , unfortunately impossible entirely to separate the sensational from the criminal , and a chronicler is left in the dilemma that he must either sacrifice details which are essential to his statement and so give a false impression of the problem , or he must use matter which chance , and not choice , has provided him with .

[token: by]
	->Our blinds were half-drawn , and Holmes lay curled upon the sofa , reading and re-reading a letter which he had received by the morning post .

[token: nothing]
	->The box is a yellow , half-pound honeydew box , with nothing distinctive save two thumb marks at the left bottom corner .

[token: nothing]
	->The last six months that she was here she would speak of nothing but his drinking and his ways .

[token: without]
	->I 'm never without one or the other before me .

[token: never]
	->But she would have forgiven me ; she would have stuck as close to me as a rope to a block if that woman had never darkened ou

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
52,cardboard,1,5,unfortunately,unfortunately,RB,impossible,",",JJ,",",...,,DT,,criminal,,NN,,True,0,1
218,cardboard,6,22,by,,,,,,,...,,,,,,,,True,0,1
2789,cardboard,140,11,nothing,nothing,NN,distinctive,with,JJ,IN,...,",",JJ,",",bottom,yellow,JJ,JJ,True,0,1
4518,cardboard,233,12,nothing,nothing,NN,but,of,CC,IN,...,that,,IN,,month,,NNS,True,0,1
7476,cardboard,376,3,without,without,IN,one,never,CD,RB,...,,,,,,,,True,0,1
7613,cardboard,382,25,never,never,RB,darken,have,VBN,VBD,...,rope,,NN,,a,,DT,True,0,1
8356,cardboard,413,16,nothing,nothing,NN,.,about,.,IN,...,irritable,,JJ,,more,,RBR,True,0,1


# PLAYGROUND

In [17]:
eval_df_1.loc[eval_df_1['negCue']=='2']


Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
670,cardboard,27,5,from,from,IN,satisfied,far,JJ,RB,...,,,,,,,,False,2,0


In [35]:
eval_df_1.loc[eval_df_1['lemma']=='unfortunately']


Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue,pred
52,cardboard,1,5,unfortunately,unfortunately,RB,impossible,",",JJ,",",...,,DT,,criminal,,NN,,True,0,1


In [34]:
train_df.loc[train_df['lemma']=='unfortunately']


Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
16982,baskervilles05,76,3,unfortunately,unfortunately,RB,",",",",",",",",...,you,,PRP,,.,,.,,True,1


In [24]:
# train_df.loc[train_df['']==]

train_df.loc[(train_df['lemma']=='nothing') & (train_df['negCue']==1)]

Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
771,baskervilles01,38,6,nothing,nothing,NN,of,be,IN,VBZ,...,'',,'',,,,,,True,1
4817,baskervilles02,70,18,nothing,nothing,NN,thereof,say,RB,VBP,...,'',and,'',CC,,Rodger,,NNP,True,1
6433,baskervilles02,134,28,nothing,nothing,NN,would,",",MD,",",...,the,he,DT,PRP,moor,although,NN,IN,True,1
7242,baskervilles03,6,4,nothing,nothing,NN,?,say,.,VBD,...,,,,,,,,,True,1
7706,baskervilles03,51,3,nothing,nothing,NN,?,find,.,VBD,...,,,,,,,,,True,1
9018,baskervilles03,130,6,nothing,nothing,NN,to,say,IN,VB,...,make,,VBN,,up,,RP,,True,1
11006,baskervilles04,10,1,Nothing,nothing,NN,of,`,IN,``,...,,,,,,,,,True,1
11334,baskervilles04,32,6,nothing,nothing,NN,supernatural,be,JJ,VBZ,...,?,,.,,'',,'',,True,1
12130,baskervilles04,72,16,nothing,nothing,NN,else,from,RB,IN,...,,and,,CC,,",",,",",True,1
12864,baskervilles04,111,1,Nothing,nothing,NN,",",`,",",``,...,down,,RP,,.,,.,,True,1


In [31]:
nothing_bneg = train_df.loc[(train_df['lemma']=='nothing') & (train_df['negCue']==1)]
# get_sentences2(train_df, nothing_bneg)


In [32]:
nothing_O = train_df.loc[(train_df['lemma']=='nothing') & (train_df['negCue']==0)]
get_sentences2(train_df, nothing_O)
nothing_O


[token: nothing]
	->He would talk of nothing but art , of which he had the crudest ideas , from our leaving the gallery until we found ourselves at the Northumberland Hotel .

[token: nothing]
	->I am certainly developing the wisdom of the serpent , for when Mortimer pressed his questions to an inconvenient extent I asked him casually to what type Frankland 's skull belonged , and so heard nothing but craniology for the rest of our drive .

[token: nothing]
	->I have not lived for years with Sherlock Holmes for nothing .

[token: nothing]
	->I hope to heaven that he has gone , for he has brought nothing but trouble here !



Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
15798,baskervilles05,3,4,nothing,nothing,NN,but,of,IN,IN,...,the,,DT,,crude,,JJS,,True,0
44265,baskervilles10,190,35,nothing,nothing,NN,but,hear,IN,VBD,...,drive,Frankland,NN,NNP,.,type,.,NN,True,0
44285,baskervilles10,191,10,nothing,nothing,NN,.,for,.,IN,...,,not,,RB,,have,,VBP,True,0
44409,baskervilles10,198,13,nothing,nothing,NN,but,bring,IN,VBN,...,,he,,PRP,,that,,IN,True,0


In [15]:
train_df.loc[(train_df['negCue']== 2)]


Unnamed: 0,file,sentence,position,token,lemma,postag,lemma_a1,lemma_b1,pos_a1,pos_b1,...,lemma_a8,lemma_b8,pos_a8,pos_b8,lemma_a9,lemma_b9,pos_a9,pos_b9,known_cue,negCue
883,baskervilles01,47,13,no,no,DT,means,by,NN,IN,...,,my,,PRP$,,",",,",",True,2
884,baskervilles01,47,14,means,means,NN,all,no,DT,DT,...,,dear,,JJ,,my,,PRP$,False,2
4393,baskervilles02,59,6,no,no,DT,mean,by,NNS,IN,...,",",,",",,the,,DT,,True,2
4394,baskervilles02,59,7,means,mean,NNS,advance,no,NN,DT,...,the,,DT,,bold,,JJS,,False,2
9751,baskervilles03,182,2,the,the,DT,contrary,on,NN,IN,...,.,,.,,'',,'',,False,2
9752,baskervilles03,182,3,contrary,contrary,NN,",",the,",",DT,...,'',,'',,,,,,False,2
10576,baskervilles03,236,24,than,than,IN,in,rather,IN,RB,...,,wait,,VBG,,he,,PRP,False,2
10682,baskervilles03,243,1,the,the,DT,contrary,on,NN,IN,...,avoid,,VBD,,the,,DT,,False,2
10683,baskervilles03,243,2,contrary,contrary,NN,",",the,",",DT,...,the,,DT,,moor,,NN,,False,2
14288,baskervilles04,197,2,for,for,IN,the,not,DT,RB,...,,,,,,,,,False,2
