In [1]:
!pwd

/nas/home/pkhanna/CQplus/PatternsLookup


In [112]:
import pandas as pd 

import os

import IPython
import hydra
import omegaconf
import json
import re
from tqdm import tqdm

from Patterns import PatternUtils

from snorkel.labeling import labeling_function

from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis



import logging
logger = logging.getLogger(__name__)

In [113]:
pd.set_option('display.max_colwidth', None)

In [114]:
FACT_REGEX = r'([a-zA-Z0-9_\-\\\/\+\* \'’%]{10,})'

REPLACEMENT_REGEX = {
        'action': FACT_REGEX,
        'precondition': FACT_REGEX,
        'negative_precondition': FACT_REGEX,
        'precondition_action': FACT_REGEX,
        'any_word': r'[^ \[]{,10}',
        'ENB_CONJ': r'(?:so|hence|consequently|thus|therefore|'
                    r'as a result|thus|accordingly|because of that|'
                    r'as a consequence|as a result)',
    }

# pattern = "{action} unless {precondition}"

NEGATIVE_WORDS = [
    ' not ',
    ' cannot ',
    'n\'t ',
    ' don\\u2019t ',
    ' doesn\\u2019t ',
]



SINGLE_SENTENCE_DISABLING_PATTERNS1 = [
    r"^{action} unless {precondition}\.",
    r"\. {action} unless {precondition}\.",
    r"^{any_word} unless {precondition}, {action}\.",
    r"^{any_word} unless {precondition}, {action}\.",
]

SINGLE_SENTENCE_DISABLING_PATTERNS2 = [
    r"{negative_precondition} (?:so|hence|consequently) {action}\.",
]

ENABLING_PATTERNS = [
    "{action} only if {precondition}.",
    "{precondition} (?:so|hence|consequently) {action}.",
    "{precondition} makes {action} possible.",
]

DISABLING_WORDS = [
    "unless",
]





ABSTAIN = -1
DISABLING = 0
ENABLING = 1


In [115]:
def pattern_exists(pattern,line):
    pattern_keys = re.findall(r'\{([^\}]+)}', pattern)
    replacements = {k: REPLACEMENT_REGEX[k] for k in pattern_keys}    
    regex_pattern = pattern.format(**replacements)
    m_list = re.findall(regex_pattern, line)
    
    
    for m in m_list:
        match_full_sent = line
        for sent in line:
            if all([ps in sent for ps in m]):
                match_full_sent = sent
    
        match_dict = dict(zip(pattern_keys, m))
        if 'negative_precondition' in pattern_keys:
                    if not(any([nw in match_dict['negative_precondition'] for nw in PatternUtils.NEGATIVE_WORDS])):
                        return False
    if len(m_list)>0:
        return True
    return False


In [138]:
@labeling_function()
def disabling1(x):
    for pat in SINGLE_SENTENCE_DISABLING_PATTERNS1:
        if pattern_exists(pat,x.text):
            return DISABLING
    return ABSTAIN


@labeling_function()
def disabling2(x):
    for pat in SINGLE_SENTENCE_DISABLING_PATTERNS2:
        if pattern_exists(pat,x.text):
            return DISABLING
    return ABSTAIN
        
        
@labeling_function()
def enabling_onlyif(x):
    pat="{action} only if {precondition}."
    if pattern_exists(pat,x.text):
        return ENABLING
    else:
        return ABSTAIN
        
@labeling_function()
def enabling_so_hence_conseq(x):
    pat="{precondition} (?:so|hence|consequently) {action}."
    if pattern_exists(pat,x.text):
        return ENABLING
    else:
        return ABSTAIN
              
@labeling_function()
def enabling_makespossible(x):
    pat="{precondition} makes {action} possible."
    if pattern_exists(pat,x.text):
        return ENABLING
    else:
        return ABSTAIN

In [139]:
omcs_df = pd.read_csv("/nas/home/pkhanna/omcs/omcs-sentences-more.txt", sep="\t", error_bad_lines=False)
omcs_df['text'] = omcs_df['text'].astype(str)

b'Skipping line 477941: expected 7 fields, saw 8\n'
b'Skipping line 769970: expected 7 fields, saw 8\nSkipping line 769975: expected 7 fields, saw 8\n'
b'Skipping line 1953542: expected 7 fields, saw 8\n'
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [140]:
omcs_df2=omcs_df.copy()

In [141]:
lfs = [disabling1, disabling2, enabling_onlyif, enabling_so_hence_conseq, enabling_makespossible]

In [142]:
# @hydra.main(config_path="../Configs", config_name="snorkel_config")
# def main(config: omegaconf.dictconfig.DictConfig):


applier = PandasLFApplier(lfs)
L_omcs = applier.apply(omcs_df)

print(LFAnalysis(L_omcs, lfs).lf_summary())

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_omcs, n_epochs=400, log_freq=50, seed=123)
omcs_df["label"] = label_model.predict(L=L_omcs, tie_break_policy="abstain")

# omcs_df = omcs_df[omcs_df.label != ABSTAIN]

# print(config.output_name)
# omcs_df.to_csv(config.output_name)



  from pandas import Panel
100%|██████████| 2072251/2072251 [05:04<00:00, 6803.49it/s]


                          j Polarity  Coverage      Overlaps     Conflicts
disabling1                0      [0]  0.000023  9.651340e-07  9.651340e-07
disabling2                1      [0]  0.000012  1.158161e-05  1.158161e-05
enabling_onlyif           2      [1]  0.000013  0.000000e+00  0.000000e+00
enabling_so_hence_conseq  3      [1]  0.000716  1.254674e-05  1.254674e-05
enabling_makespossible    4      [1]  0.000002  0.000000e+00  0.000000e+00


In [143]:
count = omcs_df["label"].value_counts()
print("Label  Count")
print(count)

Label  Count
-1    2070692
 1       1514
 0         45
Name: label, dtype: int64


In [17]:
L_omcs[:, 1] == DISABLING

array([False, False, False, ..., False, False, False])

In [48]:
omcs_df.iloc[L_omcs[:, 1] == DISABLING]['text'].sample(10, random_state=1)

786212                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            If you want to hear music then you should not listen so much to overly loud music.
909231                                                                                                                                                                                                                                                                                                                             

In [None]:
# if __name__ == '__main__':
#     main()

In [19]:
L_omcs

array([[-1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1],
       ...,
       [-1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1]])

In [20]:
from snorkel.analysis import get_label_buckets

array([-1, -1, -1, ..., -1, -1, -1])

In [147]:
buckets = get_label_buckets(L_omcs[:, 1], L_omcs[:, 3])
print(len(buckets))


3


In [148]:
len(omcs_df.iloc[buckets[(DISABLING, ENABLING)]])

24

In [56]:
#Testing with old RELEVANT-NOT_RELEVANT SCHEME

In [122]:

@labeling_function()
def is_a_kind_of(x):
    return NOT_RELEVANT if "is a kind of" in x.text.lower() else ABSTAIN

@labeling_function()
def single_sent_disabling_pat1(x):
    for pat in SINGLE_SENTENCE_DISABLING_PATTERNS1:
        if pattern_exists(pat,x.text):
            return RELEVANT
        else:
            return ABSTAIN


@labeling_function()
def single_sent_disabling_pat2(x):
    for pat in SINGLE_SENTENCE_DISABLING_PATTERNS2:
        if pattern_exists(pat,x.text):
            return RELEVANT
        else:
            return ABSTAIN

In [123]:
lfs2 = [single_sent_disabling_pat1, single_sent_disabling_pat2,is_a_kind_of]

In [124]:
ABSTAIN = -1
NOT_RELEVANT = 0
RELEVANT = 1

In [144]:
applier = PandasLFApplier(lfs2)
L_omcs2 = applier.apply(omcs_df2)

print(LFAnalysis(L_omcs2, lfs2).lf_summary())

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_omcs2, n_epochs=400, log_freq=50, seed=123)
omcs_df2["label"] = label_model.predict(L=L_omcs2, tie_break_policy="abstain")

  from pandas import Panel
100%|██████████| 2072251/2072251 [01:53<00:00, 18203.06it/s]


                            j Polarity  Coverage  Overlaps  Conflicts
single_sent_disabling_pat1  0      [1]  0.000022       0.0        0.0
single_sent_disabling_pat2  1      [1]  0.000012       0.0        0.0
is_a_kind_of                2      [0]  0.007130       0.0        0.0


In [126]:
count = omcs_df2["label"].value_counts()
print("Label  Count")
print(count)

Label  Count
-1    2057406
 0      14775
 1         70
Name: label, dtype: int64


In [129]:
tmp_disabling=[]
for i in tqdm(range(len(omcs_df))) :
    if omcs_df2.loc[i, "label"]== RELEVANT and  omcs_df.loc[i, "label"]!=DISABLING :
        tmp_disabling.append( omcs_df2.loc[i, "text"])
  

100%|██████████| 2072251/2072251 [00:28<00:00, 73252.70it/s]


In [130]:
print(len(tmp_disabling))

26


In [137]:
tmp_disabling

['Situation: What may not be so obvious is how early or how profoundly the quality of listening begins to shape character.',
 'The statement "fatty foods can make you over weight" is true because The body can\'t use all the fat you eat so it stores some of it and this causes weight gain.',
 'God is not capable of creating a stone so large He can not lift it.',
 "If you want to strike a match then you should take the match out of the match and then briskly rub it against the match striker thingy at the bottom of the matchbook.  I don't smoke so I'm not too good at this.",
 "People don't want to get injured implies that people would use a towel to dry their feet when coming out of a swimming pool so they don't slip and hurt themselves.",
 'The fact "A lot of UFO sightings take place in Nevada." is illustrated with the story:1. There was a light overhead from some type of aircraft.2. I couldn\'t identify the aircraft so I labelled it an unidentified flying object.3. Slang for an unidentif

In [109]:
old_relevant=omcs_df2[omcs_df2['label']==RELEVANT]['text']
len(old_relevant)

70

In [110]:
new_disabling=omcs_df[omcs_df['label']==DISABLING]['text']
len(new_disabling)

14775

In [111]:
omcs_df.equals(omcs_df2)

True

In [72]:
buckets[(DISABLING, ABSTAIN)]

array([ 288351,  292372,  315913,  315960,  322179,  346685,  356218,
        388020,  405038,  405156,  444508,  526092,  538873,  580547,
        593125,  608205,  611647,  613971,  615472,  615475,  617211,
        627355,  651230,  679126,  682420,  753745,  771643,  772080,
        785084,  786925,  794186,  826681,  848475,  850473,  852105,
        869354,  879880,  915718,  938607,  948261,  956306,  985002,
        986702,  990504,  995309, 1009710])

In [73]:
buckets = get_label_buckets(L_omcs[:, 1], L_omcs2[:, 2])
len(buckets[(ABSTAIN, RELEVANT)])    #single_pat_disabling1- DISABLING in new but ABSTAIN in old.

46

In [74]:
omcs_df.iloc[buckets[(DISABLING, ABSTAIN)]].sample(10, random_state=1)

Unnamed: 0,id,text,creator_id,created_on,language_id,activity_id,score,label
852105,927390,A contract is not legally valid unless it has been signed by all parties involved.,3834,2006-11-14 16:25:09.502985-05,en,27.0,1.0,1
986702,751504,you are not likely to find a jellyfish in the sea unless you look for a long time.,280,2006-11-14 15:26:25.982185-05,en,27.0,1.0,1
315913,1256935,Swimming pools have cold water in the winter unless they are heated.,8548,2006-11-14 18:02:40.213163-05,en,27.0,1.0,1
627355,1241371,Only having one shoe does you no good unless you only have one foot.,10824,2006-11-14 17:58:01.635563-05,en,27.0,1.0,1
315960,1256958,Reading about myths requires sight unless you can read Braille.,8548,2006-11-14 18:02:40.646305-05,en,27.0,1.0,1
785084,1011769,One should not promote friends into management positions unless they have appropriate management experience and are willing to openly disagree.,393,2006-11-14 16:49:57.266426-05,en,27.0,1.0,1
679126,1893930,A statement 'usually when you have a physical examination the doctor will tell you that there is nothing wrong with you' helps answer the question 'that is unless the doctor is a fraud and tells you you're ill so he can make money treating a fake illness.',6348,2009-04-14 22:20:33.101204-04,en,41.0,1.0,1
826681,963010,A man who is a sports fan cannot be truely happy unless he is amongst other men who share his enthusiasm.,4647,2006-11-14 16:35:37.477115-05,en,27.0,1.0,1
682420,1890449,people can do not normally kill themselves unless they cannot see any point in continuing to live. depression causes people to think there is no point to continuing to live,6348,2009-04-14 22:00:22.627317-04,en,41.0,1.0,1
651230,1072766,You probably shouldn't attempt to fix any electrical product unless you know exactly what your doing.,4647,2006-11-14 17:07:55.206733-05,en,27.0,1.0,1


In [136]:
buckets = get_label_buckets(L_omcs[:, 1], L_omcs2[:, 2])
len(buckets[(ENABLING, RELEVANT)])    #single_pat_disabling2- DISABLING in new but ABSTAIN in old.

KeyError: (1, 1)

Label  Count
-1    2057406
 0      14775
 1         70
Name: label, dtype: int64


In [79]:
count = omcs_df["label"].value_counts()
print("Label  Count")
print(count)

Label  Count
-1    2057406
 0      14775
 1         70
Name: label, dtype: int64
