In [72]:
#Snorkel
import re
import pandas as pd
from snorkel.labeling import LabelingFunction
from snorkel.preprocess import preprocessor
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function

# Remove the column-width cap so DataFrame cells are displayed without truncation.
pd.options.display.max_colwidth = None

## Labelling Functions (LFs)

In [49]:
# Values a labeling function may return:
#   ABSTAIN (-1): the function casts no vote for this data point
#   BAD      (0): vote for negative sentiment
#   GOOD     (1): vote for positive sentiment
ABSTAIN, BAD, GOOD = -1, 0, 1

#### Positive Sentiment LFs

- Adjectives
- Verbs
- Regular expressions (dividends, positive market results)

In [50]:
# Load the positive-adjective lexicon once, when this cell runs, instead of
# re-opening and re-parsing the file for every row the applier processes.
# The encoding is pinned because the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the dictionary file is UTF-8 encoded — confirm.
with open('./dicts/pos_adjectives.txt', encoding='utf-8') as _lexicon_file:
    _POS_ADJECTIVES = frozenset(
        entry for entry in (line.strip() for line in _lexicon_file) if entry
    )


@labeling_function(resources=dict(adjectives=_POS_ADJECTIVES))
def lf_news_good_adjs(x, adjectives):
    """Vote GOOD when the title contains a word from the positive-adjective lexicon.

    Args:
        x: Data point exposing a string ``title`` attribute.
        adjectives: Lexicon entries, injected by Snorkel through ``resources``.

    Returns:
        GOOD if any whitespace-separated token of the lower-cased title is in
        ``adjectives``; ABSTAIN otherwise.
    """
    # Tokens are split on whitespace only, so a word with punctuation attached
    # (e.g. "alta,") is compared with the punctuation included.
    if any(word in adjectives for word in x.title.lower().split()):
        return GOOD
    return ABSTAIN

# Load the positive-verb lexicon once, when this cell runs, instead of
# re-opening and re-parsing the file for every row the applier processes.
# The encoding is pinned because the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the dictionary file is UTF-8 encoded — confirm.
with open('./dicts/pos_verbs.txt', encoding='utf-8') as _lexicon_file:
    _POS_VERBS = frozenset(
        entry for entry in (line.strip() for line in _lexicon_file) if entry
    )


@labeling_function(resources=dict(verbs=_POS_VERBS))
def lf_news_good_verbs(x, verbs):
    """Vote GOOD when the title contains a word from the positive-verb lexicon.

    Args:
        x: Data point exposing a string ``title`` attribute.
        verbs: Lexicon entries, injected by Snorkel through ``resources``.

    Returns:
        GOOD if any whitespace-separated token of the lower-cased title is in
        ``verbs``; ABSTAIN otherwise.
    """
    # Tokens are split on whitespace only, so a word with punctuation attached
    # is compared with the punctuation included.
    if any(word in verbs for word in x.title.lower().split()):
        return GOOD
    return ABSTAIN

@labeling_function()
def lf_regex_dividendos(x):
    """Vote GOOD when the title mentions paying, announcing or distributing dividends.

    The lower-cased title is searched for "pag", "anunc" or "distrib" followed
    later by "dividendo". Returns GOOD on a match and ABSTAIN otherwise.
    """
    pattern = r".*pag.*dividendo.*|.*anunc.*dividendo.*|.*distrib.*dividendo.*"
    title = x.title.lower()
    if re.search(pattern, title, flags=re.I):
        return GOOD
    return ABSTAIN

@labeling_function()
def lf_regex_resultado_positivo(x):
    """Vote GOOD when the title matches a "positive result" pattern.

    The lower-cased title is searched for one of the stems "fech", "abr",
    "estre" or "prev" followed later by "alta", or "fech"/"abr" followed later
    by "pos". Returns GOOD on a match and ABSTAIN otherwise.
    """
    # NOTE(review): the stems are unanchored substrings, so unrelated words that
    # contain them also match — confirm this breadth is intended.
    pattern = r"fech.*alta.*|.*abr.*alta.*|.*fech.*pos.*|.*abr.*pos.*|.*estre.*alta.*|.*prev.*alta.*"
    title = x.title.lower()
    if re.search(pattern, title, flags=re.I):
        return GOOD
    return ABSTAIN

#### Negative Sentiment LFs

In [51]:
# Load the negative-adjective lexicon once, when this cell runs, instead of
# re-opening and re-parsing the file for every row the applier processes.
# The encoding is pinned because the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the dictionary file is UTF-8 encoded — confirm.
with open('./dicts/neg_adjectives.txt', encoding='utf-8') as _lexicon_file:
    _NEG_ADJECTIVES = frozenset(
        entry for entry in (line.strip() for line in _lexicon_file) if entry
    )


@labeling_function(resources=dict(adjectives=_NEG_ADJECTIVES))
def lf_news_bad_adjs(x, adjectives):
    """Vote BAD when the title contains a word from the negative-adjective lexicon.

    Args:
        x: Data point exposing a string ``title`` attribute.
        adjectives: Lexicon entries, injected by Snorkel through ``resources``.

    Returns:
        BAD if any whitespace-separated token of the lower-cased title is in
        ``adjectives``; ABSTAIN otherwise.
    """
    # Look for lexicon adjectives among the title's tokens. Tokens are split on
    # whitespace only, so attached punctuation is part of the compared token.
    if any(word in adjectives for word in x.title.lower().split()):
        return BAD
    return ABSTAIN

# Load the negative-verb lexicon once, when this cell runs, instead of
# re-opening and re-parsing the file for every row the applier processes.
# The encoding is pinned because the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the dictionary file is UTF-8 encoded — confirm.
with open('./dicts/neg_verbs.txt', encoding='utf-8') as _lexicon_file:
    _NEG_VERBS = frozenset(
        entry for entry in (line.strip() for line in _lexicon_file) if entry
    )


@labeling_function(resources=dict(verbs=_NEG_VERBS))
def lf_news_bad_verbs(x, verbs):
    """Vote BAD when the title contains a word from the negative-verb lexicon.

    Args:
        x: Data point exposing a string ``title`` attribute.
        verbs: Lexicon entries, injected by Snorkel through ``resources``.

    Returns:
        BAD if any whitespace-separated token of the lower-cased title is in
        ``verbs``; ABSTAIN otherwise.
    """
    # Tokens are split on whitespace only, so a word with punctuation attached
    # is compared with the punctuation included.
    if any(word in verbs for word in x.title.lower().split()):
        return BAD
    return ABSTAIN

@labeling_function()
def lf_regex_resultado_negativo(x):
    """Vote BAD when the title matches a "negative result" pattern.

    The lower-cased title is searched for "fech"/"abr" followed later by "queda"
    or "neg", or "prev" followed later by "baixa" or "queda". Returns BAD on a
    match and ABSTAIN otherwise.
    """
    # NOTE(review): the stems are unanchored substrings, so unrelated words that
    # contain them also match — confirm this breadth is intended.
    pattern = r"fech.*queda.*|.*abr.*queda.*|.*fech.*neg.*|.*abr.*neg.*|.*prev.*baixa.*|.*prev.*queda.*"
    title = x.title.lower()
    if re.search(pattern, title, flags=re.I):
        return BAD
    return ABSTAIN

In [90]:
# Three hand-written headlines used to smoke-test the labeling functions.
d = dict(
    title=[
        'ibovespa negativo pior queda',
        'petrobras anuncia data de divulgação de dividendos',
        'economia fecha em queda a passos curtos no cenário internacional',
    ]
)

# Single-column frame; the labeling functions read the `title` attribute of each row.
df = pd.DataFrame(d)

In [91]:
# Collect every labeling function so they are applied together.
lfs = [
    lf_news_good_adjs,
    lf_news_good_verbs,
    lf_regex_dividendos,
    lf_regex_resultado_positivo,
    lf_news_bad_adjs,
    lf_news_bad_verbs,
    lf_regex_resultado_negativo,
]

# Run the labeling functions over the dataframe to build the label matrix.
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df, progress_bar=False)

# The labeling functions vote for exactly two classes (BAD = 0, GOOD = 1).
# ABSTAIN (-1) is the absence of a vote, not a class, so the cardinality is 2.
# A cardinality of 3 adds a third class that no function can vote for and that
# dict_map below cannot translate.
label_model = LabelModel(cardinality=2, device='cpu', verbose=False)

# Fix the training seed so re-running the notebook reproduces the same labels.
LABEL_MODEL_SEED = 42
label_model.fit(L_snorkel, seed=LABEL_MODEL_SEED)

# Predict one label per row; stored as strings so they can be used as dict_map keys.
df['label'] = label_model.predict(L=L_snorkel).astype(str)

# Translate the numeric labels into class names; '-1' covers rows left unlabeled.
dict_map = {'-1':'NEUTRAL', '1': 'POSITIVE', '0': 'NEGATIVE'}
df['label_class'] = df['label'].map(dict_map)

100%|██████████| 100/100 [00:00<00:00, 1670.07epoch/s]


In [92]:
df.head()

Unnamed: 0,title,label,label_class
0,ibovespa negativo pior queda,0,NEGATIVE
1,petrobras anuncia data de divulgação de dividendos,1,POSITIVE
2,economia fecha em queda a passos curtos no cenário internacional,0,NEGATIVE


In [86]:
# NOTE(review): this cell is disabled (every line is commented out). The saved
# output beneath it reports 2 labelled messages, while the df.head() output shown
# before it lists 3 labelled rows, and its execution count (86) is lower than
# that of the preceding cells (90-92). The output therefore appears to come from
# an older run — re-run the cell or clear its output.
# #Filtering out unlabeled data points
# df = df.loc[df.label_class.isin(['POSITIVE', 'NEGATIVE']), :]

# print ('Quantidade Total de Mensagens Rotuladas: ', df.shape[0])

# # find the label counts 
# df['label_class'].value_counts() / df.shape[0] * 100

Quantidade Total de Mensagens Rotuladas:  2


NEGATIVE    50.0
POSITIVE    50.0
Name: label_class, dtype: float64