### Discarding the rows with class label = No (Not checkWorthy) and NONE = True (no NER present)

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('ner_features.csv')

In [3]:
# demo_train_df = train_df[0:100]
train_df.columns

Index(['Unnamed: 0', 'Sentence_id', 'Text', 'class_label', 'CARDINAL', 'DATE',
       'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP',
       'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME',
       'WORK_OF_ART', 'NONE'],
      dtype='object')

In [4]:
train_df = train_df.drop('Unnamed: 0', axis = 1)

In [5]:
train_df.sample()

Unnamed: 0,Sentence_id,Text,class_label,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,...,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,NONE
19877,28852,"No, it doesn't speak for itself, Mr. Vice Pres...",No,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [6]:
# !pip install -U bitsandbytes
# !pip install transformers
# !pip install torch
# !pip install "accelerate>=0.26.0"
# !pip install bitsandbytes-cuda120
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [7]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
import torch
from transformers import BitsAndBytesConfig

In [8]:
model_name = "tiiuae/falcon-7b-instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
          model_name,
          quantization_config=quantization_config,
          #load_in_4bit=True,
          device_map = "auto")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (query_key_value): Linear4bit(in_features=4544, out_features=4672, bias=False)
          (dense): Linear4bit(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): FalconRotaryEmbedding()
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4544, out_features=18176, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
    (rotary_emb): FalconRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bi

In [9]:
print(torch.cuda.is_available())

True


In [10]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Device set to use cuda:0


In [10]:
import spacy

In [11]:
nlp = spacy.load("en_core_web_sm")

In [27]:
doc = nlp("Now, let's balance the budget and protect Medicare, Medicaid, education and the environment.")

print(len(doc.ents))
for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

2
Medicare | ORG | Companies, agencies, institutions, etc.
Medicaid | ORG | Companies, agencies, institutions, etc.


In [16]:
from spacy import displacy

displacy.render(doc, style = "ent")



In [23]:
ner_labels = nlp.get_pipe("ner").labels
for s in ner_labels:
    print(s,"|", spacy.explain(s))

CARDINAL | Numerals that do not fall under another type
DATE | Absolute or relative dates or periods
EVENT | Named hurricanes, battles, wars, sports events, etc.
FAC | Buildings, airports, highways, bridges, etc.
GPE | Countries, cities, states
LANGUAGE | Any named language
LAW | Named documents made into laws.
LOC | Non-GPE locations, mountain ranges, bodies of water
MONEY | Monetary values, including unit
NORP | Nationalities or religious or political groups
ORDINAL | "first", "second", etc.
ORG | Companies, agencies, institutions, etc.
PERCENT | Percentage, including "%"
PERSON | People, including fictional
PRODUCT | Objects, vehicles, foods, etc. (not services)
QUANTITY | Measurements, as of weight or distance
TIME | Times smaller than a day
WORK_OF_ART | Titles of books, songs, etc.


In [28]:
def add_boolean_NER_features(row):
    features = ['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART', 'NONE']
    text = row['Text']
    doc = nlp(text)
    # print(doc.ents)
    # for ent in doc.ents:
    #     print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

    for f in features:
        row[f] = False
        
    if len(doc.ents) == 0:
        row['NONE'] = True
    else:
        for ent in doc.ents:
            if ent.label_ in features:
                row[ent.label_] = True
    return row
        
    

In [29]:
train_df = train_df.apply(add_boolean_NER_features, axis = 1)

In [30]:
train_df.head()

Unnamed: 0,Sentence_id,Text,class_label,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,...,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,NONE
0,30313,And so I know that this campaign has caused so...,No,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,19099,"Now, let's balance the budget and protect Medi...",No,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
2,33964,I'd like to mention one thing.,No,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,16871,I must remind him the Democrats have controlle...,Yes,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
4,13150,And to take a chance uh - now be - and not mak...,No,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [31]:
train_df[train_df['NONE'] == True]['class_label'].value_counts()


class_label
No     9896
Yes    1254
Name: count, dtype: int64

In [33]:
train_df.to_csv('ner_features.csv')

In [66]:
def ad_prompt_column(row):
    action_type = ['Physical_Action', 'Mental_Action', 'State_Change', 'Create_or_Destruction', 'Communication', 'Movement', 'Emotion_or_Feeling' , 'Perception', 'Linking(State_of_being)Verb', 'Other']
    noise_level = ['Relevant', 'Noisy']
    Intent = ['Statement_of_Fact', 'Opinion_or_Belief', 'Argument_or_Justification', 'Action_or_Instruction', 'No _Clear_Intent']
    
    sentiment = ['Positive', 'Negative', 'Other(Unclear/Noisy)']
    text = row['Text']
    prompt = f"Is the following text '{text}' Relevant or Noise. Tell me in one word"

    # prompt_for_finding_action_verb = f"Is the following text '{text}' contains any action verb, Yes or No. Tell me in one word"

    # prompt_for_catagory_of_sentence = f"Can you catagorize this following text '{text}' into one of this categories: Social, Financial, Governmental, Political, Commercial, Constitutional and Environmental"
    # f"\n1. Relevant"
    # f"\n2. Noisy"
    # f"\n Generate Only the choosen option: "
                
            #         Intent (Choose One):
            #         - Statement of Fact
            #         - Opinion or Belief
            #         - Argument or Justification
            #         - Action or Instruction
                
            #         Sentiment (Choose One):
            #         - Positive
            #         - Negative
            #         - Neutral
            #         - Other (Unclear/Noisy)
    
            #         Provide the response in a structured format like:
            #         Action Type: [Category]
            #         Noise Level: [Category]
            #         Intent: [Category]
            #         Sentiment: [Category]
            # """
    
    
    inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)
    #print(inputs)

    outputs = model.generate(**inputs, max_new_tokens = 100, do_sample = False)
    #print(outputs)
    
    generate_result = tokenizer.decode(outputs[0], skip_special_tokens = True)
    
    row['Noise_text'] = False
    row['Relevant_text'] = False
    
    if 'Noise' in generate_result.split():
        row['Noise_text'] = True
    elif 'Relevant' in generate_result.split():
        row['Relevant_text'] = True
    return row

    # start_indx = generate_result.find("Action Type: ")

    # print(generate_result[start_indx:])

In [None]:
# sentence = "That's what we need more of, Candy."
# res = ad_prompt_column(sentence)
# if 'Noise' in res.split():
#     print(True)

train_df = train_df.apply(ad_prompt_column, axis = 1)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_tok

In [17]:
# full_prompt = (f"Classify the verb 'verb' into one of the following types:"
#                    f"\n1. Physical Actions"
#                    f"\n2. Mental Actions"
#                    f"\n3. Changes in State "
#                    f"\n4. Creation or Destruction"
#                    f"\n5. Communication"
#                    f"\n6. Movement"
#                    f"\n7. Emotion or Feeling"
#                    f"\n8. Perception"
#                    f"\n9. Linking verb"
#                    f"\n10. Other"
#                    f"\n Generate only the verb type.")
# full_prompt += "Verb type: "
# print(full_prompt)

Classify the verb 'verb' into one of the following types:
1. Physical Actions
2. Mental Actions
3. Changes in State 
4. Creation or Destruction
5. Communication
6. Movement
7. Emotion or Feeling
8. Perception
9. Linking verb
10. Other
 Generate only the verb type.Verb type: 


In [68]:
train_df.sample()

Unnamed: 0,Sentence_id,Text,class_label,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,...,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART,NONE
22435,23909,But he has made -- not admitted a mistake and ...,Yes,False,False,False,False,True,False,False,...,True,False,False,False,False,False,False,False,False,False
