In [5]:
import pandas as pd
import numpy as np

## Preparing data in the format for ner tags

Original input data is in format :
* Each input article was given in seperated text file (article id in text file name), for train and dev sets
* With the same article id in label file , we get annotation of propaganda spans:
    * article_id propaganda start_span end_span

    * article id 1111 is present in article_1111.txt and label is present in article_1111_TC.txt 
         * article_1111.txt : Trump the white president say he likes black people working from him .
         * article_1111_TC.txt : 
             * 1111 <propaganda type> 11 88

    
From this input we convert we covert the articles to files for each article ids , but in positions of span text , markings of start span and end span.This is done by using the scripts given by the competition organizer. For example : 

 * the output after processing looks like 
    
    article_1111.txt : Trump the <span-7> white president say he likes black people working from him. <7-/span> 
    
    the id 7 comes from type of propaganda. each propaganda has been given a seperated id .
    
From the above output the following preprocessing turns this article id into seperate sentences each having ner like labels
    
Example 
    
      file no : article_1111.txt
      
      Trump O
      the O
      white I 
      president I
      say I
      he I
      likes I
      black I
      people I
      working I
      from I
      him I
      . I


-----------------

Start loading all articles from independent text files into list of tup of (article_id, content)
content looks like this 

" Trump the white president say he likes black people working from him. <7-/span>"

In [6]:
import pandas as pd
import glob
import swifter
path = r'/data/semeval-2020/task-11/datasets/train-tagged_article/'
all_files = glob.glob(path + "/*.txt")

content_tuple = []

for filename in all_files:
    file = open(filename)
    content = "".join(file.readlines()).replace("\n"," ")
    article_id = filename.split("article")[-1].split(".txt")[0]
    content_tuple.append((article_id,content))

In [7]:
content_df = pd.DataFrame(content_tuple,columns=["article_id","content"])

In [8]:
content_df.head(3)

Unnamed: 0,article_id,content
0,762956953,Iran Admits To Aiding Al-Qaeda and Facilitatin...
1,787529309,The Last-Minute <span-11 Character Assassinati...
2,999001296,Altered Election Documents Tied To Florida Dem...


Now we write methods to help us convert the above text into ner style tagging .

get_propaganda_sequence :
    Takes in a each row of content_df and outputs for each words in the sentence into :
    
        * (word1,propaganda_id) if the word is in a span of propaganda 
        or 
        * (word1,0) if the word is not in a span of propaganda 
        
    There is no preprocessing done on each word / text , only the span marking such as <span-pid and pid-/span> are removed
    Only extra thing we do is we mark end of every sentence by appending the last word of the sentence with $$$$$ , while writing out as a file ,       we will remove these marking 

In [9]:
import re

from spacy.tokenizer import Tokenizer
from spacy.lang.en import English   # updated , 
from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated
tokenizer = Tokenizer(nlp.vocab)

def get_technique_id(token):
    results = {"start_token":[],"end_token":[]}

    if "<span" in token:
        results["start_token"] = [i.replace("<span-","") for i in re.findall("<span-\d+",token)]

    if "-/span>" in token:
        results["end_token"] = [i.replace("-/span>","") for i in re.findall("\d+-/span>",token)]

    # return [res for res_array in results_start_span for res in res_array]
    return results

def get_propaganda_sequence(row):
#     print(content)
    content_lable_tup = []
    content = row.content.replace("><", "> <").replace(">“<", "> <") \
        .replace(">(<", "> <").replace(">.<", "> <") \
        .replace(">\'<", "> <")
    label_seq = []
    label_propaganda_seq = []
    running_token = []
    for sent in nlp(content).sents:
        len_of_sentences = len(str(sent).split(" "))
        tokens = sent.text.split(" ")
        for ix,token in enumerate(tokens):
            token = token.strip()
            modified_token = token
            if "span" in token:
                if "-/span>" in token:
                    for i in get_technique_id(token)["end_token"]:
                        running_token.remove(i)
                    running_token = sorted(running_token)

                    rest_of_token = re.sub("\d+-/span>", "", token.strip())
                    if len(rest_of_token) > 1:
                        split_toks = [i.text for i in nlp(rest_of_token)]
                        for tok in split_toks:
                            if len(running_token) == 0:
                                label_propaganda_seq.append("O")
                                content_lable_tup.append((tok,"O"))
                            else:
                                label_propaganda_seq.append(",".join(running_token))
                                content_lable_tup.append((tok, ",".join(running_token)))

                if "<span" in token:
                    running_token = running_token + get_technique_id(token)["start_token"]
                    running_token = sorted(running_token)

                    rest_of_token = re.sub("<span-\d+", "", token.strip())
                    if len(rest_of_token) > 1:
                        split_toks = [i.text for i in nlp(rest_of_token)]
                        for tok in split_toks:
                            if len(running_token) == 0:
                                label_propaganda_seq.append("O")
                                content_lable_tup.append((tok, "O"))
                            else:
                                label_propaganda_seq.append(",".join(running_token))
                                content_lable_tup.append((tok, ",".join(running_token)))

            else:
                if len(token) > 0:
                    split_toks = [i.text for i in nlp(token)]
                    for tok in split_toks:
                        if len(running_token) == 0:
                            label_propaganda_seq.append("O")
                            content_lable_tup.append((tok, "O"))
                        else:
                            label_propaganda_seq.append(",".join(running_token))
                            content_lable_tup.append((tok, ",".join(running_token)))
            if ix == len_of_sentences -1:
                content_lable_tup[-1] = (content_lable_tup[-1][0] + "$$$$$$",content_lable_tup[-1][1])
                    
    new_content = [(re.sub("<span-\d+", "", re.sub("\d+-/span>", "", tup[0])) , tup[1]) for tup in content_lable_tup]

    seq_to_return = [i for i in new_content if i[0] != ""]

    return seq_to_return

Sample example of what this function does

In [10]:
content_df[content_df["article_id"] == "736231219"].iloc[0].content

'Farrakhan Speech:<span-11  \'Jews Are My Enemy,\'  11-/span>\'<span-9 White Folks 9-/span> Are Going Down\'  With the <span-9 leftist media 9-/span> entirely focused on the push to ban AR-15s and repeal the Second Amendment, practically no one noticed Louis Farrakhan\'s Saviours’ Day 2018 Address in which he told an approving audience that "<span-11 powerful Jews are my enemy," 11-/span> and "<span-9 white folks 9-/span> are going down," according to The Washington Examiner. Farrakhan, of course, is the <span-9 raging anti-Semite and race-monger 9-/span> who leads the Nation of Islam, the <span-9 loony, militant, black nationalist organization 9-/span> whose mission is to throw off the yoke of <span-9 the inferior white devil. 9-/span> This is the same Farrakhan with whom then-Sen. Barack Obama took a photo at a 2005 Congressional Black Caucus meeting, <span-13 a photo that was subsequently suppressed in order to protect Obama\'s political future.  13-/span>“Jews were responsible for 

In [11]:
class A:
    def __init__(self,content):
        self.content = content

sample_row = A(content_df[content_df["article_id"] == "736231219"].iloc[0].content)
        
get_propaganda_sequence(sample_row)

[('Farrakhan', 'O'),
 ('Speech', '11'),
 (':', '11'),
 ("'", '11'),
 ('Jews', '11'),
 ('Are', '11'),
 ('My', '11'),
 ('Enemy', '11'),
 (',', '11'),
 ("'", '11'),
 ('White', '9'),
 ('Folks', '9'),
 ('Are', 'O'),
 ('Going', 'O'),
 ('Down', 'O'),
 ("'", 'O'),
 ('With', 'O'),
 ('the', 'O'),
 ('leftist', '9'),
 ('media', '9'),
 ('entirely', 'O'),
 ('focused', 'O'),
 ('on', 'O'),
 ('the', 'O'),
 ('push', 'O'),
 ('to', 'O'),
 ('ban', 'O'),
 ('AR-15s', 'O'),
 ('and', 'O'),
 ('repeal', 'O'),
 ('the', 'O'),
 ('Second', 'O'),
 ('Amendment', 'O'),
 (',', 'O'),
 ('practically', 'O'),
 ('no', 'O'),
 ('one', 'O'),
 ('noticed', 'O'),
 ('Louis', 'O'),
 ('Farrakhan', 'O'),
 ("'s", 'O'),
 ('Saviours', 'O'),
 ('’', 'O'),
 ('Day', 'O'),
 ('2018', 'O'),
 ('Address', 'O'),
 ('in', 'O'),
 ('which', 'O'),
 ('he', 'O'),
 ('told', 'O'),
 ('an', 'O'),
 ('approving', 'O'),
 ('audience', 'O'),
 ('that', 'O'),
 ('powerful', '11'),
 ('Jews', '11'),
 ('are', '11'),
 ('my', '11'),
 ('enemy', '11'),
 (',', '11'),
 ('"',

-----------------

The 2nd helper function get_technique_id get all start token and end token from a given span

In [12]:
get_technique_id("<span-11")

{'start_token': ['11'], 'end_token': []}

----------------------------------------------

Next we use the helper function to do it for all articles

In [13]:
content_df["tagged_sequence_with_propaganda_types"] = content_df.swifter.apply(get_propaganda_sequence,axis=1)

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=371, style=ProgressStyle(description_width…




In [14]:
content_df["tagged_sequence_with_propaganda_types"][1][:10]

[('The', 'O'),
 ('Last', 'O'),
 ('-', 'O'),
 ('Minute', 'O'),
 ('Character', '11'),
 ('Assassination', '11'),
 ('of', 'O'),
 ('Judge', 'O'),
 ('Kavanaugh', 'O'),
 ('Using', 'O')]

---------------------

As the first task in the competitoin is just to identify the start and end of span , we write a function to convert the about output into just propaganda or not 

Example a entry :

 ("Trump",11) turns into ("Trump",I)
 
 where 11 is the propaganda id

In [15]:
def propaganda_type_seq_to_simple_yes_or_no_seq(taggings):
    simple_taggings = []
    for i in taggings:
        if i[1] != "O":
            simple_taggings.append((i[0],"I"))
        else:
            simple_taggings.append(i)
    return simple_taggings        

In [16]:
def propaganda_type_seq_to_simple_yes_or_no_seq_bio(taggings):
    simple_taggings = []
    for ix,i in enumerate(taggings):
        if i[1] != "O" and taggings[ix-1][1] == "O" and ix > 1:
            simple_taggings.append((i[0],"B-I"))
        elif i[1] != "O":    
            simple_taggings.append((i[0],"I"))
        else:
            simple_taggings.append(i)
    return simple_taggings        

In [17]:
content_df["tagged_sequence"] = content_df.tagged_sequence_with_propaganda_types.swifter.apply(propaganda_type_seq_to_simple_yes_or_no_seq)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=371, style=ProgressStyle(description_width…




### Then we split the the above rows of articles into train and test to be used in our training 

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train_df , test_df =  train_test_split(content_df,test_size=0.2)

In [20]:
train_tag_array = train_df.tagged_sequence.values.tolist()
test_tag_array = test_df.tagged_sequence.values.tolist()

We now create file for each of test and train , Only thing we additionaly do is remove $$$$$ we have added earlier for sentence end markings . Plus in the text file we add SOA when a new article starts and EOA when an article ends , we also add SOS at start of sentence and EOS at end of sentence.

In [21]:
final_string = ""
count = len(train_tag_array)
for i in train_tag_array:
    final_string += "SOA O\nSOS O\n"
    for k in i:
        final_string = final_string + k[0].replace("$$$$$$","") + " " + k[1] + "\n"
        if "$$$$$$" in k[0]:
            final_string = final_string + "EOS O\n\nSOS O\n"
    final_string += "EOA O\n\n"

In [22]:
with open('train_task.txt','w') as f:
    f.write(final_string)

In [23]:
test_final_string = ""
for i in test_tag_array:
    test_final_string += "SOA O\nSOS O\n"
    for k in i:
        test_final_string = test_final_string + k[0].replace("$$$$$$","") + " " + k[1] + "\n"
        if "$$$$$$" in k[0]:
            test_final_string = test_final_string + "EOS O\n\nSOS O\n"
    test_final_string += "EOA O\n\n"    

In [24]:
with open('dev_task.txt','w') as f:
    f.write(test_final_string)

## Prepare test dataset

Now for preparing the test set , we just split article into words and add SOA , EOA , SOS , EOS

In [25]:
dev_articles_path = "/data/semeval-2020/task-11/datasets/dev-articles"

In [26]:
import pandas as pd
import glob
import swifter
all_files = glob.glob(dev_articles_path + "/*.txt")

dev_content_tuple = []

for filename in all_files:
    file = open(filename)
    content = "".join(file.readlines()).replace("\n"," ")
    article_id = filename.split("article")[-1].split(".txt")[0]
    dev_content_tuple.append((article_id,content))

In [27]:
dev_content_tuple[0]

('779309765',
 "Unbelievable! Sharia New Mexico: Islamic compound jihadis RELEASED on bond after charges of “Islamophobia” and “racism” (Islam is not a race)  Editor's Note: Talk about injustice! There were remains of a 4-year-old boy found there, who allegedly died while they were performing some sort of Islamic ritual over him. The Bundys and their supporters got nearly two years in jail and no one hurt a single person! Shame on those who are supposed to uphold law and justice! Shame! Taos County Sheriff Jerry Hogrefe testified that they found children holding boxes of ammunition, and that one child was found with a gun…. Because sharia trumps dead children, school shooting training, kidnapping and jihad training. The children discovered at an “extremist Muslim” compound in New Mexico earlier this month were both trained to use firearms and taught multiple tactical techniques in order to kill teachers, law enforcement and other institution…. state prosecutors said on Monday. take our

In [28]:
len(dev_content_tuple)

75

In [29]:
from spacy.lang.en import English # updated
from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated

article_id_sentence = [(i[0],str(k)) for i in dev_content_tuple for k in nlp(i[1]).sents]

In [30]:
dev_article_text = ""
for i in dev_content_tuple:
    dev_article_text += "SOA\n"
    for sent in nlp(i[1]).sents:
        dev_article_text += "SOS\n"
        tokens = str(sent).split(" ")
#         tokens = tokenizer.tokenize(str(sent))
        for token in tokens:
            for tok in nlp(token):
                dev_article_text = dev_article_text + tok.text + "\n"
        dev_article_text = dev_article_text + "EOS\n\n"
    dev_article_text = dev_article_text + "EOA\n\n"    

In [31]:
# dev_article_text = ""
# for combo in article_id_sentence:
#     dev_article_text += "SOA O\n"
#     for token in combo[1].split(" "):
#         dev_article_text = dev_article_text + token + "\n"
#     dev_article_text = dev_article_text + "\n"    

In [32]:
with open('test_task.txt',"w") as f:
    f.write(dev_article_text)

In [33]:
dev_article_text.split("\n\n")[0]

'SOA\nSOS\nUnbelievable\n!\nEOS'

## Post processing for bert transformers



Out of any model will be similar where each line will contain word and label (I or O) seperated by a tab. We have to covert it into a article again using SOA , SOS , EOS and EOA.

In [34]:
final_prediction = open("final_prediction.txt").readlines()

In [35]:
article_ids = [tup[0] for tup in dev_content_tuple]

In [36]:
article_prediction = []
running_article = []
for line in final_prediction:
    if line.startswith("SOA") and len(running_article) != 0:
        article_prediction.append(running_article)
        running_article = []
    else: 
        running_article.append(line)
article_prediction.append(running_article)        

In [39]:
dev_content_tuple[-1]

('787966255',
 "Warrants Show Police Never Searched Amber Guyger’s Apartment, Now It’s Too Late, She’s ‘Vacated’ It  Dallas, TX — On Sunday, activists rallied around the Dallas Cowboy’s stadium to draw attention to the slaying of Botham Jean. They carried coffins and blocked traffic to expose the special treatment of Amber Guyger, the cop who has yet to even be fired for killing Jean. Now, as more details emerge, this special treatment has moved to a level that is even more insidious. Protest at AT&T Stadium pic.twitter.com/lu5VsGQByv — Allison Harris (@AllisonFox4News) September 16, 2018 take our poll - story continues below Who should replace Nikki Haley as our ambassador to the U.N.? Who should replace Nikki Haley as our ambassador to the U.N.? Who should replace Nikki Haley as our ambassador to the U.N.? * John Bolton Richard Grenell Dina Powell Heather Nauert Ivanka Trump Email * Email This field is for validation purposes and should be left unchanged. Completing this poll grants 

-----------------

After joining back the words into sentences and articles , we have make labels from word level to char level , for example :

 ("Trump","O")
 should be converted to 
 ("Trump,"OOOOO")
 
 This is necessary as we have to find the start span char index and end span char index

In [40]:
def create_equi_length_seq(line):
    cleaned_line = line.replace("\n","").split(" ")
    token = cleaned_line[0].strip()
    if len(token) == 0 or token.startswith("SOA") or token.startswith("SOS") or token.startswith("EOA") or token.startswith("EOS"):
        return (" "," ")
    token_label = cleaned_line[1].strip()
    len_token = len(token)
    full_token_label = []
    for i in range(0,len_token):
        full_token_label.append(token_label)
    return (token,"".join(full_token_label))

Few examples of this output is as follows

In [41]:
create_equi_length_seq('trust O\n')

('trust', 'OOOOO')

In [42]:
create_equi_length_seq(' \n')

(' ', ' ')

In [43]:
final_sequence_tups = []
running_article = []
for ap in article_prediction:
    for word_tup in ap:
        resized_tup = create_equi_length_seq(word_tup)
        if (resized_tup[0] != " "):
            running_article.append(resized_tup)
    final_sequence_tups.append(running_article)

In [44]:
final_prediction_seq = []
for article_words_tup in final_sequence_tups:
    label_seq = []
    for word_tup in article_words_tup:
        label_seq.append(word_tup[1])
    final_prediction_seq.append(" ".join(label_seq))
    label_seq = []

In [45]:
article_final_sequence = list(zip(article_ids,final_prediction_seq))

Once we have turned the char level , we have to get the propaganda spans from each article ,
the following function does this

In [46]:
def get_spans_from_label_seq(article_id_label_seq_tup):
    article_id = article_id_label_seq_tup[0]
    label_sequence = article_id_label_seq_tup[1]
    spans = []
    current_running_start_span = None
    is_span_running = False
    for ix,label in enumerate(label_sequence):
        if is_span_running:
            if label == "O":
                spans.append((current_running_start_span,ix-1))
                current_running_start_span = None
                is_span_running = False
        else:
            if label == "I":
                current_running_start_span = ix
                is_span_running = True
    return article_id, spans            

In [47]:
get_spans_from_label_seq(article_final_sequence[0])

('779309765', [(158816, 159110), (188194, 188281), (335245, 335661)])

Finally create a submission file and write it into a file

In [48]:
final_submission_list =  [get_spans_from_label_seq(article_tup) for article_tup in article_final_sequence]

In [49]:
final_submission_strings = []
for fsl in final_submission_list:
    article_id = fsl[0]
    for span in fsl[1]:
        final_submission_strings.append(str(article_id) + "\t" + str(span[0]) + "\t" + str(span[1]))

In [50]:
final_submission_content = "\n".join(final_submission_strings)

In [51]:
final_submission_content = "id\tbegin_offset\tend_offset\n" + final_submission_content

In [52]:
with open("final_submission.txt","w") as f:
    f.writelines(final_submission_content)

--------------------

### Compressing the output due to file size limit

The idea of compression is , for a article , if two spans are close to each other by n chars , we can combine them into one span , For the n char 
various values was tried and span_diff of 310 was just right to make the file size less 500 kb / 22000 lines

In [54]:
article_spans = open("final_submission.txt","r").readlines()

In [55]:
span_diff = 310
new_article_spans = []
for ix, article_span in enumerate(article_spans):
    if ix == 0:
        pass
    elif ix==1:
        new_article_spans.append(article_span.split("\t"))
    else:
        next_span_start_position =  int(article_span.split("\t")[1])
        prev_span_end_position = int(new_article_spans[-1][2])
        next_article_id =  int(article_span.split("\t")[0])
        prev_article_id = int(new_article_spans[-1][0])        
#         print(prev_span_end_position,next_span_start_position)
        if (next_article_id == prev_article_id):
            if (next_span_start_position - prev_span_end_position) < span_diff:
                to_insert_span = [new_article_spans[-1][0],new_article_spans[-1][1],article_span.split("\t")[2]]
                new_article_spans.pop()
                new_article_spans.append(to_insert_span)
            else:
                new_article_spans.append(article_span.split("\t"))
        else:
            new_article_spans.append(article_span.split("\t"))

In [56]:
print(len(new_article_spans))

225


In [57]:
article_span_strings = []
for article_span in new_article_spans:
    article_span_strings.append("\t".join(article_span))

In [58]:
condensed_final_submission = "".join(article_span_strings)

In [59]:
with open("condensed_final_submission.txt","w") as f:
    f.writelines(condensed_final_submission)