The code provided below can be used for train, test and dev data. Only the path of the files will need to be changed and certain blocks will not be require to run in case of test data. Translation of languages has been done using Google Translator.

# Importing Libraries

In [102]:
import pandas as pd
import numpy as np
import os
import regex as re
import spacy as sy
import string
from urllib.parse import urlparse
from nltk.tokenize import TweetTokenizer
from deep_translator import GoogleTranslator


nlp_en = sy.load('en_core_web_sm')
all_stopwords = nlp_en.Defaults.stop_words




# Loading Labels from train-labels-subtask-1.txt and News Articles from train-articles-subtask-1 into separate dataframes

#### Multiple Languages are provided so we can shift between loading data of multiple languages by changing the path

In [None]:
# loading and reading train-labels-subtask-1.txt
filename = '/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/data/en/train-labels-subtask-1.txt'
train_sub1_df = pd.read_csv(filename, header=None, sep='\t', names=['id', 'genre'])
print(train_sub1_df.head())

In [104]:
# loading and reading all .txt files in train-articles-subtask-1
dir_name = '/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/data/en/train-articles-subtask-1'

article_df = pd.DataFrame()

numbers = list()
headlines = list()
articles = list()

for file in os.walk(dir_name):
    for filename in file[2]:
        
        number = re.findall('[0-9]+', filename)
        numbers.append(number[0])                       # appending the id to the list numbers
        x = dir_name + '/' + filename
        # print(number)
        article = ''
        with open(x) as f:
            lines = f.readlines()
            for i in range(len(lines)):
                if i==0:
                    headline = lines[0]
                    headlines.append(headline)          # appending the headline of the news articles to the list headlines
                elif lines[i]=="\n":
                    continue
                else:
                    # print()
                    article = article + '' + lines[i]   
        articles.append(article)                        # appending the main body of the new artcile to the list articles

# creating a dataframe with the columns id, headline and article
article_df['id'] = numbers                             
article_df['headlines'] = headlines
article_df['articles'] = articles

# converting the id column to int32
article_df = article_df.astype({'id': 'int32'})

In [105]:
train_df = train_sub1_df.set_index('id').join(article_df.set_index('id'))   # use for training and dev data
# train_df = article_df                                                       # use for test data
print(train_df.isnull().sum())                                              # checking for null values
train_df

id           0
headlines    0
articles     0
dtype: int64


Unnamed: 0,id,headlines,articles
0,3121,The Pope Says 'Humanity Must Repent For Abusin...,We humans “must repent and modify our lifestyl...
1,3135,Russian allies are unknowingly working against...,The Secretary of State for Defence described P...
2,3134,"I know this week has been disruptive, says LIZ...",The United Kingdom is the greatest country on ...
3,3120,Vladimir Putin facing Ukraine 'humiliation' as...,"VLADIMIR PUTIN is facing ""humiliation"" in Ukra..."
4,3136,What Are the Odds Putin is Bluffing About Usin...,Eurointelligence founder Wolfgang Münchau has ...
5,3122,Florida Surgeon General Issues Warning For mRN...,"Florida Surgeon General Joseph A. Ladapo, MD, ..."
6,3123,Biden's Secret Promise To OPEC Backfires: Shel...,"In early September, United States Secretary of..."
7,3137,Central Banks Add Gold for Fifth Straight Month\n,Central banks globally added to their net gold...
8,3133,US denies 'preposterous' claim they 'sabotaged...,It comes as Donald Trump suggested that 'World...
9,3127,"Did Uncle Sam, a.k.a. Wile E. Coyote, Blow Up ...",After “parties not-so-unknown” bombed the Nord...


In [None]:
# getting count of labels in the genre column, works only for train and dev data
print(train_df.genre.value_counts())

# Translating Other Data from other languages to english

In [None]:
# Function to translate the text to english
# y - column name
# lang - language of the text provided
# df - dataframe

def translation(y, lang, df):
    translated_val = list()
    
    # translating the text to english using GoogleTranslator API
    for x in df[y]:
        try:
            # GoogleTranslator API has a limit of 5000 characters per request, so splitting the text into chunks of 5000 characters and then translating it
            if len(str(x))<5000:
                
                translation = GoogleTranslator(source=lang, target='en').translate(x)
                translated_val.append(translation)
            
            elif len(str(x))>5000 and len(str(x))<10000:
                
                split_x = [x[i:i+4999] for i in range(0, len(x), 4999)]
                translation_1 = GoogleTranslator(source=lang, target='en').translate(split_x[0])
                translation_2 = GoogleTranslator(source=lang, target='en').translate(split_x[1])
                translation = translation_1 + translation_2
                translated_val.append(translation)
                
            elif len(str(x))>10000:
                
                split_x = [x[i:i+4999] for i in range(0, len(x), 4999)]
                translation_1 = GoogleTranslator(source=lang, target='en').translate(split_x[0])
                translation_2 = GoogleTranslator(source=lang, target='en').translate(split_x[1])
                translation_3 = GoogleTranslator(source=lang, target='en').translate(split_x[2])
                translation = translation_1 + translation_2 + translation_3
                translated_val.append(translation)
                
        except Exception as e:
            # if the text is not in the language provided, then it will return a nan value or text length is more than 15000 characters
            translated_val.append(np.nan)
    
    # replacing the original text with the translated text
    df[y] = translated_val
    
    # returning the updated dataframe
    return df

In [None]:
# translating the headlines and articles to english and creating a new dataframe
train_trans_df = translation('headlines', 'en', train_df)
train_trans_df = translation('articles', 'en', train_df)

In [None]:
# dropping the rows with nan values
print(train_trans_df.isna().sum())
train_trans_df.dropna(inplace=True)
print(train_trans_df.isna().sum())

genre        0
headlines    0
articles     0
dtype: int64
genre        0
headlines    0
articles     0
dtype: int64


In [None]:
train_df.head()

Unnamed: 0_level_0,genre,headlines,articles
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26136,opinion,The US is throwing its tentacles towards Russi...,“The Hand of the State Department”\nThe influe...
264,opinion,The Draghistan War,In the newspeak of the Orwellian Atlanticist c...
26248,opinion,"""Nazi speech worthy of Goebbels"", Orban advise...","Hegedus, a member of the Hungarian prime minis..."
26231,opinion,Landings and infections: the scourge of illega...,"Taranto, 13 July - The arrival at the port of ..."
26181,opinion,Super Mario's dilemma,"When the time for the irresponsible strikes, p..."


# Preprocessing the Translated Dataframes

In [106]:
def preprocessing(x, y, df):
    
    pos_tags_final_text = list()
    er_final_text = list()
    preprocessed_text = list()

    for x in df.loc[:,y]:

        tokenizer = TweetTokenizer()
        #tokenizing
        doc = tokenizer.tokenize(x)
        
        # removing links
        tokens = [token for token in doc if not urlparse(token).scheme]
        x = ' '.join(tokens)
        doc = nlp_en(x)
        
        # removing punctuation and white space
        tokens = [token.orth_ for token in doc if not token.is_punct | token.is_space]    
        x = ' '.join(tokens)
        
        # lower case
        x = x.lower()
        doc = nlp_en(x)

        # lemmatization
        tokens = [word.lemma_ for word in doc]   
        x = ' '.join(tokens)
        doc = nlp_en(x)  
        
        # removing punctuation and white space
        tokens = [token.orth_ for token in doc if not token.is_punct | token.is_space]    
        x = ' '.join(tokens)
        doc = nlp_en(x)
        
        # removing individual letters
        tokens = [word.text for word in doc if len(word)>=2]
        x = ' '.join(tokens)  
        # print(x)
        doc = nlp_en(x)
        
        # removing stop words
        tokens = [word for word in doc if not word in all_stopwords]
        list_of_strings  = [i.text for i in tokens]
        x = ' '.join(list_of_strings)
        doc = nlp_en(x)
        
        # #removing numbers
        x = ''.join([i for i in x if not i.isdigit()])
        doc = nlp_en(x)
        
        # Part of speech tagging
        pos_tags = [(i, i.tag_) for i in doc]
        pos_tags_final_text.append(pos_tags)
        
        # entity recognition tagging
        er =  [(i, i.label_, i.label) for i in doc.ents] 
        er_final_text.append(er)
    
        
        preprocessed_text.append(x)
        
    # return list with pos tags, entity recognition and preprocessed text
    return pos_tags_final_text, er_final_text, preprocessed_text

In [107]:
# preprocessing the headlines
pos_tags_final_headlines, er_final_headlines, preprocessed_headlines = preprocessing(headline, 'headlines', train_df)

train_df['preprocessed_headlines'] = preprocessed_headlines
train_df['pos_tags_headlines'] = pos_tags_final_headlines
train_df['er_tags_headlines'] = er_final_headlines

In [108]:
# preprocessing the articles
pos_tags_final_articles, er_final_articles, preprocessed_articles = preprocessing(article, 'articles', train_df)

train_df['preprocessed_articles'] = preprocessed_articles
train_df['pos_tags_articles'] = pos_tags_final_articles
train_df['er_tags_articles'] = er_final_articles

In [109]:
train_df.head()

Unnamed: 0,id,headlines,articles,preprocessed_headlines,pos_tags_headlines,er_tags_headlines,preprocessed_articles,pos_tags_articles,er_tags_articles
0,3121,The Pope Says 'Humanity Must Repent For Abusin...,We humans “must repent and modify our lifestyl...,the pope say humanity must repent for abuse mo...,"[(the, DT), (pope, NN), (say, VBP), (humanity,...",[],we human must repent and modify our lifestyle ...,"[(we, PRP), (human, NN), (must, MD), (repent, ...","[((thursday), DATE, 391), ((the, world, day), ..."
1,3135,Russian allies are unknowingly working against...,The Secretary of State for Defence described P...,russian ally be unknowingly work against they ...,"[(russian, JJ), (ally, NN), (be, VB), (unknowi...","[((russian), NORP, 381)]",the secretary of state for defence describe pu...,"[(the, DT), (secretary, NNP), (of, IN), (state...","[((putin), PERSON, 380), ((russian), NORP, 381..."
2,3134,"I know this week has been disruptive, says LIZ...",The United Kingdom is the greatest country on ...,know this week have be disruptive say liz truss,"[(know, VB), (this, DT), (week, NN), (have, VB...","[((this, week), DATE, 391)]",the united kingdom be the great country on ear...,"[(the, DT), (united, NNP), (kingdom, NNP), (be...","[((the, united, kingdom), GPE, 384), ((70, yea..."
3,3120,Vladimir Putin facing Ukraine 'humiliation' as...,"VLADIMIR PUTIN is facing ""humiliation"" in Ukra...",vladimir putin face ukraine humiliation as rus...,"[(vladimir, NNP), (putin, NNP), (face, NNP), (...","[((vladimir, putin), PERSON, 380), ((ukraine),...",vladimir putin be face humiliation in ukraine ...,"[(vladimir, NNP), (putin, NNP), (be, VB), (fac...","[((vladimir, putin), PERSON, 380), ((ukraine),..."
4,3136,What Are the Odds Putin is Bluffing About Usin...,Eurointelligence founder Wolfgang Münchau has ...,what be the odd putin be bluff about use nucle...,"[(what, WP), (be, VB), (the, DT), (odd, JJ), (...",[],eurointelligence founder wolfgang münchau have...,"[(eurointelligence, NN), (founder, NN), (wolfg...","[((wolfgang, münchau), PERSON, 380), ((putin),..."


In [110]:
# converting tweets_df into a csv file
filename = '/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/preprocessed_data/prev_data/en_train_subtask_1.csv'
train_df.to_csv(filename, index=True)