In [2]:
import pandas as pd
import os

### Parsing conversations

In [73]:
all_columns=['conversation_id', 'id','author_id','text','in_reply_to_user_id']
# Lower-case directory name: every later cell reads from "parsed_conversations",
# and on a case-sensitive filesystem "Parsed_conversations" is a different folder.
PARSED_DIR="parsed_conversations"


def _parse_tweet(tweet, columns):
    """Flatten one tweet object into a row.

    The row holds the requested ``columns`` (missing keys become ''), then the id of
    the tweet this one replies to ('' when it is not a reply), then '1'/'0'
    depending on whether the tweet has an ``attachments`` entry.
    """
    tweet_info=[tweet.get(column,'') for column in columns]

    # A tweet can reference several tweets (e.g. it quotes one and replies to
    # another), so scan every reference instead of only the first one.
    replied_to=''
    for reference in tweet.get('referenced_tweets') or []:
        if reference.get('type')=='replied_to':
            replied_to=reference.get('id','')
            break
    tweet_info.append(replied_to)

    tweet_info.append('1' if tweet.get('attachments','')!='' else '0')
    return tweet_info


def _tweet_batches(pages):
    """Yield lists of tweet objects: first every page's ``data``, then every page's ``includes['tweets']``."""
    for page_data in pages['data']:
        # pages without a "data" list (NaN after read_json) carry no tweets
        if isinstance(page_data, list):
            yield page_data
    if 'includes' in pages.columns:
        for included in pages['includes']:
            # "includes" may be absent for a page or may hold no "tweets" entry
            if isinstance(included, dict):
                yield included.get('tweets', [])


os.makedirs(PARSED_DIR, exist_ok=True)
for file in os.listdir("conversations"):
    # splitext does not fail on names without a dot (sub-folders, README, ...)
    name, extension = os.path.splitext(file)
    if extension!=".json":
        continue

    print("--------------Processing file {0}--------------".format(file))
    df=pd.read_json(os.path.join("conversations",file),lines=True)
    all_tweets=[]
    i=100  # next progress threshold, multiplied by 10 each time it is passed

    for batch in _tweet_batches(df):
        all_tweets.extend(_parse_tweet(tweet, all_columns) for tweet in batch)

        if len(all_tweets)>i:
            print("--------------Processed more than {0} tweets--------------".format(i))
            i=i*10

    # the same tweet can appear both in "data" and in "includes", hence drop_duplicates
    df_out = pd.DataFrame(all_tweets,columns=all_columns+['replied_to', 'attachment'])
    df_out = df_out.drop_duplicates()
    df_out.to_csv(os.path.join(PARSED_DIR, name+".csv"), index=False)

--------------Processing file Australia_ids.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processing file Britain_ids.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file Canada_ids.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file ICRC_library.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processing file IFRC_Europe.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 10

### Parsing users info

In [21]:
# Flatten users.jsonl (one API response page per line) into labeled/users.csv.
df=pd.read_json("users.jsonl",lines=True)
all_users=[]
public_columns=['followers_count', 'following_count','tweet_count']
additional_columns=['id','name', 'username', 'description','created_at']
for line in df['data']:
    for user in line:
        metrics=user.get('public_metrics')
        if not isinstance(metrics, dict):
            metrics={}

        missing=[column for column in public_columns if column not in metrics]
        if missing:
            # Report the user, but still emit one value per column: a shorter row
            # would shift the remaining fields under the wrong column headers.
            print("Missing public metrics {0} for user: {1}".format(missing, user))

        user_info=[metrics.get(column) for column in public_columns]
        user_info.extend(user[column] for column in additional_columns)

        all_users.append(user_info)

df_out = pd.DataFrame(all_users,columns=public_columns+additional_columns)
df_out = df_out.drop_duplicates()
os.makedirs("labeled", exist_ok=True)
df_out.to_csv("labeled/users.csv", index=False)

### Concatenating old and new files

In [89]:
# Maps the file-name prefix of a parsed conversation file to the NGO account name
# used for the merged "<name>_all.csv" output.
ivan_dict = {
    "nepal_ids": "NepalRedCross",
    "Nairobi_ids": "ICRC_Nairobi",
    "Canada_ids": "redcrosscanada",
    "Britain_ids": "BritishRedCross",
    "Australia_ids": "RedCrossAU",
    "Ireland_ids": "irishredcross",
}

In [90]:
NGO_list = ["Australia_ids", "Britain_ids", "Canada_ids", "Ireland_ids", "Nairobi_ids", "nepal_ids", "RedCrossLebanon", "RedCross", "philredcross", "IFRC_Europe", "ICRC_library"]

# Only CSV inputs are candidates. The "*_all.csv" files are this cell's own output:
# leaving them in would merge them back in on a re-run (e.g. "NepalRedCross_all.csv"
# contains "RedCross") and duplicate the data.
files=[file for file in os.listdir("parsed_conversations")
       if file.endswith(".csv") and not file.endswith("_all.csv")]

for ngo in NGO_list:
    # Other NGO names that contain this one (e.g. "RedCrossLebanon" contains
    # "RedCross"); their files must not end up in this NGO's group.
    longer_names=[other for other in NGO_list if other!=ngo and ngo in other]
    group=[file for file in files
           if ngo in file and not any(other in file for other in longer_names)]

    if not group:
        # Without this guard the previous NGO's frame would be written under this name.
        print("No files found for {0}, skipping".format(ngo))
        continue

    # dtype='str' keeps the long tweet/user ids exact; concatenate once, not per file
    df=pd.concat([pd.read_csv("parsed_conversations/"+element, dtype='str') for element in group])
    df.to_csv("parsed_conversations/"+ivan_dict.get(ngo,ngo)+"_all.csv",index=False)
        

## Label the data:

In [3]:
# Account name -> user id (kept as strings) for the Red Cross / ICRC accounts.
# NOTE(review): "RedCrossLebanon" and "ICRC_library" carry the same id below;
# this looks like a copy/paste slip -- confirm the correct id for one of them.
ICRC_dict = {
    "RedCrossAU": "138418895",
    "BritishRedCross": "7400692",
    "philredcross": "32391821",
    "redcrosscanada": "16434613",
    "ICRC_Nairobi": "1151791608494473219",
    "NepalRedCross": "566269074",
    "RedCrossLebanon": "2548704956",
    "ICRC_library": "2548704956",
    "RedCross": "6519522",
    "irishredcross": "25411906",
    "IFRC_Europe": "937963107456036865",
}

# Account name -> user id (kept as strings) for the additional, non-Red-Cross NGOs.
AdditionalNGO_dict = {
    "Ch_JesusChrist": "10047382",
    "ymca": "309679834",
    "SalvationArmyUS": "16729099",
    "boyscouts": "20685982",
    "girlscouts": "103018203",
    "Habitat_org": "33898911",
    "SavetheChildren": "14055301",
    "WorldVision": "14086764",
    "RESCUEorg": "22053725",
    "CatholicRelief": "14496886",
}

In [None]:
def do_labeling(org_ids, name_suffix="", output_path="labeled/labeled_data.csv"):
    """Label tweets by whether the NGO replied to them and write them to one CSV.

    Every ``<NGO name><name_suffix>.csv`` file in ``parsed_conversations`` whose NGO
    name is a key of ``org_ids`` is processed:

    * label 1 -- tweets whose id is the ``replied_to`` target of a tweet written by
      the NGO (excluding the NGO's replies to itself);
    * label 0 -- tweets by other users addressed to the NGO
      (``in_reply_to_user_id`` equals the NGO id) that are not such a target.

    Rows with any missing value are dropped from both groups.

    Parameters
    ----------
    org_ids : dict
        NGO account name -> user id, both strings.
    name_suffix : str, optional
        Suffix between the NGO name and ``.csv`` (e.g. ``"_all"`` for the merged
        ``<name>_all.csv`` files). Files without it are ignored.
    output_path : str, optional
        Destination CSV. Its directory is created when missing. Pass distinct
        paths when labeling several dictionaries, otherwise each call replaces
        the previous result.

    Returns
    -------
    None
    """
    #DataFrames with the labeled replies of each NGO, concatenated once at the end
    labeled_frames = []

    # sorted() makes the row order of the output independent of the filesystem
    for file in sorted(os.listdir('parsed_conversations')):
        stem, extension = os.path.splitext(file)
        if extension != '.csv' or not stem.endswith(name_suffix):
            continue

        NGO_name = stem[:len(stem) - len(name_suffix)]
        if NGO_name not in org_ids:
            # the folder also holds files of other NGO groups; report and move on
            print("skipped file (no id for this NGO): {0}".format(file))
            continue
        NGO_id = org_ids[NGO_name]

        #added terminator to fix tokenizer error for some files
        df = pd.read_csv(os.path.join("parsed_conversations", file), lineterminator='\n', dtype='str')

        #get the tweets on which users got replies
        ngo_replied_to_ids = df[(df["author_id"] == NGO_id) & (df["in_reply_to_user_id"] != NGO_id)]["replied_to"].dropna()
        # assign() returns a new frame, so no column is set on a filtered slice
        ngo_replied_to = df[df['id'].isin(ngo_replied_to_ids)].dropna().assign(label=1)

        #get the tweets on which ngo didn't reply
        replies_to_ngo = df[(df["in_reply_to_user_id"] == NGO_id) & (df["author_id"] != NGO_id)]
        replies_to_ngo = replies_to_ngo[~replies_to_ngo['id'].isin(ngo_replied_to_ids)].dropna().assign(label=0)

        #concatenate these tweets together
        all_ngo_replies = pd.concat([replies_to_ngo, ngo_replied_to]).reset_index(drop=True)
        all_ngo_replies["relatedNGO"] = NGO_name

        labeled_frames.append(all_ngo_replies)

        print("processed file: {0}".format(file))

    # pd.concat raises on an empty list, so fall back to an empty frame
    if labeled_frames:
        all_replies = pd.concat(labeled_frames).reset_index(drop=True)
    else:
        all_replies = pd.DataFrame()

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # the index is written as the first column, as before
    all_replies.to_csv(output_path)

In [None]:
#Label the tweets of the accounts listed in ICRC_dict.
#The result is written to "labeled/labeled_data.csv"; any later do_labeling call
#writes to that same file.
do_labeling(ICRC_dict)

## Label the data (additional NGOs):

In [None]:
#Label the tweets of the accounts listed in AdditionalNGO_dict.
#NOTE: this writes to "labeled/labeled_data.csv" as well, so it replaces the output
#of an earlier do_labeling call -- copy or rename that file first if it is still needed.
do_labeling(AdditionalNGO_dict)