#### **This notebook checks which tweet languages are covered by the toxicity (Perspective) API**

In [1]:
import pandas as pd

In [2]:
# Location of the gzip-compressed pickle holding the combined tweet dataset,
# given relative to the notebook's working directory.
path = './../data/Gaza_twitter_combined.pkl.gz'

In [3]:
# Load the full tweet table. Unpickling can execute arbitrary code, so only
# point `path` at files from a trusted source.
df = pd.read_pickle(path)

In [4]:
df.columns

Index(['created_at', 'follower_count', 'following_count', 'id', 'imageUrls',
       'lang', 'like_count', 'linked_tweet', 'retweet_count', 'screen_name',
       'text', 'tweet_type', 'urls', 'author', 'id.1', 'date'],
      dtype='object')

In [64]:
len(df)

4570627

In [65]:
df['id'].nunique()

4570627

#### **Type of posts and number of posts**

In [6]:
df['tweet_type'].unique()

array(['retweet', 'tweet', 'reply', 'quote'], dtype=object)

In [7]:
# Count every tweet type in a single pass, then report each type's share of
# the whole dataset.
type_counts = df['tweet_type'].value_counts()
total = len(df)

rt = int(type_counts.get('retweet', 0))
twt = int(type_counts.get('tweet', 0))
reply = int(type_counts.get('reply', 0))
quote = int(type_counts.get('quote', 0))

shares = (
    ('Rtwt: ', rt),
    ('Tweet: ', twt),
    ('Reply: ', reply),
    ('Quote :', quote),
)
for label, count in shares:
    print(label, count / total)

Rtwt:  0.8777856954855427
Tweet:  0.047288916816007956
Reply:  0.061388732880631035
Quote : 0.013536654817818211


#### **Retweet**

In [8]:
# Keep only retweets that carry a reference to the tweet they retweet.
has_linked_tweet = df['linked_tweet'].notna()
is_retweet = df['tweet_type'] == 'retweet'
df_rt = df.loc[has_linked_tweet & is_retweet]

# Number of distinct tweets that were retweeted.
print(df_rt['linked_tweet'].nunique())

264139


In [9]:
# Collapse the retweets to one row per retweeted (linked) tweet.
# NOTE(review): GroupBy.first() takes the first non-null value of *each column*
# within a group, so a resulting row can combine fields from different retweet
# rows when some of them contain nulls — confirm this is acceptable here.
df_rt_text = df_rt.groupby('linked_tweet').first().reset_index()

In [10]:
len(df_rt_text)

264139

In [11]:
# Distinct tweet ids per language among the collapsed retweets,
# ordered from the most to the least frequent language.
rt_ids_per_lang = df_rt_text.groupby('lang')['id'].nunique()
df_lang_count = (
    rt_ids_per_lang
    .to_frame('count')
    .reset_index()
    .sort_values(by='count', ascending=False)
)

In [12]:
df_lang_count.loc[df_lang_count['count'] >= 100]

Unnamed: 0,lang,count
12,English,228951
1,Arabic,33315
50,Unknown,258
21,Indonesian,226
15,French,201
18,Hebrew,192
43,Spanish,163
24,Japanese,130


In [13]:
df_rt_text['lang'].unique()

array(['English', 'Arabic', 'Hebrew', 'French', 'Catalan', 'Spanish',
       'Chinese', 'Urdu', 'Hindi', 'Russian', 'Italian', 'Unknown',
       'Thai', 'Persian', 'Swedish', 'Dutch', 'Turkish', 'Portuguese',
       'Indonesian', 'Danish', 'Romanian', 'Bulgarian', 'German',
       'Tagalog', 'Marathi', 'Malagasy', 'Japanese', 'Haitian', 'Basque',
       'Polish', 'Estonian', 'Czech', 'Slovak', 'Latvian',
       'Modern Greek (1453-)', 'Tamil', 'Malay (Macrolanguage)', 'Uzbek',
       'Finnish', 'Slovenian', 'Afrikaans', 'Hungarian', 'Welsh',
       'Ukrainian', 'Pushto', 'Kinyarwanda', 'Vietnamese', 'Croatian',
       'Serbian', 'Norwegian', 'Lithuanian', 'Burmese', 'Irish',
       'Bihari Languages', 'Korean'], dtype=object)

#### **Original tweets**

In [14]:
# Everything that is not a retweet: tweets, replies and quotes.
not_retweet = df['tweet_type'].ne('retweet')
df_org = df.loc[not_retweet]

In [15]:
# Distinct tweet ids per language among the non-retweet posts,
# ordered from the most to the least frequent language.
org_ids_per_lang = df_org.groupby('lang')['id'].nunique()
df_org_lang_count = (
    org_ids_per_lang
    .to_frame('count')
    .reset_index()
    .sort_values(by='count', ascending=False)
)

In [16]:
df_org_lang_count.loc[df_org_lang_count['count'] >= 100]

Unnamed: 0,lang,count
16,English,511285
2,Arabic,36640
65,Unknown,3554
27,Indonesian,1024
19,French,775
23,Hebrew,726
56,Spanish,469
51,Russian,431
32,Kinyarwanda,371
30,Japanese,368


In [17]:
# Inspect the text of non-retweet posts labelled as Hebrew
# (row mask and column selected in a single .loc call).
df_org.loc[df_org['lang'] == 'Hebrew', 'text']

41         @Mistaclim רחוב שבטי ישראל ירושלים המזרחית כמו...
297        י"ד אלול התשפ"ג | אזור ירושלים 'צ'ולנט על הגג'...
1452       מעביר אני את ענייני מפני בית אמה של אסתר משרתת...
2612       חיפה ניצבת מול שני איומים גדולים שיחרצו את גור...
2677       בעזה יש דיבורים על להצית מחר את גבולות ישראל ע...
                                 ...                        
4176021                     @TheRealYield ווחאד סדקים בתמיכה
4187939    🔹ישראל תושמד על ידי חמושים פלסטינים 🔹זכור את ה...
4191131    طفلة8شهورأحدأهداف الاحتلال الإسرائيلي An eight...
4213880    @IsraelPersian מאַרטירדאָם אין די וועג פון געט...
4511204    הצבא הישראלי הפסיד במלחמה. אין סיבה אחרת למה ה...
Name: text, Length: 726, dtype: object

#### **Languages covered by Perspective API**

In [18]:
# Languages covered by the Perspective API, written as display names so they
# can be compared directly with the dataset's `lang` values.
languages = [
    'Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French',
    'German', 'Hindi', 'Hinglish', 'Indonesian', 'Italian', 'Japanese',
    'Korean', 'Polish', 'Portuguese', 'Russian', 'Spanish', 'Swedish',
]

In [19]:
# Of the ten most frequent languages among non-retweet posts, which ones are
# missing from the Perspective API language list?
top_org_langs = df_org_lang_count['lang'].head(10)
not_av = set(top_org_langs).difference(languages)

In [20]:
not_av

{'Hebrew', 'Kinyarwanda', 'Unknown'}

In [21]:
# Same coverage check for the ten most frequent languages among the collapsed retweets.
top_rt_langs = df_lang_count['lang'].head(10)
not_rt_av = set(top_rt_langs).difference(languages)
not_rt_av

{'Hebrew', 'Unknown'}

In [22]:
# df_org['text']

In [23]:
df.columns

Index(['created_at', 'follower_count', 'following_count', 'id', 'imageUrls',
       'lang', 'like_count', 'linked_tweet', 'retweet_count', 'screen_name',
       'text', 'tweet_type', 'urls', 'author', 'id.1', 'date'],
      dtype='object')

In [24]:
# df.loc[~df['urls'].isnull()]['urls'].tail(50)

In [25]:
# Inspect the text of collapsed retweets whose language is 'Unknown'
# (row mask and column selected in a single .loc call).
df_rt_text.loc[df_rt_text['lang'] == 'Unknown', 'text']

18413                 RT @redsteeze https://t.co/xyXoCsVqmr
18700     RT @v1rtu4lm3me5 @LauraLoomer https://t.co/VMJ...
18908               RT @toddeherman https://t.co/5KIkdvKjpE
18961     RT @EmailWmSmith @SenWarren https://t.co/96lDY...
19013     RT @v1rtu4lm3me5 @dom_lucre https://t.co/VMJmU...
                                ...                        
262919               RT @twetienne7 https://t.co/w5pMh5N3Fz
263340    RT @BABDCATHA2 @MikeCarlton01 https://t.co/lNL...
263353    RT @Marwa__Osman @elonmusk https://t.co/1hljoa...
263702           RT @MarieColemanAO https://t.co/gWddWJe5EJ
263827    RT @Blxdez_ @Usouph @syyruhhh https://t.co/taa...
Name: text, Length: 258, dtype: object

In [44]:
def test_toxicity(text):
    from googleapiclient import discovery
    import json
    
    API_KEY = 'AIzaSyC39cC5OxW-RxgsDjUck8Zbj3efWFf0UUw'
    
    client = discovery.build(
      "commentanalyzer",
      "v1alpha1",
      developerKey=API_KEY,
      discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
      static_discovery=False,
    )
    
    analyze_request = {
      'comment': { 
          'text': f'{text}'},
      'requestedAttributes': {'TOXICITY': {},
                              'SEVERE_TOXICITY': {},
                              'IDENTITY_ATTACK': {},
                              'INSULT': {},
                              'PROFANITY': {},
                              'THREAT': {}
                             }
    }
    
    response = client.comments().analyze(body=analyze_request).execute()

    return response

#### **All data to be considered**

In [27]:
df['tweet_type'].unique()

array(['retweet', 'tweet', 'reply', 'quote'], dtype=object)

In [59]:
df.columns

Index(['created_at', 'follower_count', 'following_count', 'id', 'imageUrls',
       'lang', 'like_count', 'linked_tweet', 'retweet_count', 'screen_name',
       'text', 'tweet_type', 'urls', 'author', 'id.1', 'date'],
      dtype='object')

In [61]:
df.loc[df['id.1'] == '0004e3f6c40ec8f3d13a2311481c1e5f']

Unnamed: 0,created_at,follower_count,following_count,id,imageUrls,lang,like_count,linked_tweet,retweet_count,screen_name,text,tweet_type,urls,author,id.1,date


In [28]:
# Assemble the posts to score: one row per retweeted tweet, plus every
# non-retweet post whose language is not 'Unknown'.
# NOTE(review): this rebinds df_rt and df_org, which earlier cells define with
# different filters, and 'Unknown' is removed only from the non-retweet part —
# confirm both are intended.
retweet_mask = df['tweet_type'] == 'retweet'
known_lang_mask = df['lang'] != 'Unknown'

df_rt = df.loc[retweet_mask]
df_rt_org = df_rt.groupby('linked_tweet').first().reset_index()
df_org = df.loc[~retweet_mask & known_lang_mask]

df_need = pd.concat([df_rt_org, df_org], ignore_index=True)

In [55]:
len(df_rt_org)

264139

In [57]:
# Persist the one-row-per-retweeted-tweet frame as a gzip-compressed pickle.
# NOTE(review): this writes under ./data/ while the input dataset is read from
# ./../data/ — confirm the output directory is the intended one.
df_rt_org.to_pickle('./data/posts/retweet_original.pkl.gz')

In [29]:
import time

In [30]:
df_need['lang'].unique()

array(['English', 'Arabic', 'Hebrew', 'French', 'Catalan', 'Spanish',
       'Chinese', 'Urdu', 'Hindi', 'Russian', 'Italian', 'Unknown',
       'Thai', 'Persian', 'Swedish', 'Dutch', 'Turkish', 'Portuguese',
       'Indonesian', 'Danish', 'Romanian', 'Bulgarian', 'German',
       'Tagalog', 'Marathi', 'Malagasy', 'Japanese', 'Haitian', 'Basque',
       'Polish', 'Estonian', 'Czech', 'Slovak', 'Latvian',
       'Modern Greek (1453-)', 'Tamil', 'Malay (Macrolanguage)', 'Uzbek',
       'Finnish', 'Slovenian', 'Afrikaans', 'Hungarian', 'Welsh',
       'Ukrainian', 'Pushto', 'Kinyarwanda', 'Vietnamese', 'Croatian',
       'Serbian', 'Norwegian', 'Lithuanian', 'Burmese', 'Irish',
       'Bihari Languages', 'Korean', 'Swahili (Macrolanguage)', 'Kannada',
       'Bengali', 'Malayalam', 'Nyanja', 'Telugu',
       'Nepali (Macrolanguage)', 'Gujarati', 'Icelandic', 'Panjabi',
       'Southern Sotho', 'Bosnian', 'Azerbaijani', 'Albanian', 'Maltese'],
      dtype=object)

In [53]:
len(df_need)

819181

In [52]:
# df_test = df_need.loc[df_need['lang'] == 'Korean']

In [70]:
# Reload the posts to score from disk; this rebinds df_need and discards the
# concatenated frame built earlier in the notebook.
# NOTE(review): the outputs recorded for this frame (45605 rows, `data_lang`
# column) do not match the df_rt_org frame saved above (264139 rows, `lang`
# column), so the file was presumably produced by a different run — confirm
# which version should be scored.
# Unpickling can execute arbitrary code; only load trusted files.
df_need = pd.read_pickle(
    './data/posts/retweet_original.pkl.gz'
)

In [71]:
len(df_need)

45605

In [68]:
df_need.columns

Index(['linked_tweet', 'data_annotations', 'user_id', 'creation_date',
       'data_follower_count', 'data_following_count', 'data_id',
       'data_imageUrls', 'data_lang', 'data_like_count', 'mentioned_users',
       'data_name', 'data_retweet_count', 'data_screen_name', 'text',
       'data_translatedContentText', 'tweet_id', 'tweet_type', 'urls',
       'linked_tweet_user_id', 'hashtags'],
      dtype='object')

In [69]:
df_need['data_lang'].unique()

array(['English', 'Arabic', 'French', 'Catalan', 'Spanish', 'Russian',
       'Italian', 'Unknown', 'Persian', 'Swedish', 'Turkish',
       'Portuguese', 'Indonesian', 'Urdu', 'Japanese', 'Hindi', 'Danish',
       'Uzbek', 'Hebrew', 'German', 'Kinyarwanda', 'Tagalog', 'Malagasy',
       'Finnish', 'Chinese', 'Romanian', 'Tamil', 'Irish'], dtype=object)

In [54]:
import re  # stdlib; used only to redact the API key from logged errors

# Perspective attributes stored on every row, in the column order used for the results.
SCORE_ATTRIBUTES = (
    'INSULT',
    'THREAT',
    'TOXICITY',
    'SEVERE_TOXICITY',
    'PROFANITY',
    'IDENTITY_ATTACK',
)

# Score every post, one request per row. A failed request (for example a
# language the API rejects) is logged and stored as missing scores, so one bad
# row does not abort the whole run.
all_row = []
for index, row in df_need.iterrows():
    # Work on an explicit copy: rows yielded by iterrows() should not be modified.
    scored_row = row.copy()
    try:
        attribute_scores = test_toxicity(row['text'])['attributeScores']
        values = {
            name: attribute_scores[name]['summaryScore']['value']
            for name in SCORE_ATTRIBUTES
        }
    except Exception as e:
        # The client's error text contains the request URL including the
        # `key=` query parameter; redact it so the credential never reaches
        # the notebook output.
        print(f'row {index}:', re.sub(r'key=[^&\s"\']+', 'key=REDACTED', str(e)))
        values = dict.fromkeys(SCORE_ATTRIBUTES)

    for name, value in values.items():
        scored_row[name] = value
    all_row.append(scored_row)

    # Pause between requests to stay within the API request rate.
    time.sleep(1)

<HttpError 400 when requesting https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key=REDACTED&alt=json returned "Attribute THREAT does not support request languages: ga". Details: "[{'@type': 'type.googleapis.com/google.commentanalyzer.v1alpha1.Error', 'errorType': 'LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE', 'languageNotSupportedByAttributeError': {'detectedLanguages': ['ga'], 'attribute': 'THREAT'}}]">
<HttpError 400 when requesting https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key=REDACTED&alt=json returned "Attribute TOXICITY does not support request languages: ilo". Details: "[{'@type': 'type.googleapis.com/google.commentanalyzer.v1alpha1.Error', 'errorType': 'LANGUAGE_NOT_SUPPORTED_BY_ATTRIBUTE', 'languageNotSupportedByAttributeError': {'detectedLanguages': ['ilo'], 'attribute': 'TOXICITY'}}]">


KeyboardInterrupt: 

In [None]:
len(all_row)

In [None]:
# Combine the per-row results collected by the scoring loop into one DataFrame
# (one row per scored post, including the six score columns added in the loop).
df_scores = pd.DataFrame(all_row)