# Prepare manual eval

In [21]:
import pandas as pd

In [22]:
# read pandas dataframe from pickle file
df = pd.read_pickle('../data/user_classification/user_age_gender_location_manual_eval.pkl')
print(df.shape)
print(df.columns)

(1200, 37)
Index(['tweet_id', 'user_id', 'created_at', 'tweet', 'status', 'regex_type',
       'regex_idx', 'age_raw', 'year_tweet', 'age_in_2023', 'age_when_tweeted',
       'username', 'full_name', 'location', 'join_year', 'join_month', 'bio',
       'join_day', 'tweets', 'following', 'followers', 'likes', 'loc_count',
       'location_clean', 'foreign_country', 'all_regions', 'region_pos',
       'region', 'term_for_italy', 'name_city_engl', 'condition', 'city_id',
       'all_cities', 'city_pos', 'region_code', 'is_male', 'user_has_image'],
      dtype='object')


In [23]:
# drop all columns but "full name" and "is_male"
dfp = df.drop([c for c in df.columns if (c != "full_name" and c != "is_male" and c != "user_id" and c != "username")], axis=1)
# sort by is_male
dfp.sort_values('is_male', inplace=True)

In [24]:
# save to excel file
dfp.to_excel('../data/user_classification/gender_manual_eval_v2.xlsx', index=False)

# Evaluate Gender Labels

In [33]:
import pandas as pd

pred_col = "is_male"
annot1_col = "lore"
annot2_col = "mari"
annot3_col = "ema"

dfgen = pd.read_excel("../data/user_classification/manual_eval_gender.xlsx")
dfgen[pred_col] = dfgen[pred_col].astype(int).astype(str)
dfgen[annot1_col] = dfgen[annot1_col].fillna(dfgen[pred_col]).astype(str)
dfgen[annot2_col] = dfgen[annot2_col].fillna(dfgen[pred_col]).astype(str)
dfgen[annot3_col] = dfgen[annot3_col].fillna(dfgen[pred_col]).astype(str)

In [38]:
print("Raw agreement score between annotators:")
ag12 = sum(dfgen[annot1_col] == dfgen[annot2_col]) / len(dfgen)
ag13 = sum(dfgen[annot1_col] == dfgen[annot3_col]) / len(dfgen)
ag23 = sum(dfgen[annot2_col] == dfgen[annot3_col]) / len(dfgen)
print("Annotator 1 vs. Annotator 2: ", ag12)
print("Annotator 1 vs. Annotator 3: ", ag13)
print("Annotator 2 vs. Annotator 3: ", ag23)

ag_avg = (ag12 + ag13 + ag23) / 3
print("Average agreement score: ", ag_avg)
print("\n")

print("Raw agreement score between the automatic labelling and each annotator:")
ag1p = sum(dfgen[pred_col] == dfgen[annot1_col]) / len(dfgen)
ag2p = sum(dfgen[pred_col] == dfgen[annot2_col]) / len(dfgen)
ag3p = sum(dfgen[pred_col] == dfgen[annot3_col]) / len(dfgen)
print("Automatic vs. Annotator 1: ", ag1p)
print("Automatic vs. Annotator 2: ", ag2p)
print("Automatic vs. Annotator 3: ", ag3p)

ag_avg = (ag1p + ag2p + ag3p) / 3
print("Average agreement score: ", ag_avg)


Raw agreement score between annotators:
Annotator 1 vs. Annotator 2:  0.9991666666666666
Annotator 1 vs. Annotator 3:  1.0
Annotator 2 vs. Annotator 3:  0.9991666666666666
Average agreement score:  0.9994444444444444


Raw agreement score between the automatic labelling and each annotator:
Automatic vs. Annotator 1:  0.9941666666666666
Automatic vs. Annotator 2:  0.9933333333333333
Automatic vs. Annotator 3:  0.9941666666666666
Average agreement score:  0.9938888888888888


In [31]:


# compute Cohen's k agreement score between annotators
from sklearn.metrics import cohen_kappa_score

print("Cohen's k agreement score between annotators:")
print("annot1 and annot2:", cohen_kappa_score(dfgen[annot1_col], dfgen[annot2_col]))
print("annot1 and annot3:", cohen_kappa_score(dfgen[annot1_col], dfgen[annot3_col]))
print("annot2 and annot3:", cohen_kappa_score(dfgen[annot2_col], dfgen[annot3_col]))

print("AVERAGE:", (cohen_kappa_score(dfgen[annot1_col], dfgen[annot2_col]) + cohen_kappa_score(dfgen[annot1_col], dfgen[annot3_col]) + cohen_kappa_score(dfgen[annot2_col], dfgen[annot3_col])) / 3)



Cohen's k agreement score between annotators:
annot1 and annot2: 0.9982112272322766
annot1 and annot3: 1.0
annot2 and annot3: 0.9982112272322766
AVERAGE: 0.9988074848215177


In [32]:
# compute precision, recall and f1-score of predictions against each annotator, and their average
from sklearn.metrics import classification_report

print("Classification report of predictions against annotators:")
print("annot1:")
print(classification_report(dfgen[pred_col], dfgen[annot1_col]))
print("annot2:")
print(classification_report(dfgen[pred_col], dfgen[annot2_col]))
print("annot3:")
print(classification_report(dfgen[pred_col], dfgen[annot3_col]))

print("AVERAGE:")
print(classification_report(dfgen[pred_col], dfgen[annot1_col], output_dict=True)["macro avg"])

Classification report of predictions against annotators:
annot1:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       430
           1       1.00      0.99      1.00       770
         unk       0.00      0.00      0.00         0

    accuracy                           0.99      1200
   macro avg       0.67      0.66      0.66      1200
weighted avg       1.00      0.99      1.00      1200

annot2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       430
           1       1.00      0.99      0.99       770
         unk       0.00      0.00      0.00         0

    accuracy                           0.99      1200
   macro avg       0.66      0.66      0.66      1200
weighted avg       1.00      0.99      1.00      1200

annot3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       430
           1       1.00      0.99      1.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Location

In [40]:
df = pd.read_pickle("../data/user_classification/user_age_gender_location.pkl")

In [139]:
living_abroad = (df.foreign_country == '1').sum()
living_in_italy = (df.foreign_country == '0').sum()
mentioning_location = df[df.location!=''].location.shape[0]
print(f"Share users living in Foreign Country: {living_abroad / len(df) * 100:.3f}%")
print(f"Share users living in Foreign Country over total mentions: {living_abroad / mentioning_location * 100:.3f}%")

Share living in Foreign Country: 2.599%
Share living in Foreign Country over total mentions: 4.805%


In [153]:
from langdetect import detect

def detect_language(text):
    try:
        language = detect(text)
    except:
        language = "error"
        print("This row throws and error:", text)
    return language

# detect language of each tweet
df['lang'] = df['tweet'].apply(lambda x: detect_language(str(x)))
# the tweet throws an error when trying to detect the language is italian, so we manually set it to italian
df.loc[df.lang == 'error', 'lang'] = 'it'


This row throws and error: Oggi compio 21 anni ! ❤️💙😘😍💛💜💚😋✌️🐯✌️✌️😋💚💜💜😍😘❤️❤️❤️😍💛💪👏💥👨


In [158]:
# count share of tweets in italian
print(f"Share of tweets in italian: {df[df.lang=='it'].shape[0] / df.shape[0] * 100:.3f}%")
# count share of users who posted at least a tweet in italian
print(f"Share of users who posted at least a tweet in italian: {df[df.lang=='it'].user_id.nunique() / df.user_id.nunique() * 100:.3f}%")

Share of tweets in italian: 96.698%
Share of users who posted at least a tweet in italian: 96.698%


In [127]:
# print all value counts of location
pd.set_option('display.max_rows', None)
df[df.location!=''].location.value_counts().sum()

10927