## Content Analysis of tweets
In this notebook, we process the annotations of sampled tweets to find the distribution of different types of content among COVID-19 related Persian tweets. We're annotating more tweets and these are some preliminary results.

In [1]:
import pandas as pd

In [2]:
root = "data/"
data = pd.read_excel(root + "doc_clusters.xlsx")

In [3]:
n_cluster = len(data.cluster.value_counts())
cluster_w = {}
for index, value in data.cluster.value_counts().items():
    cluster_w[index] = (value / 45234)
print(cluster_w)

{4: 0.46084803466419066, 7: 0.15183269222266438, 2: 0.09875315028518371, 6: 0.08776583985497635, 3: 0.08734580183048149, 1: 0.0387098200468674, 5: 0.037936065791219, 0: 0.03680859530441703}


In [4]:
ann_1 = pd.read_excel(root + "annotation/ann1.xlsx")
ann_2 = pd.read_excel(root + "annotation/ann2.xlsx")

In [5]:
# normalizing labels
ann_2.loc[ann_2['label'] == 'blame', 'label'] = "complaint"
ann_1.loc[ann_1['label'] == 'sarcasm', 'label'] = "satire"
ann_1.loc[ann_1['label'] == 'report', 'label'] = "news"

### Inter-Annotator agreement

In [12]:
assert len(ann_1) == len(ann_2)
inter_df = pd.concat([ann_1['label'], ann_2['label']], axis=1, sort=False)
agreement = 0
for index, row in inter_df.iterrows():
    if row[0] == row[1]:
        agreement += 1
print("Inter-rater agreement: {}".format(agreement/len(ann_1)))

Inter-rater agreement: 0.575


In [13]:
inter_df

Unnamed: 0,label,label.1
0,news,solution
1,neutral,opinion
2,satire,satire
3,satire,satire
4,rumor,news
...,...,...
235,opinion,news
236,news,news
237,opinion,complaint
238,complaint,complaint


### Category distribution

In [18]:
final_w = {"satire": 0, "news": 0, "opinion": 0, "complaint": 0, "solution": 0, "other": 0}
for i in range(n_cluster):
    labels = {"satire": 0, "news": 0, "opinion": 0, "complaint": 0, "solution": 0, "other": 0}
    # annotator #1
    for index, row in ann_1.iterrows():
        if row["label"] in labels:
            labels[row["label"]] += 1
        else:
            labels["other"] += 1

    # annotator #2
    for index, row in ann_2.iterrows():
        if row["label"] in labels:
            labels[row["label"]] += 1
        else:
            labels["other"] += 1

    for key, value in labels.items():
        final_w[key] += (value / 60) * cluster_w[i]

In [19]:
final_w

{'satire': 1.7166666666666666,
 'news': 2.1666666666666665,
 'opinion': 2.35,
 'complaint': 0.8833333333333333,
 'solution': 0.5833333333333334,
 'other': 0.30000000000000004}