## Content Analysis of tweets
In this notebook, we process the annotations of sampled tweets to find the distribution of different types of content among COVID-19 related Persian tweets. We're annotating more tweets and these are some preliminary results.

In [20]:
import numpy as np
import pandas as pd

from sklearn.metrics import cohen_kappa_score

In [2]:
root = "data/"
data = pd.read_excel(root + "doc_clusters.xlsx")

In [3]:
total_tweets_count = 45234
n_cluster = len(data.cluster.value_counts())
cluster_w = {}
for index, value in data.cluster.value_counts().items():
    cluster_w[index] = (value / total_tweets_count)
print(cluster_w)

{4: 0.46084803466419066, 7: 0.15183269222266438, 2: 0.09875315028518371, 6: 0.08776583985497635, 3: 0.08734580183048149, 1: 0.0387098200468674, 5: 0.037936065791219, 0: 0.03680859530441703}


In [4]:
# reading annotations
ann_1 = pd.read_excel(root + "annotation/ann1.xlsx")
ann_2 = pd.read_excel(root + "annotation/ann2.xlsx")

# normalizing labels
ann_2.loc[ann_2['label'] == 'blame', 'label'] = "complaint"
ann_1.loc[ann_1['label'] == 'sarcasm', 'label'] = "satire"
ann_1.loc[ann_1['label'] == 'report', 'label'] = "news"
ann_1.loc[ann_1['label'] == 'rumor', 'label'] = "news"

print(ann_1["label"].unique())
print(ann_2["label"].unique())

# generating merged annotation file
dis = pd.concat([ann_1["doc"], ann_1["label"], ann_2["label"]], axis=1)
dis.to_excel("data/dis.xlsx")

['news' 'neutral' 'satire' 'opinion' 'complaint' 'solution']
['solution' 'opinion' 'satire' 'news' 'complaint' 'neutral']


### Inter-Annotator agreement

In [11]:
ann_data = pd.read_excel(root + "annotation/final.xlsx")

ann_1 = ann_data['ann1']
ann_2 = ann_data['ann2']
ann_final = ann_data['final']

assert len(ann_1) == len(ann_2)

# Cohen Kappa
print("agreement: {}".format(cohen_kappa_score(ann_1, ann_2)))

agreement: 0.4662713120830245


### Category distribution

In [23]:
cluster_size = 30

final_w = {"satire": 0, "news": 0, "opinion": 0, "complaint": 0, "solution": 0, "neutral": 0}
labels = {"satire": 0, "news": 0, "opinion": 0, "complaint": 0, "solution": 0, "neutral": 0}

for i in range(len(ann_final)):
    if i % cluster_size != 0:
        # if there was a disagreement, final is not empty
        if str(ann_final.iloc[i]) != "nan":
            labels[ann_final.iloc[i]] += 1
        else:
            # if there's an agreement, no difference between ann_1 and ann_2
            labels[ann_1.iloc[i]] += 1
    else:
        for i in range(n_cluster):
            for key, value in labels.items():
                final_w[key] += (value / cluster_size) * cluster_w[i]
        
        # set for the new cluster
        labels = {"satire": 0, "news": 0, "opinion": 0, "complaint": 0, "solution": 0, "neutral": 0}

In [24]:
final_w

{'satire': 1.9,
 'news': 1.5999999999999996,
 'opinion': 1.2333333333333332,
 'complaint': 1.1666666666666665,
 'solution': 0.49999999999999994,
 'neutral': 0.36666666666666664}

In [32]:
from math import pi

import pandas as pd

from bokeh.io import output_file, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum

#output_file("pie.html")

data = pd.Series(final_w).reset_index(name='value').rename(columns={'index':'category'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = Category20c[len(final_w)]

p = figure(plot_height=350, title="Pie Chart", toolbar_location=None,
           tools="hover", tooltips="@category: @value", x_range=(-0.5, 1.0))

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend_field='category', source=data)

p.axis.axis_label=None
p.axis.visible=False
p.grid.grid_line_color = None

show(p)