# Merge Annotation data

This notebook merges the AWS SageMaker Ground Truth annotation results into the `master.csv` data set.

In [1]:
import os
import csv
import json
import pandas as pd
from pprint import pprint

# Load the raw tweet texts. header=None tells pandas the first line is data,
# not column names; set_axis then labels the single expected column "text"
# (it raises if the file does not contain exactly one column).
text_df = pd.read_csv("../data/text.csv", header=None).set_axis(["text"], axis=1)

### First, merge in the opinion-results

**Use compare_texts to verify row order**

In [2]:
import re

def compare_texts(og_tweet, annot_tweet):
    """Check that two sequences of tweet texts line up row by row.

    Each pair of texts is compared after removing every character other
    than ASCII letters, digits, and spaces, so differences in punctuation,
    emoji, or quote style do not count as mismatches.

    Parameters
    ----------
    og_tweet : sequence of str
        Reference texts (for example a list or a pandas Series).
    annot_tweet : sequence of str
        Texts to verify against ``og_tweet``, in the same order.

    Returns
    -------
    bool
        True when both inputs have the same length and every positional
        pair matches after normalization; False otherwise. A message is
        printed for each mismatch, or a single success message when
        everything matches.
    """
    # Compile once instead of re-parsing the pattern twice per row.
    non_alphanumeric = re.compile(r"[^a-zA-Z0-9 ]")
    match = True

    # A length difference means the rows cannot be aligned. Report it rather
    # than raising a lookup error (annot_tweet shorter) or silently ignoring
    # the surplus rows (annot_tweet longer).
    if len(og_tweet) != len(annot_tweet):
        print(f"Text counts do not match: {len(og_tweet)} vs {len(annot_tweet)}")
        match = False

    # zip pairs the values by position, so the comparison does not depend on
    # the inputs supporting integer-label lookup via [i].
    for i, (og_text, annot_text) in enumerate(zip(og_tweet, annot_tweet)):
        if non_alphanumeric.sub("", og_text) != non_alphanumeric.sub("", annot_text):
            print(f"Texts do not match at index {i}")
            match = False

    if match:
        print("All tweet texts match. Rows and columns are aligned!")

    return match
                  

**Merge in SageMaker opinion annotation results**

The `opinion-results.txt` file contains the ground truth results identified by SageMaker for tweet opinion. It includes the opinion key, the opinion label, and an algorithmically assigned confidence score.

In [3]:
# Read the opinion annotation results: one JSON object per line.
# The encoding is set explicitly because the tweets contain non-ASCII
# characters and open() otherwise falls back to a platform-dependent default.
with open("opinion-results.txt", encoding="utf-8") as opinion_file:
    rows = opinion_file.readlines()

# Decode every record exactly once (instead of once per extracted field) and
# skip blank lines, e.g. a trailing newline at the end of the file.
opinion_records = [json.loads(row.strip()) for row in rows if row.strip()]
opinion_metadata = [record.get("annotate-user-opinion-2-metadata") for record in opinion_records]

annotation_texts = [record.get("source") for record in opinion_records]
annotation_keys = [record.get("annotate-user-opinion-2") for record in opinion_records]
annotation_labels = [metadata.get("class-name") for metadata in opinion_metadata]
annotation_confidences = [metadata.get("confidence") for metadata in opinion_metadata]

# Attach the annotation fields positionally; pandas raises if the number of
# records differs from the number of rows in text_df.
text_df["opinion_text"] = annotation_texts
text_df["opinion_key"] = annotation_keys
text_df["opinion_label"] = annotation_labels
text_df["opinion_annotation_confidence"] = annotation_confidences

# Confirm the annotation rows are in the same order as the original texts.
matched = compare_texts(text_df["text"], text_df["opinion_text"])

# The annotation copy of the text is only needed for the check above; it is
# kept when the check fails so the mismatching rows can be inspected.
if matched:
    del text_df["opinion_text"]

text_df.head()

All tweet texts match. Rows and columns are aligned!


Unnamed: 0,text,opinion_key,opinion_label,opinion_annotation_confidence
0,@MSNBC @MaddowBlog “Simpleton’s defense”? You...,2,AGAINST student loan forgiveness,0.7
1,@MSNBC @MaddowBlog I feel sorry for the sucker...,1,NEUTRAL support,0.62
2,@MSNBC @MaddowBlog Setting up a 2024 elections...,2,AGAINST student loan forgiveness,0.43
3,@MSNBC @MaddowBlog If you can't pay off studen...,1,NEUTRAL support,0.51
4,@MSNBC @MaddowBlog The simple defense is why s...,0,FOR student loan forgiveness,0.88


**Merge in SageMaker ego involvement annotation results**

The `ego-involvement-results.txt` file contains the ground truth results identified by SageMaker for tweet ego involvement. It includes the annotation key, the annotation label, and an algorithmically assigned annotation confidence score.

In [4]:
# Read the ego involvement annotation results: one JSON object per line.
# The encoding is set explicitly because the tweets contain non-ASCII
# characters and open() otherwise falls back to a platform-dependent default.
with open("ego-involvement-results.txt", encoding="utf-8") as ego_file:
    rows = ego_file.readlines()

# Decode every record exactly once (instead of once per extracted field) and
# skip blank lines, e.g. a trailing newline at the end of the file.
ego_records = [json.loads(row.strip()) for row in rows if row.strip()]
ego_metadata = [record.get("annotate-ego-involvement-2-metadata") for record in ego_records]

annotation_texts = [record.get("source") for record in ego_records]
annotation_keys = [record.get("annotate-ego-involvement-2") for record in ego_records]
annotation_labels = [metadata.get("class-name") for metadata in ego_metadata]
annotation_confidences = [metadata.get("confidence") for metadata in ego_metadata]

# Attach the annotation fields positionally; pandas raises if the number of
# records differs from the number of rows in text_df.
text_df["ego_involvement_text"] = annotation_texts
text_df["ego_involvement_key"] = annotation_keys
text_df["ego_involvement_label"] = annotation_labels
text_df["ego_involvement_annotation_confidence"] = annotation_confidences

# Confirm the annotation rows are in the same order as the original texts.
matched = compare_texts(text_df["text"], text_df["ego_involvement_text"])

# The annotation copy of the text is only needed for the check above; it is
# kept when the check fails so the mismatching rows can be inspected.
if matched:
    del text_df["ego_involvement_text"]

text_df.head()

All tweet texts match. Rows and columns are aligned!


Unnamed: 0,text,opinion_key,opinion_label,opinion_annotation_confidence,ego_involvement_key,ego_involvement_label,ego_involvement_annotation_confidence
0,@MSNBC @MaddowBlog “Simpleton’s defense”? You...,2,AGAINST student loan forgiveness,0.7,1,Somewhat important,0.95
1,@MSNBC @MaddowBlog I feel sorry for the sucker...,1,NEUTRAL support,0.62,3,cannot judge importance,0.65
2,@MSNBC @MaddowBlog Setting up a 2024 elections...,2,AGAINST student loan forgiveness,0.43,2,Not important at all,0.81
3,@MSNBC @MaddowBlog If you can't pay off studen...,1,NEUTRAL support,0.51,3,cannot judge importance,0.53
4,@MSNBC @MaddowBlog The simple defense is why s...,0,FOR student loan forgiveness,0.88,1,Somewhat important,0.69


## Create a master_annotated file

In [5]:
# Load the master data set that the annotation columns will be joined onto.
master_csv_path = "../data/master.csv"
master_df = pd.read_csv(master_csv_path)

# Preview the first rows to confirm the file loaded as expected.
master_df.head()

Unnamed: 0,experiment_id,experiment_group,text,tweet_id,tweet_likes,retweets,tweet_created_at,user_id,in_reply_to_status_id,in_reply_to_user_id,...,description,location,followers_count,screen_name,statuses_count,favourites_count,verified,user_id_char,text_length,text_word_count
0,1,msnbc,@MSNBC @MaddowBlog “Simpleton’s defense”? You...,1.596988e+18,4,0,Sun Nov 27 22:01:59 +0000 2022,1.51875e+18,1.596987e+18,2836421,...,No name,,8,BigTex1022,2333,1941,False,1.51875e+18,183,30
1,2,msnbc,@MSNBC @MaddowBlog I feel sorry for the sucker...,1.596993e+18,0,0,Sun Nov 27 22:22:27 +0000 2022,3202809000.0,1.596987e+18,2836421,...,People following me are president Trump suppor...,"Massachusetts, USA",874,michael_favreau,30060,16373,False,3202809000.0,114,20
2,3,msnbc,@MSNBC @MaddowBlog Setting up a 2024 elections...,1.596997e+18,0,0,Sun Nov 27 22:39:00 +0000 2022,140915700.0,1.596987e+18,2836421,...,"Hamiltonian Federalist: i.e., The United State...","Washington, DC",375,AlxHamiltn,33016,1061,False,140915700.0,148,20
3,4,msnbc,@MSNBC @MaddowBlog If you can't pay off studen...,1.597006e+18,2,0,Sun Nov 27 23:13:38 +0000 2022,1933829000.0,1.596987e+18,2836421,...,Rule of law makes civilized life possible. Equ...,where ever there's oil,537,Oil_vampire,60763,19861,False,1933829000.0,226,46
4,5,msnbc,@MSNBC @MaddowBlog The simple defense is why s...,1.597023e+18,1,0,Mon Nov 28 00:21:50 +0000 2022,1.586494e+18,1.596987e+18,2836421,...,"Gay Libertarian, married, gun owner, dog daddy...",My house,5,32926Uhtred,1102,320,False,1.586494e+18,159,27


**Remove text column from text_df and merge with master_df**

In [6]:
# join() aligns rows purely by index, so a row-count difference would
# silently attach annotations to the wrong tweets or fill them with NaN.
# Fail loudly instead.
if len(master_df) != len(text_df):
    raise ValueError(
        f"Row count mismatch: master_df has {len(master_df)} rows but "
        f"text_df has {len(text_df)}; cannot align annotations by index."
    )

# master_df already has its own "text" column, and join() raises on
# overlapping column names. Select every other column rather than deleting
# "text" in place, so text_df is left intact and this cell can be re-run.
annotation_columns = text_df.loc[:, text_df.columns != "text"]

master_annotated_df = master_df.join(annotation_columns)

### Write merged dataframes
Create a new `master_annotated.csv` file for review.

In [7]:
# Display the merged frame (master columns followed by the annotation columns)
# for a visual check before it is written to disk.
master_annotated_df

Unnamed: 0,experiment_id,experiment_group,text,tweet_id,tweet_likes,retweets,tweet_created_at,user_id,in_reply_to_status_id,in_reply_to_user_id,...,verified,user_id_char,text_length,text_word_count,opinion_key,opinion_label,opinion_annotation_confidence,ego_involvement_key,ego_involvement_label,ego_involvement_annotation_confidence
0,1,msnbc,@MSNBC @MaddowBlog “Simpleton’s defense”? You...,1.596988e+18,4,0,Sun Nov 27 22:01:59 +0000 2022,1.518750e+18,1.596987e+18,2836421,...,False,1.518750e+18,183,30,2,AGAINST student loan forgiveness,0.70,1,Somewhat important,0.95
1,2,msnbc,@MSNBC @MaddowBlog I feel sorry for the sucker...,1.596993e+18,0,0,Sun Nov 27 22:22:27 +0000 2022,3.202809e+09,1.596987e+18,2836421,...,False,3.202809e+09,114,20,1,NEUTRAL support,0.62,3,cannot judge importance,0.65
2,3,msnbc,@MSNBC @MaddowBlog Setting up a 2024 elections...,1.596997e+18,0,0,Sun Nov 27 22:39:00 +0000 2022,1.409157e+08,1.596987e+18,2836421,...,False,1.409157e+08,148,20,2,AGAINST student loan forgiveness,0.43,2,Not important at all,0.81
3,4,msnbc,@MSNBC @MaddowBlog If you can't pay off studen...,1.597006e+18,2,0,Sun Nov 27 23:13:38 +0000 2022,1.933829e+09,1.596987e+18,2836421,...,False,1.933829e+09,226,46,1,NEUTRAL support,0.51,3,cannot judge importance,0.53
4,5,msnbc,@MSNBC @MaddowBlog The simple defense is why s...,1.597023e+18,1,0,Mon Nov 28 00:21:50 +0000 2022,1.586494e+18,1.596987e+18,2836421,...,False,1.586494e+18,159,27,0,FOR student loan forgiveness,0.88,1,Somewhat important,0.69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,464,foxnews,@FoxNews I don't need any bias media to tell m...,1.599859e+18,0,0,Mon Dec 05 20:09:32 +0000 2022,1.585731e+18,1.599767e+18,1367531,...,False,1.585731e+18,263,49,2,AGAINST student loan forgiveness,0.94,1,Somewhat important,0.53
464,465,foxnews,@FoxNews He still trying to get college studen...,1.599863e+18,4,0,Mon Dec 05 20:26:48 +0000 2022,1.581359e+18,1.599844e+18,1367531,...,False,1.581359e+18,136,23,0,FOR student loan forgiveness,0.54,0,Very important,0.92
465,466,usedgov,@usedgov why are my student loans not transfer...,1.599892e+18,0,0,Mon Dec 05 22:24:29 +0000 2022,7.925171e+17,,20437286,...,False,7.925171e+17,181,33,0,FOR student loan forgiveness,0.95,1,Somewhat important,0.79
466,467,foxnews,@FoxNews Just another way of screwing the taxp...,1.599894e+18,0,0,Mon Dec 05 22:32:26 +0000 2022,1.518825e+18,1.599351e+18,1367531,...,False,1.518825e+18,244,45,2,AGAINST student loan forgiveness,0.42,3,cannot judge importance,0.40


In [8]:
# Persist the merged data set. index=True is the pandas default, spelled out
# here because it writes the row index as a leading column with no header.
master_annotated_df.to_csv("../data/master_annotated.csv", index=True)