In [1]:
## Add the "../src/" directory to the module search path and load our functions
import sys
sys.path.append("../src/")

# NOTE(review): the project modules below are presumably resolved via the
# "../src/" path entry added above -- confirm.
import paware

from data_processing import sentiment_data

import polars as pl
import os

# Adding Metadata

## Agree and Disagree Distances

We can use our embedding to compute an "agree distance" and a "disagree distance" for every text chunk.

We do this by embedding the following statements:

```python
agree_statements = ["This is correct", "This is true", "I agree", "This is helpful"]

disagree_statements = ["This is incorrect", "This is false", "I disagree", "This is not helpful"]
```

Then, we compute the average distance between the vector representation of each text chunk and each of these statements.

Then, we look at the replies to any given post, and aggregate information about these distances for those replies.

This is all automated by our embedding class. Here, we continue with the same embedding we generated in [Step 1](../paw_demo/Step_1_Loading_and_Embedding_Data.ipynb).

In [2]:
## Settings for the embedding tool
embedding_settings = {
    "CONFIG_NAME": "demo",
    "RAW_DATA_DIR": "../paw_demo/raw_subs/",
    "EMBEDDED_SAVE_DIR": "../paw_demo/embedded_subs/",
    "BATCH_SIZE": 100000,
    "CHUNK_WITH_METADATA": True,
    "CHUNK_SIZE": 512,
    "CHUNK_OVERLAP_PCT": 0.2,
}

## Define the embedding tool from those settings
embedding_tool = paware.PawEmbedding(**embedding_settings)

## Run the agree/disagree distance step on the embedding tool
embedding_tool.add_agree_disagree_distances()



## Sentiment of Replies

Having generated the sentiment data, we can now attach it to our embedded data.

In [3]:
## Hand the embedded-data directory and the raw sentiment directory to the
## sentiment helper.
## NOTE(review): presumably this attaches the sentiment columns to the
## embedded files in vec_dir -- confirm against data_processing.sentiment_data.
sentiment_data.copy_sentiment_data(
    vec_dir="../paw_demo/embedded_subs/config_demo/",
    sen_dir="../paw_demo/raw_sentiment/",
)

## Verifying the Results

We can now load the data and verify that it includes the additional engineered metadata.

In [4]:
## Read one of the files from the embedded-data directory and preview it
embedding_dir = "../paw_demo/embedded_subs/config_demo/"
parquet_path = os.path.join(
    embedding_dir, "vectorized_generalmotors_data_complete.parquet"
)
test = pl.read_parquet(parquet_path)

## Show the first five rows
test.head(5)

aware_post_type,aware_created_ts,reddit_id,reddit_name,reddit_created_utc,reddit_author,reddit_text,reddit_permalink,reddit_title,reddit_url,reddit_subreddit,reddit_link_id,reddit_parent_id,reddit_submission,reply_ids,is_short_question,text_chunk,chunk_agree_distance,chunk_disagree_distance,chunk_agree_distance_avg,chunk_disagree_distance_avg,agree_distance_avg,disagree_distance_avg,reply_agree_distances,reply_disagree_distances,top_reply_agree_distance,top_reply_disagree_distance,avg_reply_agree_distance,avg_reply_disagree_distance,summed_sentiments,absolute_summed_sentiment,vector
str,str,str,str,i64,str,str,str,str,str,str,str,str,str,list[str],bool,str,list[f64],list[f64],f64,f64,f64,f64,list[f64],list[f64],f64,f64,f64,f64,i64,i32,list[f64]
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"[""t1_kgfznkx"", ""t1_kgg05vt""]",False,"""General Motors…","[0.283116, 0.300825, … 0.254619]","[0.27071, 0.273402, … 0.261558]",0.282383,0.275515,0.276075,0.273053,"[0.249591, 0.238443]","[0.23238, 0.247751]",0.249591,0.23238,0.244017,0.240066,-2,2,"[0.018657, 0.037279, … 0.037011]"
"""submission""","""2024-01-05T10:…","""18z96ud""","""t3_18z96ud""",1704467742,"""Jmill2000""","""So I know GM m…","""/r/GeneralMoto…","""References""","""https://www.re…","""GeneralMotors""",,,,"[""t1_kgfznkx"", ""t1_kgg05vt""]",False,"""General Motors…","[0.266819, 0.28803, … 0.250238]","[0.260055, 0.269114, … 0.254161]",0.269768,0.270592,0.276075,0.273053,"[0.249591, 0.238443]","[0.23238, 0.247751]",0.249591,0.23238,0.244017,0.240066,-2,2,"[0.019537, 0.014495, … 0.008045]"
"""comment""","""2024-01-05T10:…","""kgfznkx""","""t1_kgfznkx""",1704468972,"""warwolf0""","""Yea, that’s a …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""",,False,"""General Motors…","[0.248872, 0.249956, … 0.255405]","[0.229161, 0.224686, … 0.235727]",0.249591,0.23238,0.249591,0.23238,,,,,,,0,0,"[0.0408, 0.030147, … -0.004045]"
"""comment""","""2024-01-05T10:…","""kgg05vt""","""t1_kgg05vt""",1704469154,"""tossedawaytheq…","""Search Socrate…","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t3_18z96ud""","""18z96ud""","[""t1_kgg0rjf"", ""t1_kgg79j0""]",False,"""General Motors…","[0.230886, 0.252112, … 0.221663]","[0.234782, 0.238319, … 0.247299]",0.238443,0.247751,0.238443,0.247751,"[0.257003, 0.262228]","[0.239812, 0.258668]",0.257003,0.239812,0.259616,0.24924,-2,2,"[0.02871, 0.029767, … 0.022664]"
"""comment""","""2024-01-05T10:…","""kgg0rjf""","""t1_kgg0rjf""",1704469370,"""Ripinpasta69""","""Only works if …","""/r/GeneralMoto…",,,"""GeneralMotors""","""t3_18z96ud""","""t1_kgg05vt""","""18z96ud""",,False,"""General Motors…","[0.251144, 0.250768, … 0.261878]","[0.216444, 0.228874, … 0.249519]",0.257003,0.239812,0.257003,0.239812,,,,,,,0,0,"[0.012741, 0.031151, … 0.01205]"
