# **Statistical Analysis with Embeddings**



Import packages.

In [None]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, TFBertModel
import pandas as pd

In [None]:
# Read fs.txt and parse its content as JSON.
# The encoding is given explicitly: JSON text is UTF-8 by specification, whereas
# open() without `encoding` falls back to the platform's locale encoding.
with open('fs.txt', 'r', encoding='utf-8') as file:
    json_data = file.read()
fs = json.loads(json_data)

Save direct answers of ChatGPT in dataframe.

In [None]:
# Build the direct-answer table: keep only `id` and `text`, then strip the
# literal <b> / </b> tags from the text.
# `.copy()` turns the column subset into an independent frame, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
fs_df = pd.DataFrame(fs['featuredSnippets'])[['id', 'text']].copy()
fs_df["text"] = fs_df["text"].replace(r'<b>|<\/b>', '', regex=True)

In [None]:
# Preview the first rows of the direct-answer table (`id`, `text`).
fs_df.head()

Unnamed: 0,id,text
0,1,"Yes, intellectual property rights should exist..."
1,2,Creativity flourishes when ideas are freely sh...
2,3,Students should wear school uniforms because t...
3,4,Students should not wear school uniforms becau...
4,5,"Yes, obesity is indeed a disease. It significa..."


# Embedding / Similarity Analysis

Idea:
- turn ChatGPT's direct answers into embeddings
- turn explanations given before and after exposure into embeddings
- compute similarity between each participant's explanation BEFORE exposure and ChatGPT's direct answer
- compute similarity between each participant's explanation AFTER exposure and ChatGPT's direct answer

- analyse whether the second explanation is semantically closer to the direct answer compared to the first explanation -> would mean that participant internalizes the direct answer

Use [**bert base uncased**](https://huggingface.co/bert-base-uncased) as tokenizer and model.

In [None]:
# Load the tokenizer and the TensorFlow BERT encoder from one and the same
# checkpoint name, so that vocabulary and weights belong together.
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = TFBertModel.from_pretrained(checkpoint_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
def get_embeddings(text):
    """Embed `text` with the module-level BERT `tokenizer` and `model`.

    Args:
        text: Input handed to the tokenizer (a single string per call in
            this notebook).

    Returns:
        The model's `pooler_output`, converted to a NumPy array.
    """
    # Truncate to BERT's 512-token limit. Without truncation, a longer input
    # exceeds the model's position embeddings and the forward pass fails.
    inputs = tokenizer(text, return_tensors="tf", truncation=True, max_length=512)
    outputs = model(**inputs)

    # NOTE(review): the Hugging Face model docs caution that pooler_output is
    # often a weak summary of sentence meaning compared with pooling over the
    # token hidden states - confirm this choice for the similarity analysis.
    return outputs.pooler_output.numpy()

Use the function defined above to turn the direct answers into embeddings.

In [None]:
# Embed every direct answer; the function is handed to apply() directly,
# so no wrapper lambda is needed.
fs_df["embedding"] = fs_df["text"].apply(get_embeddings)

In [None]:
# Display the direct-answer table, now including the `embedding` column.
fs_df

Unnamed: 0,id,text,embedding
0,1,"Yes, intellectual property rights should exist...","[[-0.84653455, -0.48565817, -0.97497034, 0.750..."
1,2,Creativity flourishes when ideas are freely sh...,"[[-0.95321035, -0.6678994, -0.9848873, 0.89978..."
2,3,Students should wear school uniforms because t...,"[[-0.84305084, -0.5807782, -0.9747532, 0.72574..."
3,4,Students should not wear school uniforms becau...,"[[-0.8572791, -0.56048846, -0.94528526, 0.7776..."
4,5,"Yes, obesity is indeed a disease. It significa...","[[-0.77697986, -0.65852666, -0.96839446, 0.705..."
5,6,Obesity is not a disease; it's a condition aff...,"[[-0.75460476, -0.49428615, -0.94573283, 0.684..."


In [None]:
def get_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embeddings.

    Both arguments are forwarded unchanged to sklearn's `cosine_similarity`,
    and its result is returned as-is.
    """
    similarity_matrix = cosine_similarity(embedding1, embedding2)
    return similarity_matrix

Use merged dataframe, containing all explanations.

In [None]:
# Load the merged participant data with all explanations.
# NOTE(review): the previews of this frame show an "Unnamed: 0" column, which
# suggests new.csv was written together with its index - confirm whether that
# column should be dropped.
data = pd.read_csv("new.csv")

Clean the explanation columns of the dataframe by removing the HTML bold tags (`<b>`, `</b>`).

In [None]:
# Strip the literal <b> / </b> tags from both explanation columns.
# NOTE(review): the pattern removes bold tags only; if line breaks (<br>) were
# meant to be removed as well, it needs extending - confirm.
tag_pattern = r'<b>|<\/b>'
for column in ("explanation_before", "explanation_after"):
    data[column] = data[column].replace(tag_pattern, '', regex=True)

Add a column holding each explanation turned into an embedding, resulting in two new columns (before and after exposure).

In [None]:
# Embed the explanation written before and the one written after exposure;
# each source column gets its own embedding column.
for source_column, embedding_column in (
    ("explanation_before", "embedding_before"),
    ("explanation_after", "embedding_after"),
):
    data[embedding_column] = data[source_column].apply(get_embeddings)

Then compute similarity between each explanation (before / after) and direct answer of ChatGPT.

In [None]:
# Reference vector: the embedding stored in the first row of fs_df. It is
# looked up once here instead of once per participant row.
# NOTE(review): every participant is compared against this same direct answer
# (fs_df.iloc[0]) although fs_df holds several answers - confirm that this is
# intended, or select the answer each participant actually saw.
reference_embedding = fs_df.iloc[0]["embedding"]

# Only the embedding column is needed, so apply over that column rather than
# building a full row object for every participant.
data["similarity_before"] = data["embedding_before"].apply(
    lambda embedding: get_similarity(embedding, reference_embedding)
)
data["similarity_after"] = data["embedding_after"].apply(
    lambda embedding: get_similarity(embedding, reference_embedding)
)

Check if it worked:

In [None]:
# Preview the first rows, now including the embedding and similarity columns.
data.head()

Unnamed: 0.1,Unnamed: 0,user_id,age,gender,education,occupation,chatbotuse,chatbottrust,chatbotlimitations,chatbotperception,...,pages_visited,use_category,trust_category,limitations_category,perception_category,agreement,embedding_before,embedding_after,similarity_before,similarity_after
0,1,65584020487cba100c80ef22,23,male,high school,Student,7,1,5,4,...,0,1,-1,1,0,0,"[[-0.80638665, -0.3729, -0.9330029, 0.6453728,...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.95963717]],[[0.63885856]]
1,2,65589725ee82f102021b3ec0,21,male,high school,Student,3,2,7,3,...,1,0,-1,1,-1,0,"[[-0.8532015, -0.7323214, -0.9867675, 0.803275...","[[-0.6780935, -0.2762213, -0.8991057, 0.542771...",[[0.98176897]],[[0.93398213]]
2,3,6558a6666689f1a401bce488,26,female,bachelor,Student,5,5,7,2,...,0,0,1,1,-1,0,"[[-0.8345886, -0.5706263, -0.99540925, 0.83408...","[[-0.81533647, -0.30690572, -0.7668058, 0.6441...",[[0.98287755]],[[0.92436635]]
3,4,6558b0f43b4b68ca704d8b77,25,male,high school,Student,6,6,6,4,...,0,1,1,1,0,0,"[[-0.30613565, -0.6223467, -0.99174017, 0.1480...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.8726952]],[[0.63885856]]
4,5,6558b8d9cae6ee1507a08781,21,female,high school,student,4,2,5,2,...,1,0,-1,1,-1,0,"[[-0.85151964, -0.18382993, 0.32236716, 0.6583...","[[-0.8091694, -0.4641614, -0.98187053, 0.68391...",[[0.63885856]],[[0.9911997]]


Check where the similarity afterwards is higher than before, e.g. the similarity between the first explanation (written without knowing the direct answer) and the direct answer is 0.748, and after reading the chatbot's answer the similarity increased beyond 0.748.

In [None]:
# Flag rows whose explanation moved closer to the direct answer:
# 1 where similarity_after is strictly greater than similarity_before, else 0.
# The comparison is vectorised instead of writing cell by cell inside
# iterrows(); this also works on an empty frame, where the loop version never
# created the column and the final astype() raised a KeyError.
# ravel()[0] reads the value from the nested [[value]] arrays and equally from
# values that have already been flattened.
before_scores = data["similarity_before"].map(lambda sim: np.asarray(sim).ravel()[0])
after_scores = data["similarity_after"].map(lambda sim: np.asarray(sim).ravel()[0])

data["explanation_change"] = (before_scores < after_scores).astype(int)

Check if it worked.

In [None]:
# Preview the first rows, now including the `explanation_change` flag.
data.head()

Unnamed: 0.1,Unnamed: 0,user_id,age,gender,education,occupation,chatbotuse,chatbottrust,chatbotlimitations,chatbotperception,...,use_category,trust_category,limitations_category,perception_category,agreement,embedding_before,embedding_after,similarity_before,similarity_after,explanation_change
0,1,65584020487cba100c80ef22,23,male,high school,Student,7,1,5,4,...,1,-1,1,0,0,"[[-0.80638665, -0.3729, -0.9330029, 0.6453728,...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.95963717]],[[0.63885856]],0
1,2,65589725ee82f102021b3ec0,21,male,high school,Student,3,2,7,3,...,0,-1,1,-1,0,"[[-0.8532015, -0.7323214, -0.9867675, 0.803275...","[[-0.6780935, -0.2762213, -0.8991057, 0.542771...",[[0.98176897]],[[0.93398213]],0
2,3,6558a6666689f1a401bce488,26,female,bachelor,Student,5,5,7,2,...,0,1,1,-1,0,"[[-0.8345886, -0.5706263, -0.99540925, 0.83408...","[[-0.81533647, -0.30690572, -0.7668058, 0.6441...",[[0.98287755]],[[0.92436635]],0
3,4,6558b0f43b4b68ca704d8b77,25,male,high school,Student,6,6,6,4,...,1,1,1,0,0,"[[-0.30613565, -0.6223467, -0.99174017, 0.1480...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.8726952]],[[0.63885856]],0
4,5,6558b8d9cae6ee1507a08781,21,female,high school,student,4,2,5,2,...,0,-1,1,-1,0,"[[-0.85151964, -0.18382993, 0.32236716, 0.6583...","[[-0.8091694, -0.4641614, -0.98187053, 0.68391...",[[0.63885856]],[[0.9911997]],1


It worked, now we can continue using the modified dataframe in R Studio.

# Sentiment Analysis

To enable further analysis using expressed sentiments, for each explanation its sentiment was classified (positive or negative).

First install packages again.

In [None]:
!pip3 install -q transformers
from transformers import pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

In [None]:
def get_sentiment(text):
    """Classify `text` with the module-level `sentiment_pipeline` and code the label.

    Returns:
        1 for the label "POSITIVE", 0 for "NEGATIVE", and None for any other label.
    """
    label_codes = {"POSITIVE": 1, "NEGATIVE": 0}
    top_prediction = sentiment_pipeline(text)[0]
    return label_codes.get(top_prediction["label"])

Add the sentiment of each explanation, using the function defined above.

In [None]:
# Code the sentiment of the explanation written before and of the one written
# after exposure; each source column gets its own sentiment column.
for source_column, sentiment_column in (
    ("explanation_before", "sentiment_before"),
    ("explanation_after", "sentiment_after"),
):
    data[sentiment_column] = data[source_column].apply(get_sentiment)

Check if it worked.

In [None]:
# Preview the first rows, now including the two sentiment columns.
data.head()

Unnamed: 0.1,Unnamed: 0,user_id,age,gender,education,occupation,chatbotuse,chatbottrust,chatbotlimitations,chatbotperception,...,limitations_category,perception_category,agreement,embedding_before,embedding_after,similarity_before,similarity_after,explanation_change,sentiment_before,sentiment_after
0,1,65584020487cba100c80ef22,23,male,high school,Student,7,1,5,4,...,1,0,0,"[[-0.80638665, -0.3729, -0.9330029, 0.6453728,...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.95963717]],[[0.63885856]],0,0,0
1,2,65589725ee82f102021b3ec0,21,male,high school,Student,3,2,7,3,...,1,-1,0,"[[-0.8532015, -0.7323214, -0.9867675, 0.803275...","[[-0.6780935, -0.2762213, -0.8991057, 0.542771...",[[0.98176897]],[[0.93398213]],0,0,0
2,3,6558a6666689f1a401bce488,26,female,bachelor,Student,5,5,7,2,...,1,-1,0,"[[-0.8345886, -0.5706263, -0.99540925, 0.83408...","[[-0.81533647, -0.30690572, -0.7668058, 0.6441...",[[0.98287755]],[[0.92436635]],0,0,0
3,4,6558b0f43b4b68ca704d8b77,25,male,high school,Student,6,6,6,4,...,1,0,0,"[[-0.30613565, -0.6223467, -0.99174017, 0.1480...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",[[0.8726952]],[[0.63885856]],0,1,0
4,5,6558b8d9cae6ee1507a08781,21,female,high school,student,4,2,5,2,...,1,-1,0,"[[-0.85151964, -0.18382993, 0.32236716, 0.6583...","[[-0.8091694, -0.4641614, -0.98187053, 0.68391...",[[0.63885856]],[[0.9911997]],1,0,1


Check if sentiment in explanations has changed, and code it using a binary variable.

In [None]:
# Flag rows whose sentiment code flipped between the two explanations
# (1 -> 0 or 0 -> 1); every other combination, including a missing code, is 0.
# The flags are computed column-wise instead of writing cell by cell inside
# iterrows(); this also works on an empty frame, where the loop version never
# created the column and the final astype() raised a KeyError.
before_code = data["sentiment_before"]
after_code = data["sentiment_after"]

positive_to_negative = before_code.eq(1) & after_code.eq(0)
negative_to_positive = before_code.eq(0) & after_code.eq(1)

data["sentiment_change"] = (positive_to_negative | negative_to_positive).astype(int)

In [None]:
# Collapse every stored similarity to a plain scalar.
# The similarities are held as nested [[value]] arrays, so a single `x[0]`
# strips only the outer level and leaves one-element arrays behind.
# ravel()[0] removes all nesting in one pass and returns an already-flattened
# value unchanged, so re-running this cell is harmless.
for column in ('similarity_before', 'similarity_after'):
    data[column] = data[column].apply(lambda sim: np.asarray(sim).ravel()[0])

Check if it worked.

In [None]:
# Show only the first ten rows: enough to check the new columns without
# dumping the whole participant table into the notebook output.
data.head(10)

Unnamed: 0.1,Unnamed: 0,user_id,age,gender,education,occupation,chatbotuse,chatbottrust,chatbotlimitations,chatbotperception,...,perception_category,agreement,embedding_before,embedding_after,similarity_before,similarity_after,explanation_change,sentiment_before,sentiment_after,sentiment_change
0,1,65584020487cba100c80ef22,23,male,high school,Student,7,1,5,4,...,0,0,"[[-0.80638665, -0.3729, -0.9330029, 0.6453728,...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",0.959637,0.638859,0,0,0,0
1,2,65589725ee82f102021b3ec0,21,male,high school,Student,3,2,7,3,...,-1,0,"[[-0.8532015, -0.7323214, -0.9867675, 0.803275...","[[-0.6780935, -0.2762213, -0.8991057, 0.542771...",0.981769,0.933982,0,0,0,0
2,3,6558a6666689f1a401bce488,26,female,bachelor,Student,5,5,7,2,...,-1,0,"[[-0.8345886, -0.5706263, -0.99540925, 0.83408...","[[-0.81533647, -0.30690572, -0.7668058, 0.6441...",0.982878,0.924366,0,0,0,0
3,4,6558b0f43b4b68ca704d8b77,25,male,high school,Student,6,6,6,4,...,0,0,"[[-0.30613565, -0.6223467, -0.99174017, 0.1480...","[[-0.85151964, -0.18382993, 0.32236716, 0.6583...",0.872695,0.638859,0,1,0,1
4,5,6558b8d9cae6ee1507a08781,21,female,high school,student,4,2,5,2,...,-1,0,"[[-0.85151964, -0.18382993, 0.32236716, 0.6583...","[[-0.8091694, -0.4641614, -0.98187053, 0.68391...",0.638859,0.9912,1,0,1,1
5,6,6558ba06605206699c96bfd6,19,female,high school,student,6,4,7,1,...,-1,1,"[[-0.8229376, -0.49053985, -0.98624814, 0.7002...","[[-0.58884513, -0.38051564, -0.9839325, 0.4601...",0.991534,0.930758,0,1,1,0
6,7,6558ba54cae6ee1507a08788,24,female,bachelor,Student,4,5,7,5,...,1,1,"[[-0.88316745, -0.52242076, -0.9675909, 0.8468...","[[-0.84759533, -0.6096273, -0.9941273, 0.78700...",0.990372,0.986472,0,0,1,1
7,8,6558bba4cae6ee1507a0878f,28,female,master,Student,7,3,7,1,...,-1,1,"[[-0.86018896, -0.5135671, -0.9909919, 0.79809...","[[-0.8469067, -0.37591797, -0.7422377, 0.74781...",0.992305,0.908323,0,1,0,1
8,9,6558bca8cae6ee1507a08799,22,female,high school,University undergraduate,4,1,6,3,...,-1,0,"[[-0.76606196, -0.54923296, -0.98116505, 0.788...","[[-0.9078929, -0.6212592, -0.96973705, 0.81481...",0.985564,0.959377,0,0,1,1
9,10,6558bd11605206699c96bfe1,20,female,high school,Student,5,3,6,4,...,0,0,"[[-0.8966248, -0.6278881, -0.9386859, 0.780431...","[[-0.80750895, -0.5241385, -0.9492578, 0.70882...",0.974962,0.972025,0,0,0,0


Completed! Now saving the dataframe to edit it further.

In [None]:
# Write the enriched frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the embedding columns hold NumPy arrays and will presumably be
# written as their text representation - confirm the downstream analysis does
# not need them as numbers.
data.to_csv('df_sentiment.csv', index=False)