In [18]:
import glob

import pandas as pd

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

from tqdm import tqdm

tqdm.pandas()

In [19]:
data_base_dir = "data/by_date"

# Map each debate date (value of the "date" column in the file's first row)
# to the DataFrame read from that file.
dfs = {}
# glob.glob returns paths in arbitrary order; sorting makes it deterministic
# which file wins if two files carry the same date.
for fpath in sorted(glob.glob(data_base_dir + "/*")):
    df = pd.read_csv(fpath)
    date = df["date"].iloc[0]
    dfs[date] = df

# Build the date list from the dict keys rather than appending per file, so a
# date can never appear twice even when several files in the folder share it
# (the similarity loop further down writes "<date>.csv" files into this same
# folder, which can create such duplicates on a re-run).
dates = sorted(dfs)

In [20]:
# Preview the frame for the first date in sorted order.
dfs[dates[0]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,
...,...,...,...,...,...,...,...,...
63,Howard Smith,Three minutes and twenty seconds for each cand...,Pres,1960,1960-09-26,0,,
64,Richard Nixon,"Thank you, Mr. Smith. Senator Kennedy. First o...",Pres,1960,1960-09-26,1,,
65,Howard Smith,"Senator Kennedy, your conclusion.",Pres,1960,1960-09-26,0,,
66,John Kennedy,The point was made by Mr. Nixon that the Sovie...,Pres,1960,1960-09-26,1,,


In [21]:
# Only the rows of the first debate that have a non-null "question".
dfs[dates[0]].dropna(subset=["question"])

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question
6,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,Which political party's policies and leadershi...,"Senator, the Vice President in his campaign ha..."
11,Richard Nixon,It would be rather difficult to cover them in ...,Pres,1960,1960-09-26,1,What recommendations have you made following y...,"Mr. Vice President, your campaign stresses the..."
17,John Kennedy,"Well, because I think that if the federal gove...",Pres,1960,1960-09-26,1,What role should the federal government play i...,"Id like to ask this; Its a fact, I think, that..."
21,Richard Nixon,"Well, I would suggest, Mr. Vanocur, that uh – ...",Pres,1960,1960-09-26,1,What is the role of presidential advisers in d...,"Uh – Mr. Vice President, since the question of..."
26,John Kennedy,I didnt indicate. I did not advocate reducing ...,Pres,1960,1960-09-26,1,Why do you believe it is not feasible to signi...,"Senator Kennedy, in connection with these prob..."
32,John Kennedy,"– reducing the interest rate. In my judgment, ...",Pres,1960,1960-09-26,1,What economic policies should be implemented t...,"Senator, I believe in – in one of your speeche..."
37,Richard Nixon,Im awfully glad you ge- got that question beca...,Pres,1960,1960-09-26,1,Why should the federal government avoid taking...,Mr. Vice President you mentioned schools and i...
44,John Kennedy,"If I may take the bills, we did pass in the Se...",Pres,1960,1960-09-26,1,What challenges does Congress face in passing ...,"Senator, youve been promising the voters that ..."
53,John Kennedy,"Well, I think theyre serious. I think its a ma...",Pres,1960,1960-09-26,1,What should the United States do to address in...,"Senator Kennedy, on another subject, Communism..."
58,Richard Nixon,Not at all. As a matter of fact your question ...,Pres,1960,1960-09-26,1,What factors should be considered when assessi...,Mr. Vice President uh – in one of your earlier...


In [22]:
# Total number of rows across all debates.
sum(map(len, dfs.values()))

2716

In [24]:
# Total number of rows with a non-null "question" across all debates.
sum(int(frame["question"].count()) for frame in dfs.values())

271

In [37]:
# One tab-separated line per debate: position, date, row count, column count.
for i, d in enumerate(dates):
    print(i, d, *dfs[d].shape, sep="\t")

0	1960-09-26	68	8
1	1976-09-23	94	8
2	1980-09-21	76	8
3	1984-10-07	134	8
4	1988-09-25	161	8
5	1992-10-11	92	8
6	1996-10-06	144	8
7	2000-10-03	166	8
8	2004-09-30	142	8
9	2008-09-26	189	8
10	2012-10-03	210	8
11	2016-09-26	308	8
12	2020-09-29	932	8


In [39]:
# Same overview as above, restricted to rows that have a non-null "question".
for i, d in enumerate(dates):
    df_f = dfs[d].dropna(subset=["question"])
    print(i, d, *df_f.shape, sep="\t")

0	1960-09-26	10	8
1	1976-09-23	20	8
2	1980-09-21	12	8
3	1984-10-07	30	8
4	1988-09-25	23	8
5	1992-10-11	11	8
6	1996-10-06	21	8
7	2000-10-03	26	8
8	2004-09-30	18	8
9	2008-09-26	17	8
10	2012-10-03	25	8
11	2016-09-26	19	8
12	2020-09-29	39	8


In [40]:
# Load the pretrained "bert-base-uncased" tokenizer and model.
# encode_text() reads both as module-level globals when sbert=False.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [41]:
# Load the "all-MiniLM-L6-v2" SentenceTransformer.
# encode_text() reads it as a module-level global when sbert=True.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [42]:
def encode_text(text, sbert=False, pooling_method="cls"):
    """Embed ``text`` with either the SBERT model or the plain BERT model.

    Parameters
    ----------
    text : str
        Text to embed.
        NOTE(review): the SBERT branch reshapes its output to a single row, so
        one string (not a batch) appears to be the intended input - confirm
        before passing a list.
    sbert : bool, default False
        If True, encode with the module-level ``sbert_model``;
        ``pooling_method`` is ignored in that case. If False, tokenize with
        ``bert_tokenizer`` (truncated to 512 tokens) and run ``bert_model``.
    pooling_method : {"cls", "max", "mean"}, default "cls"
        How the BERT ``last_hidden_state`` is reduced along dim 1:
        ``"cls"`` takes position 0, ``"max"`` the element-wise maximum over
        all positions, ``"mean"`` the average over all positions.

    Returns
    -------
    torch.Tensor
        SBERT branch: the encoding moved to CPU and reshaped to ``(1, -1)``.
        BERT branch: ``last_hidden_state`` reduced along dim 1.

    Raises
    ------
    ValueError
        If ``sbert`` is False and ``pooling_method`` is not one of the
        supported values.
    """
    if sbert:
        return sbert_model.encode(text, convert_to_tensor=True).cpu().reshape(1, -1)

    # Validate before tokenizing or running the model, so a typo fails fast
    # with a clear message instead of an unbound-variable error at `return`.
    supported = ("cls", "max", "mean")
    if pooling_method not in supported:
        raise ValueError(
            f"Unknown pooling_method {pooling_method!r}; expected one of {supported}"
        )

    inputs = bert_tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state
    if pooling_method == "cls":
        # Position 0 of the sequence dimension (the [CLS] token).
        return hidden[:, 0, :]
    if pooling_method == "max":
        return hidden.max(dim=1).values
    return hidden.mean(dim=1)

In [43]:
def calculate_similarity(text1, text2, sbert=False, pooling_method="cls"):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded via ``encode_text`` with the same ``sbert`` and
    ``pooling_method`` settings. The single entry of the resulting similarity
    matrix is returned as a scalar.
    """
    vec_a, vec_b = (
        encode_text(t, sbert=sbert, pooling_method=pooling_method)
        for t in (text1, text2)
    )
    # cosine_similarity yields a matrix; its only cell is at [0][0].
    return cosine_similarity(vec_a, vec_b)[0][0]

In [44]:
# Smoke test with the defaults (BERT model, pooling_method="cls").
calculate_similarity("test1", "test2")

np.float32(0.97823465)

In [45]:
# Same pair, with max pooling over the BERT hidden states.
calculate_similarity("test1", "test2", pooling_method="max")

np.float32(0.9586725)

In [46]:
# Same pair, embedded with the SBERT model instead of BERT.
calculate_similarity("test1", "test2", sbert=True)

np.float32(0.8835417)

In [47]:
# Position in `dates` of the debate used by the single-debate cells below.
i = 0

In [49]:
# Inspect the frame for the debate selected by `i`.
dfs[dates[i]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,
...,...,...,...,...,...,...,...,...
63,Howard Smith,Three minutes and twenty seconds for each cand...,Pres,1960,1960-09-26,0,,
64,Richard Nixon,"Thank you, Mr. Smith. Senator Kennedy. First o...",Pres,1960,1960-09-26,1,,
65,Howard Smith,"Senator Kennedy, your conclusion.",Pres,1960,1960-09-26,0,,
66,John Kennedy,The point was made by Mr. Nixon that the Sovie...,Pres,1960,1960-09-26,1,,


In [50]:
# Trial run on the single debate selected by `i`: add one similarity column
# per embedding variant, comparing each row's "qud" with its "question".

# Output column name -> extra keyword arguments for calculate_similarity.
similarity_variants = {
    "similarity_cls": {},
    "similarity_max": {"pooling_method": "max"},
    "similarity_mean": {"pooling_method": "mean"},
    "similarity_sbert": {"sbert": True},
}

trial_df = dfs[dates[i]]  # same object as the dict entry, so it is updated in place
for column, variant_kwargs in similarity_variants.items():
    # The lambda runs inside progress_apply before the loop advances, so
    # capturing `variant_kwargs` by closure is safe here.
    trial_df[column] = trial_df.progress_apply(
        lambda row: (
            calculate_similarity(row["qud"], row["question"], **variant_kwargs)
            # Both texts must be present; a missing "qud" would otherwise be
            # passed to the encoders as NaN.
            if pd.notna(row["question"]) and pd.notna(row["qud"])
            else None
        ),
        axis=1,
    )

100%|██████████| 68/68 [00:01<00:00, 37.96it/s]
100%|██████████| 68/68 [00:01<00:00, 42.81it/s]
100%|██████████| 68/68 [00:01<00:00, 40.26it/s]
100%|██████████| 68/68 [00:01<00:00, 43.27it/s]


In [51]:
# Re-display the selected debate's frame to check the new similarity_* columns.
dfs[dates[i]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert
0,Howard Smith,Good evening. The television and radio station...,Pres,1960,1960-09-26,0,,,,,,
1,John Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",Pres,1960,1960-09-26,1,,,,,,
2,Howard Smith,And now the opening statement by Vice Presiden...,Pres,1960,1960-09-26,0,,,,,,
3,Richard Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",Pres,1960,1960-09-26,1,,,,,,
4,Howard Smith,"Thank you, Mr. Nixon. That completes the openi...",Pres,1960,1960-09-26,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
63,Howard Smith,Three minutes and twenty seconds for each cand...,Pres,1960,1960-09-26,0,,,,,,
64,Richard Nixon,"Thank you, Mr. Smith. Senator Kennedy. First o...",Pres,1960,1960-09-26,1,,,,,,
65,Howard Smith,"Senator Kennedy, your conclusion.",Pres,1960,1960-09-26,0,,,,,,
66,John Kennedy,The point was made by Mr. Nixon that the Sovie...,Pres,1960,1960-09-26,1,,,,,,


In [52]:
# Compute every similarity variant for every debate and write each frame to CSV.

# Output column name -> extra keyword arguments for calculate_similarity.
similarity_variants = {
    "similarity_cls": {},
    "similarity_max": {"pooling_method": "max"},
    "similarity_mean": {"pooling_method": "mean"},
    "similarity_sbert": {"sbert": True},
}

# `i` is kept as the loop index so it is still bound after the loop, as before.
for i, date in enumerate(tqdm(dates)):
    frame = dfs[date]  # same object as the dict entry, so it is updated in place
    for column, variant_kwargs in similarity_variants.items():
        # The lambda runs inside apply before the inner loop advances, so
        # capturing `variant_kwargs` by closure is safe here.
        frame[column] = frame.apply(
            lambda row: (
                calculate_similarity(row["qud"], row["question"], **variant_kwargs)
                # Both texts must be present; a missing "qud" would otherwise
                # be passed to the encoders as NaN.
                if pd.notna(row["question"]) and pd.notna(row["qud"])
                else None
            ),
            axis=1,
        )
    # NOTE(review): this writes into the same folder the load cell globs;
    # confirm that overwriting/adding files next to the inputs is intended.
    frame.to_csv(f"{data_base_dir}/{date}.csv", index=False)

100%|██████████| 13/13 [05:43<00:00, 26.42s/it]
