In [None]:
import glob

import pandas as pd

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
import torch

from tqdm import tqdm

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_base_dir = "data/by_date"

# Maps each debate date (the value of the CSV's "date" column) to the rows of
# that CSV that have a non-null "question".
dfs = {}
# sorted() makes the load order independent of the filesystem's listing order,
# so if two files carry the same date the surviving one is deterministic.
for fpath in sorted(glob.glob(data_base_dir + "/*")):
    df = pd.read_csv(fpath)
    if df.empty:
        # A file with no rows has no date to key on and nothing to analyse.
        continue
    # Assumes every row of a file shares a single date, so the first row
    # identifies the file — TODO confirm against the data export.
    date = df["date"].iloc[0]
    # .copy() detaches the filtered rows from `df`; later cells add columns to
    # these frames, which on a filtered slice can raise SettingWithCopyWarning.
    dfs[date] = df[df["question"].notnull()].copy()

# Sorted list of the distinct dates; built from the dict keys so a date that
# appears in more than one file is listed (and later processed) only once.
dates = sorted(dfs)

In [3]:
# Preview the frame for the first entry of the sorted `dates` list.
dfs[dates[0]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert
0,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,What are the differences in party records and ...,"Senator, the Vice President in his campaign ha...",0.816746,0.924939,0.78205,0.177822
1,Richard Nixon,It would be rather difficult to cover them in ...,Pres,1960,1960-09-26,1,What recommendations have been made following ...,"Mr. Vice President, your campaign stresses the...",0.837739,0.929648,0.794823,0.347191
2,John Kennedy,"Well, because I think that if the federal gove...",Pres,1960,1960-09-26,1,What role should the federal government play i...,"Id like to ask this; Its a fact, I think, that...",0.78549,0.920841,0.752912,0.473955
3,Richard Nixon,"Well, I would suggest, Mr. Vanocur, that uh – ...",Pres,1960,1960-09-26,1,What is the appropriate role of presidential a...,"Uh – Mr. Vice President, since the question of...",0.779024,0.918841,0.681261,0.286205
4,John Kennedy,I didnt indicate. I did not advocate reducing ...,Pres,1960,1960-09-26,1,Why should the federal debt not be a priority ...,"Senator Kennedy, in connection with these prob...",0.779449,0.91361,0.766949,0.489217
5,John Kennedy,"– reducing the interest rate. In my judgment, ...",Pres,1960,1960-09-26,1,What economic policies should be implemented t...,"Senator, I believe in – in one of your speeche...",0.82219,0.903647,0.733769,0.318732
6,Richard Nixon,Im awfully glad you ge- got that question beca...,Pres,1960,1960-09-26,1,Why should the federal government avoid direct...,Mr. Vice President you mentioned schools and i...,0.735237,0.909232,0.706338,0.550105
7,John Kennedy,"If I may take the bills, we did pass in the Se...",Pres,1960,1960-09-26,1,What factors have contributed to the failure o...,"Senator, youve been promising the voters that ...",0.789777,0.941709,0.775012,0.431711
8,John Kennedy,"Well, I think theyre serious. I think its a ma...",Pres,1960,1960-09-26,1,What is the United States' approach to address...,"Senator Kennedy, on another subject, Communism...",0.831328,0.90486,0.736262,0.420495
9,Richard Nixon,Not at all. As a matter of fact your question ...,Pres,1960,1960-09-26,1,What factors contribute to the growth and prog...,Mr. Vice President uh – in one of your earlier...,0.780539,0.900908,0.70627,0.220092


In [4]:
# Total number of rows across all per-date frames.
sum(map(len, dfs.values()))

271

In [5]:
# Tab-separated summary, one line per date: position, date, row count, column count.
for position, debate_date in enumerate(dates):
    n_rows, n_cols = dfs[debate_date].shape
    print(f"{position}\t{debate_date}\t{n_rows}\t{n_cols}")

0	1960-09-26	10	12
1	1976-09-23	20	12
2	1980-09-21	12	12
3	1984-10-07	30	12
4	1988-09-25	23	12
5	1992-10-11	11	12
6	1996-10-06	21	12
7	2000-10-03	26	12
8	2004-09-30	18	12
9	2008-09-26	17	12
10	2012-10-03	25	12
11	2016-09-26	19	12
12	2020-09-29	39	12


In [6]:
# Pretrained "bert-base-uncased" tokenizer and encoder; both are read as
# globals by encode_text for the non-SBERT embedding path.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

In [7]:
# Sentence-Transformers model "all-MiniLM-L6-v2"; read as a global by
# encode_text when it is called with sbert=True.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
def encode_text(text, sbert=False, pooling_method="cls"):
    """Embed ``text`` with either the SBERT model or pooled BERT hidden states.

    Args:
        text: Text handed to ``sbert_model.encode`` (when ``sbert`` is true) or
            to ``bert_tokenizer`` (otherwise). On the BERT path the input is
            truncated to at most 512 tokens.
        sbert: When true, embed with the global ``sbert_model``;
            ``pooling_method`` is ignored in that case.
        pooling_method: How the BERT ``last_hidden_state`` is reduced along
            dimension 1 (the token axis):
            ``"cls"`` takes position 0, ``"max"`` the element-wise maximum,
            ``"mean"`` the element-wise mean. Max and mean run over every
            position the tokenizer produced, with no attention-mask weighting.

    Returns:
        A torch tensor. On the SBERT path it is moved to CPU and reshaped to a
        single row; on the BERT path it is the pooled hidden state
        (presumably shape ``(1, hidden_size)`` for a single string — TODO confirm).

    Raises:
        ValueError: If ``sbert`` is false and ``pooling_method`` is not one of
            ``"cls"``, ``"max"`` or ``"mean"``.
    """
    if sbert:
        return sbert_model.encode(text, convert_to_tensor=True).cpu().reshape(1, -1)

    # Validate before doing any model work: previously an unknown method fell
    # through every branch and surfaced as an UnboundLocalError on return.
    if pooling_method not in ("cls", "max", "mean"):
        raise ValueError(
            f"Unknown pooling_method {pooling_method!r}; "
            "expected one of 'cls', 'max', 'mean'"
        )

    inputs = bert_tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden_states = outputs.last_hidden_state
    if pooling_method == "cls":
        # Position 0 of the sequence is the [CLS] token's hidden state.
        return hidden_states[:, 0, :]
    if pooling_method == "max":
        return hidden_states.max(dim=1).values
    return hidden_states.mean(dim=1)

In [9]:
def calculate_similarity(text1, text2, sbert=False, pooling_method="cls"):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded through ``encode_text`` using the same ``sbert``
    and ``pooling_method`` settings. The two embeddings are compared with
    ``cosine_similarity`` and the ``[0, 0]`` entry of the resulting pairwise
    matrix is returned.
    """
    first_embedding, second_embedding = (
        encode_text(text, sbert=sbert, pooling_method=pooling_method)
        for text in (text1, text2)
    )
    pairwise = cosine_similarity(first_embedding, second_embedding)
    return pairwise[0, 0]

In [10]:
# Smoke test with the defaults: BERT embeddings, "cls" pooling.
calculate_similarity("test1", "test2")

np.float32(0.97823465)

In [11]:
# Smoke test of the BERT path with "max" pooling.
calculate_similarity("test1", "test2", pooling_method="max")

np.float32(0.9586725)

In [12]:
# Smoke test of the SBERT path (pooling_method is not used when sbert=True).
calculate_similarity("test1", "test2", sbert=True)

np.float32(0.8835417)

In [13]:
# Position in `dates` of the frame that the next cells display and score.
i = 0

In [14]:
# Show the frame selected by `i` before its similarity columns are recomputed.
dfs[dates[i]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert
0,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,What are the differences in party records and ...,"Senator, the Vice President in his campaign ha...",0.816746,0.924939,0.78205,0.177822
1,Richard Nixon,It would be rather difficult to cover them in ...,Pres,1960,1960-09-26,1,What recommendations have been made following ...,"Mr. Vice President, your campaign stresses the...",0.837739,0.929648,0.794823,0.347191
2,John Kennedy,"Well, because I think that if the federal gove...",Pres,1960,1960-09-26,1,What role should the federal government play i...,"Id like to ask this; Its a fact, I think, that...",0.78549,0.920841,0.752912,0.473955
3,Richard Nixon,"Well, I would suggest, Mr. Vanocur, that uh – ...",Pres,1960,1960-09-26,1,What is the appropriate role of presidential a...,"Uh – Mr. Vice President, since the question of...",0.779024,0.918841,0.681261,0.286205
4,John Kennedy,I didnt indicate. I did not advocate reducing ...,Pres,1960,1960-09-26,1,Why should the federal debt not be a priority ...,"Senator Kennedy, in connection with these prob...",0.779449,0.91361,0.766949,0.489217
5,John Kennedy,"– reducing the interest rate. In my judgment, ...",Pres,1960,1960-09-26,1,What economic policies should be implemented t...,"Senator, I believe in – in one of your speeche...",0.82219,0.903647,0.733769,0.318732
6,Richard Nixon,Im awfully glad you ge- got that question beca...,Pres,1960,1960-09-26,1,Why should the federal government avoid direct...,Mr. Vice President you mentioned schools and i...,0.735237,0.909232,0.706338,0.550105
7,John Kennedy,"If I may take the bills, we did pass in the Se...",Pres,1960,1960-09-26,1,What factors have contributed to the failure o...,"Senator, youve been promising the voters that ...",0.789777,0.941709,0.775012,0.431711
8,John Kennedy,"Well, I think theyre serious. I think its a ma...",Pres,1960,1960-09-26,1,What is the United States' approach to address...,"Senator Kennedy, on another subject, Communism...",0.831328,0.90486,0.736262,0.420495
9,Richard Nixon,Not at all. As a matter of fact your question ...,Pres,1960,1960-09-26,1,What factors contribute to the growth and prog...,Mr. Vice President uh – in one of your earlier...,0.780539,0.900908,0.70627,0.220092


In [15]:
# Score the frame selected by `i`: each similarity column is the cosine
# similarity between the row's "qud" and "question", computed with a
# different embedding variant. One progress bar is shown per column.
selected_df = dfs[dates[i]]
variant_kwargs_by_column = {
    "similarity_cls": {},
    "similarity_max": {"pooling_method": "max"},
    "similarity_mean": {"pooling_method": "mean"},
    "similarity_sbert": {"sbert": True},
}
for column, variant_kwargs in variant_kwargs_by_column.items():
    # The lambda is consumed inside this iteration, so capturing the loop
    # variable is safe.
    selected_df[column] = selected_df.progress_apply(
        lambda row: calculate_similarity(
            row["qud"], row["question"], **variant_kwargs
        ),
        axis=1,
    )

100%|██████████| 10/10 [00:00<00:00, 10.21it/s]
100%|██████████| 10/10 [00:00<00:00, 10.26it/s]
100%|██████████| 10/10 [00:00<00:00, 11.12it/s]
100%|██████████| 10/10 [00:01<00:00,  7.97it/s]


In [16]:
# Show the frame selected by `i` again, now with the recomputed similarity columns.
dfs[dates[i]]

Unnamed: 0,speaker,text,type,election_year,date,candidate,qud,question,similarity_cls,similarity_max,similarity_mean,similarity_sbert
0,John Kennedy,"Well, the Vice President and I came to the Con...",Pres,1960,1960-09-26,1,What are the differences in party records and ...,"Senator, the Vice President in his campaign ha...",0.816746,0.924939,0.78205,0.177822
1,Richard Nixon,It would be rather difficult to cover them in ...,Pres,1960,1960-09-26,1,What recommendations have been made following ...,"Mr. Vice President, your campaign stresses the...",0.837739,0.929649,0.794823,0.347191
2,John Kennedy,"Well, because I think that if the federal gove...",Pres,1960,1960-09-26,1,What role should the federal government play i...,"Id like to ask this; Its a fact, I think, that...",0.78549,0.920841,0.752912,0.473956
3,Richard Nixon,"Well, I would suggest, Mr. Vanocur, that uh – ...",Pres,1960,1960-09-26,1,What is the appropriate role of presidential a...,"Uh – Mr. Vice President, since the question of...",0.779024,0.918841,0.681261,0.286205
4,John Kennedy,I didnt indicate. I did not advocate reducing ...,Pres,1960,1960-09-26,1,Why should the federal debt not be a priority ...,"Senator Kennedy, in connection with these prob...",0.779449,0.91361,0.766949,0.489217
5,John Kennedy,"– reducing the interest rate. In my judgment, ...",Pres,1960,1960-09-26,1,What economic policies should be implemented t...,"Senator, I believe in – in one of your speeche...",0.82219,0.903647,0.733769,0.318732
6,Richard Nixon,Im awfully glad you ge- got that question beca...,Pres,1960,1960-09-26,1,Why should the federal government avoid direct...,Mr. Vice President you mentioned schools and i...,0.735237,0.909232,0.706338,0.550105
7,John Kennedy,"If I may take the bills, we did pass in the Se...",Pres,1960,1960-09-26,1,What factors have contributed to the failure o...,"Senator, youve been promising the voters that ...",0.789777,0.941709,0.775012,0.431711
8,John Kennedy,"Well, I think theyre serious. I think its a ma...",Pres,1960,1960-09-26,1,What is the United States' approach to address...,"Senator Kennedy, on another subject, Communism...",0.831328,0.90486,0.736262,0.420495
9,Richard Nixon,Not at all. As a matter of fact your question ...,Pres,1960,1960-09-26,1,What factors contribute to the growth and prog...,Mr. Vice President uh – in one of your earlier...,0.780539,0.900908,0.70627,0.220092


In [17]:
# Similarity column -> keyword arguments passed to calculate_similarity to
# select the embedding variant used for that column.
similarity_variants = {
    "similarity_cls": {},
    "similarity_max": {"pooling_method": "max"},
    "similarity_mean": {"pooling_method": "mean"},
    "similarity_sbert": {"sbert": True},
}

# Iterate over the dates themselves under a dedicated name: reusing `i` as the
# loop variable would overwrite the notebook-level `i` that earlier cells use
# to pick the frame they display and score.
for debate_date in tqdm(dates):
    debate_df = dfs[debate_date]
    for column, variant_kwargs in similarity_variants.items():
        # Each column holds the cosine similarity between the row's "qud" and
        # "question"; the lambda is consumed within this iteration, so
        # capturing the loop variable is safe.
        debate_df[column] = debate_df.apply(
            lambda row: calculate_similarity(
                row["qud"], row["question"], **variant_kwargs
            ),
            axis=1,
        )
    # NOTE(review): this writes into the directory the frames were loaded
    # from, and the frames only contain rows with a non-null "question", so
    # any other rows of an existing "<date>.csv" are dropped — confirm that
    # overwriting the inputs is intended.
    debate_df.to_csv(f"{data_base_dir}/{debate_date}.csv", index=False)

100%|██████████| 13/13 [01:48<00:00,  8.37s/it]
