In [1]:
# Install required packages
!pip install wordfreq sentence-transformers gensim pandarallel

# Import necessary libraries
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from gensim.models import Word2Vec
from scipy.stats import zscore
from wordfreq import top_n_list
from sentence_transformers import SentenceTransformer, models, util
import torch
import pickle
import argparse
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=100)



Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

  from tqdm.autonotebook import tqdm, trange
2024-06-23 11:58:03.624320: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-23 11:58:03.624414: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-23 11:58:03.767794: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


INFO: Pandarallel will run on 100 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
top100 = top_n_list('de', 100)

def count_top100(s):
    return len([1 for w in s.split() if w in set(top100)])


# Define functions and main script
def config(parser):
    parser.add_argument('--model_name_or_path', default='/kaggle/input/word2vec-new/word2vec_new.model')
    parser.add_argument('--input_file', default='/kaggle/input/parliament2/speeches_clean.csv')
    parser.add_argument('--output_file', default='speeches_new_emi.csv')
    parser.add_argument('--evidence_lexicon', default='/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv')
    parser.add_argument('--intuition_lexicon', default='/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv')
    parser.add_argument('--save_embeddings', action="store_true")
    parser.add_argument('--smoke_test', action="store_true")
    parser.add_argument('--text_column', type=str, default='speechContent')
    parser.add_argument('--compression_type', type=str, default='infer')
    parser.add_argument('--length_threshold', type=int, default=10)
    parser.add_argument('--tab_delimiter', action="store_true")
    parser.add_argument('--chunk_text', action="store_true")
    parser.add_argument('--min_chunk_length', type=int, default=50)
    parser.add_argument('--max_chunk_length', type=int, default=150)
    parser.add_argument('--id_column', type=str, default="id")
    return parser 

def preprocess(df, args):
    def remove_special_characters(text):
        pattern = r'[^a-zA-Z0-9äöüÄÖÜ\s]'
        clean_text = re.sub(pattern, '', text)
        return clean_text

    df['text'] = df['text'].astype(str)
    df['text'].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
    df['text'].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
    df['text'].replace(to_replace=r"__+", value=" ", regex=True, inplace=True)
    df['text'].replace(to_replace=r"\*\*+", value=" ", regex=True, inplace=True)
    df['text'].replace(to_replace=r"\s+", value=" ", regex=True, inplace=True)
    df['text'] = df['text'].progress_apply(remove_special_characters)
    df['length'] = df['text'].progress_apply(lambda x: len(x.split()))
    df = df[df['length'] > args.length_threshold]
    print(f"Average Speech length: {[df['length'].mean()]}")
    
    # Count top 100 words
    df['tokens_top100'] = df.text.progress_apply(count_top100)
    df['fraction_top100'] = df.tokens_top100 / df.length
    
    # Filter based on top 100 words fraction
    try:
        print('Sample of texts with low fraction of top 100 words:', df[df.fraction_top100 < 0.05].sample(10).text.tolist())
        df = df[df.fraction_top100 >= 0.05]
        print(f"Number of rows after top 100 words filtering: {len(df)}")
    except:
        print('No rows to sample for top 100 words filtering')

    if args.chunk_text:
        def chunk_by_length(x):
            words = x.split()
            if len(words) > args.max_chunk_length:
                chunks = [words[i:i+args.max_chunk_length] for i in range(0, len(words), args.max_chunk_length)]
                last_chunk_length = len(chunks[-1])
                if len(chunks) > 1 and last_chunk_length < args.min_chunk_length:
                    chunks[-2] = chunks[-2] + chunks[-1]
                    del chunks[-1]
                chunked = [" ".join(chunk) for chunk in chunks]
            else:
                chunked = [" ".join(words)]
            return chunked 

        df['text'] = df.text.progress_apply(chunk_by_length)
        df = df.explode("text", ignore_index=True)
        df = df.drop_duplicates(subset=['text']+[f'{args.id_column}'])
        df['chunk_length'] = df.text.progress_apply(lambda x: len(x.split()))
    return df

class CustomTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def tokenize(self, text):
        return [self.vocab[word] for word in text.lower().split() if word in self.vocab]

def create_custom_model(word2vec_model_path):
    # Load the Word2Vec model
    word2vec = Word2Vec.load(word2vec_model_path)
    embedding_weights = torch.FloatTensor(word2vec.wv.vectors)
    vocab = word2vec.wv.key_to_index

    # Create a custom tokenizer
    custom_tokenizer = CustomTokenizer(vocab)

    # Create WordEmbeddings module
    w2v_embeddings = models.WordEmbeddings(vocab, embedding_weights)
    w2v_embeddings.tokenizer = custom_tokenizer

    # Create Pooling module
    pooling_layer = models.Pooling(w2v_embeddings.get_word_embedding_dimension())

    # Create SentenceTransformer model
    custom_model = SentenceTransformer(modules=[w2v_embeddings, pooling_layer])

    return custom_model

def get_embeddings(texts, model):
    # Ensure texts is a list of strings
    if isinstance(texts, str):
        texts = [texts]
        
    # Set device to CUDA if available, else CPU
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    
    # Move model to the specified device
    model = model.to(device)
        

    # Encode the text using the custom model
    try:
        corpus_embeddings = model.encode(texts, batch_size=1024, show_progress_bar=True, convert_to_tensor=True, device=device)
        assert len(corpus_embeddings) == len(texts)
        return corpus_embeddings
    except RuntimeError as e:
        if 'out of memory' in str(e):
            print("Out of memory error. Switching to CPU.")
            device = torch.device("cpu")
            model = model.to(device)
            corpus_embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True, convert_to_tensor=True, device=device)
            assert len(corpus_embeddings) == len(texts)
            return corpus_embeddings
        else:
            raise e
            return corpus_embeddings


def length_adjustment_bin(df, length_column='chunk_length', minimum_length=10):
    bins = range(minimum_length, df[length_column].max()+10, 10)
    df[f'{length_column}_bin'] = pd.cut(df[length_column], bins=bins)
    df['evidence_mean'] = df.groupby(f'{length_column}_bin')['avg_evidence_score'].transform('mean')
    df['evidence_adj'] = df['avg_evidence_score'] - df['evidence_mean']
    df['intuition_mean'] = df.groupby(f'{length_column}_bin')['avg_intuition_score'].transform('mean')
    df['intuition_adj'] = df['avg_intuition_score'] - df['intuition_mean']
    return df

def evidence_minus_intuition_score(df):
    df[['evidence_z', 'intuition_z']] = df[['evidence_adj', 'intuition_adj']].apply(zscore)
    df['evidence_minus_intuition_score'] = df['evidence_z'] - df['intuition_z']
    return df

def main(args):
    tqdm.pandas()
    delimiter = '\t' if args.tab_delimiter else None
    if args.smoke_test:
        df = pd.read_csv(args.input_file, nrows=500_000, compression=args.compression_type, delimiter=delimiter, dtype={'speech_id':object})
    else:
        df = pd.read_csv(args.input_file, compression=args.compression_type, delimiter=delimiter, dtype={'speech_id':object})
    if args.text_column != 'text':
        df.rename(columns = {args.text_column:'text'}, inplace = True)
    df['text'] = df['text'].astype(str)
    df = df.drop_duplicates(subset=['text']+[f'{args.id_column}'])
    try:
        df = df.drop(['speechTokens'], axis=1)
    except:
        pass
    df = preprocess(df, args)
    print('After pre-processing:', len(df[f'{args.id_column}'].unique()))
    
    custom_model = create_custom_model(args.model_name_or_path)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    evidence_sim = torch.Tensor().to(torch.device("cuda") if torch.cuda.is_available() else "cpu")
    intuition_sim = torch.Tensor().to(torch.device("cuda") if torch.cuda.is_available() else "cpu")
    chunk_size = 500_000
    list_df = [df[idx:idx+chunk_size] for idx in range(0, len(df), chunk_size)]
    
    for batch in tqdm(list_df):
        batch_text = batch['text']
        batch_text = list(batch_text)
        text_embeddings = get_embeddings(batch_text, custom_model)       

        if args.save_embeddings:
            output_fn = args.output_file.replace(".csv", ".pkl")
            with open(output_fn, "wb") as fout:
                pickle.dump({'text': batch_text, 'embeddings': text_embeddings}, fout, protocol=pickle.HIGHEST_PROTOCOL)

        evidence_keywords = pd.read_csv(args.evidence_lexicon) 
        evidence_keywords = list(evidence_keywords['evidence'])  
        evidence_embeddings = get_embeddings(evidence_keywords, custom_model)
        evidence_embeddings = torch.mean(evidence_embeddings, dim=0).to(device)

        intuition_keywords = pd.read_csv(args.intuition_lexicon) 
        intuition_keywords = list(intuition_keywords['intuition'][0:38])  
        intuition_embeddings = get_embeddings(intuition_keywords, custom_model)
        intuition_embeddings = torch.mean(intuition_embeddings, dim=0).to(device)

        evidence_sim = torch.cat((evidence_sim, util.cos_sim(text_embeddings, evidence_embeddings)), 0)
        intuition_sim = torch.cat((intuition_sim, util.cos_sim(text_embeddings, intuition_embeddings)), 0)

    avg_evidence_score = np.average(evidence_sim.cpu().numpy(), axis=1)  
    avg_intuition_score = np.average(intuition_sim.cpu().numpy(), axis=1)  
    df['avg_evidence_score'] = avg_evidence_score
    df['avg_intuition_score'] = avg_intuition_score

    length_column = 'chunk_length' if args.chunk_text else 'length'
    df = length_adjustment_bin(df, length_column=length_column, minimum_length=args.length_threshold)
    df = evidence_minus_intuition_score(df) 
    print(df.evidence_minus_intuition_score.head())
    print(df.evidence_minus_intuition_score.tail())
    df.to_csv(args.output_file, index=False, compression=args.compression_type)

# Set up arguments manually
class Args:
    model_name_or_path = '/kaggle/input/word2vec-new/word2vec_new.model'
    input_file = '/kaggle/input/parliament2/speeches_clean.csv'
    output_file = 'speeches_new_emi.csv'
    evidence_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    intuition_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    save_embeddings = False
    smoke_test = False
    text_column = 'speechContent'
    compression_type = 'infer'
    length_threshold = 10
    tab_delimiter = False
    chunk_text = True
    min_chunk_length = 50
    max_chunk_length = 150
    id_column = "id"

args = Args()

# Run the main function
main(args)

# Set up arguments manually
class Args:
    model_name_or_path = '/kaggle/input/word2vec-new/word2vec_new.model'
    input_file = '/kaggle/input/parliament-old/merged_parliament_texts.csv'
    output_file = 'speeches_old_emi.csv'
    evidence_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    intuition_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    save_embeddings = False
    smoke_test = False
    text_column = 'sentence'
    compression_type = 'infer'
    length_threshold = 10
    tab_delimiter = False
    chunk_text = True
    min_chunk_length = 50
    max_chunk_length = 150
    id_column = "id"
​
args = Args()
​
# Run the main function
main(args)

In [3]:
# Set up arguments manually
class Args:
    model_name_or_path = '/kaggle/input/word2vec-new/word2vec_new.model'
    input_file = '/kaggle/input/german-parliament/speeches_all.csv'
    output_file = 'speeches_all_emi.csv'
    evidence_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    intuition_lexicon = '/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv'
    save_embeddings = False
    smoke_test = False
    text_column = 'sentence'
    compression_type = 'infer'
    length_threshold = 10
    tab_delimiter = False
    chunk_text = True
    min_chunk_length = 50
    max_chunk_length = 150
    id_column = "id"

args = Args()

# Run the main function
main(args)

  df = pd.read_csv(args.input_file, compression=args.compression_type, delimiter=delimiter, dtype={'speech_id':object})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].replace(to_replace=r"\-\

Average Speech length: [52.809412649791916]


100%|██████████| 5388642/5388642 [13:57<00:00, 6433.89it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens_top100'] = df.text.progress_apply(count_top100)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fraction_top100'] = df.tokens_top100 / df.length


Sample of texts with low fraction of top 100 words: [' Anträge Nr 1273 1274 1276 1280 282 1284 berichtigt 1285 1289 290 1298 1308 1344 1363 1374 375', '89  Art 90  Art 91  Art 92  Art 93  Art 94  Art 95  Kap II Abschnitt I Art 96  Art 97  Art 98 Art 99  Art 100  Art 101  Art 102 Art', 'äroit äoxtsr ää Isur wujorits xonr 1a nationalits äs Isur süoix', 'Drinks Frau Kühler Pohlmann Schiffer Wein hausen Müller Breslau Schulz Ostpr Dr Braun Franken Dr Pfeiffer', 'Ja Ja Ja Ja Ja Ja Ja Ja Ja Ja Ja ', 'Hitze Schiffer Schirmer Trimborn Dombek Nowicki Hirsch Jckler Marquart Schwabach Haehnle vr', 'jenigen VV ee rr ww aa II tt uu nn gg', '4  5  6  7  8  9 io  11 12  13  14  15 16  17  18  19  Angenommen', 'Städtische KaiserinAugusteViktoria Schule beschlagnahmt Klassenzahl 19 Schülerzahl 542 Neu Staatliches Lehrerseminar beschlagnahmt', '33  34  35  36  37  38  38a  38 d  38  38 ä  Angenommen']
Number of rows after top 100 words filtering: 5385600


100%|██████████| 5385600/5385600 [00:55<00:00, 97093.25it/s]
100%|██████████| 6165076/6165076 [00:36<00:00, 169653.36it/s]


After pre-processing: 354121


  0%|          | 0/13 [00:00<?, ?it/s]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  8%|▊         | 1/13 [00:25<05:00, 25.02s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 15%|█▌        | 2/13 [00:49<04:34, 24.98s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 23%|██▎       | 3/13 [01:14<04:08, 24.85s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 31%|███       | 4/13 [01:39<03:44, 24.95s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 38%|███▊      | 5/13 [02:05<03:20, 25.10s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 46%|████▌     | 6/13 [02:30<02:57, 25.33s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 54%|█████▍    | 7/13 [02:55<02:31, 25.20s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 62%|██████▏   | 8/13 [03:21<02:06, 25.33s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 69%|██████▉   | 9/13 [03:46<01:41, 25.34s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 77%|███████▋  | 10/13 [04:12<01:16, 25.44s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 85%|████████▍ | 11/13 [05:36<01:26, 43.41s/it]

Batches:   0%|          | 0/489 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 92%|█████████▏| 12/13 [07:12<00:59, 59.46s/it]

Batches:   0%|          | 0/162 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 13/13 [07:44<00:00, 35.75s/it]
  df['evidence_mean'] = df.groupby(f'{length_column}_bin')['avg_evidence_score'].transform('mean')
  df['intuition_mean'] = df.groupby(f'{length_column}_bin')['avg_intuition_score'].transform('mean')


0    1.386626
1   -0.575403
2   -1.105320
3   -0.138440
4   -0.392951
Name: evidence_minus_intuition_score, dtype: float32
6165659   -1.325621
6165660   -0.639145
6165661   -1.652401
6165662   -0.441201
6165663   -1.337247
Name: evidence_minus_intuition_score, dtype: float32
