In [1]:
!pip install wordfreq
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from gensim.models import Word2Vec
from scipy.stats import zscore
from wordfreq import top_n_list
from sklearn.metrics.pairwise import cosine_similarity

Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading locate-1.1.1-py3-none-any.whl (5.4 kB)
Installing collected packages: locate, ftfy, wordfreq
Successfully installed ftfy-6.2.0 locate-1.1.1 wordfreq-3.1.1


In [2]:
# Example: Load the model and find similar words
model = Word2Vec.load("/kaggle/input/word2vec-new/word2vec_new.model")

In [3]:
df_dicts = pd.read_csv("/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv")
df = pd.read_csv("/kaggle/input/german-parliament/speeches_all.csv")
df.rename(columns={"sentence": "speechContent"}, inplace=True)

  df = pd.read_csv("/kaggle/input/german-parliament/speeches_all.csv")


In [4]:
# Define your evidence and intuition keywords
evidence_keywords = df_dicts["evidence"].tolist()
intuition_keywords = df_dicts.iloc[0:38]["intuition"].tolist()

In [5]:
# Ensure to use tqdm.pandas() to add the progress_apply method
tqdm.pandas()


# Function to preprocess the text data
def preprocess(df, length_threshold=10, chunk_text=False, min_chunk_length=50, max_chunk_length=150):
    
    def remove_special_characters(text):
        # Define a regular expression pattern that matches all non-alphanumeric characters except for ä, ö, ü
        pattern = r'[^a-zA-Z0-9äöüÄÖÜ\s]'
        # Replace special characters with an empty string
        clean_text = re.sub(pattern, '', text)
        return clean_text

    # Clean text
    df['speechContent'] = df["speechContent"].astype(str)
    df["speechContent"].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"__+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\*\*+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\s+", value=" ", regex=True, inplace=True)
    df["speechContent"] = df["speechContent"].progress_apply(remove_special_characters)
    
    df['length'] = df.speechContent.progress_apply(lambda x: len(x.split()))
    
    
    df = df[df['length'] > length_threshold]
    
    print(f"Average Speech length: {[df['length'].mean()]}")

    # Optional: chunk text into smaller parts
    if chunk_text:
        def chunk_by_length(x):
            words = x.split()
            if len(words) > max_chunk_length:
                chunks = [words[i:i+max_chunk_length] for i in range(0, len(words), max_chunk_length)]
                last_chunk_length = len(chunks[-1])
                if len(chunks) > 1 and last_chunk_length < min_chunk_length:
                    chunks[-2] = chunks[-2] + chunks[-1]
                    del chunks[-1]
                chunked = [" ".join(chunk) for chunk in chunks]
            else:
                chunked = [" ".join(words)]
            return chunked 

        df['speechContent'] = df.speechContent.progress_apply(chunk_by_length)
        df = df.explode("speechContent", ignore_index=True)    
        df = df.drop_duplicates(subset=['speechContent']+['id'])
        df['chunk_length'] = df.speechContent.progress_apply(lambda x: len(x.split()))
    
    return df


# Compute average word vectors for documents
def document_vector(model, tokens):
    vector = np.mean([model.wv[token] for token in tokens if token in model.wv], axis=0)
    return vector


def compute_similarity_scores(df, model, evidence_keywords, intuition_keywords):
    evidence_embeddings = np.mean([model.wv[word] for word in evidence_keywords if word in model.wv], axis=0).reshape(1, -1)
    intuition_embeddings = np.mean([model.wv[word] for word in intuition_keywords if word in model.wv], axis=0).reshape(1, -1)
    
    tqdm.pandas()
    
    df['document_vector'] = df['speechContent'].progress_apply(lambda tokens: document_vector(model, tokens).reshape(1, -1))
    df['document_vector'] = df['document_vector'].apply(lambda vec: vec.reshape(1, -1))
    
    df['evidence_similarity'] = df['document_vector'].apply(lambda vec: cosine_similarity(vec, evidence_embeddings)[0][0])
    df['intuition_similarity'] = df['document_vector'].apply(lambda vec: cosine_similarity(vec, intuition_embeddings)[0][0])

    return df

# Adjust scores based on length
def length_adjustment_bin(df, length_column='chunk_length', minimum_length=10):
    bins = range(minimum_length, df[length_column].max()+10, 10)
    df[f'{length_column}_bin'] = pd.cut(df[length_column], bins=bins)
    df['evidence_mean'] = df.groupby(f'{length_column}_bin')['evidence_similarity'].transform('mean')
    df['evidence_adj'] = df['evidence_similarity'] - df['evidence_mean']
    df['intuition_mean'] = df.groupby(f'{length_column}_bin')['intuition_similarity'].transform('mean')
    df['intuition_adj'] = df['intuition_similarity'] - df['intuition_mean']
    return df

# Compute evidence minus intuition score
def evidence_minus_intuition_score(df):
    df[['evidence_z', 'intuition_z']] = df[['evidence_adj', 'intuition_adj']].progress_apply(zscore)
    df['evidence_minus_intuition_score'] = df['evidence_z'] - df['intuition_z']
    return df


In [6]:
df = preprocess(df, chunk_text=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Average Speech length: [52.78671948376835]


100%|██████████| 5391688/5391688 [00:53<00:00, 100280.46it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['speechContent'] = df.speechContent.progress_apply(chunk_by_length)
100%|██████████| 6168123/6168123 [00:36<00:00, 169272.95it/s]


In [7]:
df = compute_similarity_scores(df, model, evidence_keywords, intuition_keywords)

100%|██████████| 6168123/6168123 [1:19:51<00:00, 1287.22it/s]


In [8]:
# Adjust scores based on length
df = length_adjustment_bin(df)

# Compute evidence minus intuition score
df = evidence_minus_intuition_score(df)

df = df.drop(['document_vector'], axis=1)


df.to_csv('speeches_all_w2v_emi.csv', index=False)

  df['evidence_mean'] = df.groupby(f'{length_column}_bin')['evidence_similarity'].transform('mean')
  df['intuition_mean'] = df.groupby(f'{length_column}_bin')['intuition_similarity'].transform('mean')
100%|██████████| 2/2 [00:00<00:00, 16.43it/s]
