In [1]:
!pip install wordfreq
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from gensim.models import Word2Vec
from scipy.stats import zscore
from wordfreq import top_n_list

Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.2.0-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hDownloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading locate-1.1.1-py3-none-any.whl (5.4 kB)
[0mInstalling collected packages: locate, ftfy, wordfreq
Successfully installed ftfy-6.2.0 locate-1.1.1 wordfreq-3.1.1


In [2]:
# Example: Load the model and find similar words
model = Word2Vec.load("/kaggle/input/word2vec-new/word2vec_new.model")


In [3]:
model

<gensim.models.word2vec.Word2Vec at 0x7845fc07dcf0>

In [4]:
df_dicts = pd.read_csv("/kaggle/input/dictionary2/PRODEMINFO_German_keywords.csv")
df_new = pd.read_csv("/kaggle/input/parliament2/speeches_new.csv")
df_old = pd.read_csv("/kaggle/input/parliament-old/speeches_old.csv")
df_old.rename(columns={"sentence": "speechContent"}, inplace=True)

In [6]:
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10410472 entries, 0 to 10410471
Data columns (total 8 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   id             int64 
 1   begin          int64 
 2   end            int64 
 3   speechContent  object
 4   dateDay        int64 
 5   dateMonth      int64 
 6   dateYear       int64 
 7   timestamp      int64 
dtypes: int64(7), object(1)
memory usage: 635.4+ MB


In [8]:
# Ensure to use tqdm.pandas() to add the progress_apply method
tqdm.pandas()


# Function to preprocess the text data
def preprocess(df, length_threshold=10, chunk_text=False, min_chunk_length=50, max_chunk_length=150):
    
    def remove_special_characters(text):
        # Define a regular expression pattern that matches all non-alphanumeric characters except for ä, ö, ü
        pattern = r'[^a-zA-Z0-9äöüÄÖÜ\s]'
        # Replace special characters with an empty string
        clean_text = re.sub(pattern, '', text)
        return clean_text

    # Clean text
    df['speechContent'] = df["speechContent"].astype(str)
    df["speechContent"].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"__+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\*\*+", value=" ", regex=True, inplace=True)
    df["speechContent"].replace(to_replace=r"\s+", value=" ", regex=True, inplace=True)
    df["speechContent"] = df["speechContent"].progress_apply(remove_special_characters)
    
    df['length'] = df.speechContent.progress_apply(lambda x: len(x.split()))
    
    
    df = df[df['length'] > length_threshold]
    
    print(f"Average Speech length: {[df['length'].mean()]}")

    # Optional: chunk text into smaller parts
    if chunk_text:
        def chunk_by_length(x):
            words = x.split()
            if len(words) > max_chunk_length:
                chunks = [words[i:i+max_chunk_length] for i in range(0, len(words), max_chunk_length)]
                last_chunk_length = len(chunks[-1])
                if len(chunks) > 1 and last_chunk_length < min_chunk_length:
                    chunks[-2] = chunks[-2] + chunks[-1]
                    del chunks[-1]
                chunked = [" ".join(chunk) for chunk in chunks]
            else:
                chunked = [" ".join(words)]
            return chunked 

        df['speechContent'] = df.speechContent.progress_apply(chunk_by_length)
        df = df.explode("speechContent", ignore_index=True)    
        df = df.drop_duplicates(subset=['speechContent']+['id'])
        df['chunk_length'] = df.speechContent.progress_apply(lambda x: len(x.split()))
    
    return df


# Compute average word vectors for documents
def document_vector(model, tokens):
    vector = np.mean([model.wv[token] for token in tokens if token in model.wv], axis=0)
    return vector

def compute_similarity_scores(df, model, evidence_keywords, intuition_keywords):
    evidence_embeddings = np.mean([model.wv[word] for word in evidence_keywords if word in model.wv], axis=0)
    intuition_embeddings = np.mean([model.wv[word] for word in intuition_keywords if word in model.wv], axis=0)
    
    tqdm.pandas()
    
    df['document_vector'] = df['speechContent'].progress_apply(lambda tokens: document_vector(model, tokens))
    df['evidence_similarity'] = df['document_vector'].progress_apply(lambda vec: np.dot(vec, evidence_embeddings))
    df['intuition_similarity'] = df['document_vector'].progress_apply(lambda vec: np.dot(vec, intuition_embeddings))

    return df

# Adjust scores based on length
def length_adjustment_bin(df, length_column='length', minimum_length=10):
    bins = range(minimum_length, df[length_column].max()+10, 10)
    df[f'{length_column}_bin'] = pd.cut(df[length_column], bins=bins)
    df['evidence_mean'] = df.groupby(f'{length_column}_bin')['evidence_similarity'].transform('mean')
    df['evidence_adj'] = df['evidence_similarity'] - df['evidence_mean']
    df['intuition_mean'] = df.groupby(f'{length_column}_bin')['intuition_similarity'].transform('mean')
    df['intuition_adj'] = df['intuition_similarity'] - df['intuition_mean']
    return df

# Compute evidence minus intuition score
def evidence_minus_intuition_score(df):
    df[['evidence_z', 'intuition_z']] = df[['evidence_adj', 'intuition_adj']].progress_apply(zscore)
    df['evidence_minus_intuition_score'] = df['evidence_z'] - df['intuition_z']
    return df


In [7]:
# Preprocess the data
df_new = preprocess(df_new, chunk_text=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Average Speech length: [429.8569969857363]


100%|██████████| 346685/346685 [00:20<00:00, 16952.71it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['speechContent'] = df.speechContent.progress_apply(chunk_by_length)
100%|██████████| 1126687/1126687 [00:14<00:00, 77659.27it/s]


In [9]:
df_old = preprocess(df_old, chunk_text=True)

df_old

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\.\.+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["speechContent"].replace(to_replace=r"\-\-+", value=" ", regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method 

Average Speech length: [26.875018508413177]


100%|██████████| 5045003/5045003 [00:41<00:00, 121792.30it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['speechContent'] = df.speechContent.progress_apply(chunk_by_length)
100%|██████████| 5041436/5041436 [00:26<00:00, 189415.92it/s]


Unnamed: 0,id,begin,end,speechContent,dateDay,dateMonth,dateYear,timestamp,length,chunk_length
0,121,221,327,33 Ergebni der Verloosung der Wahlakten an die...,3,3,1887,-2613945600000,13,13
1,145,347,456,Diskussion zur Geschäftsordnung betreffend die...,3,3,1887,-2613945600000,13,13
2,169,496,586,4 6 88 Staatssekretär des Innern Staatsministe...,3,3,1887,-2613945600000,11,11
3,241,706,831,1 6 77 von Kardorff 1 1 7 88 Feststellung der ...,3,3,1887,-2613945600000,17,17
4,277,890,982,Die Sitzung wird um 11 Uhr 35 Minuten durch de...,3,3,1887,-2613945600000,16,16
...,...,...,...,...,...,...,...,...,...,...
5045065,32353,266348,266524,Nunmehr hätten wir über den Antrag des Herrn A...,30,9,1929,-1270339200000,25,25
5045066,32413,266625,266750,Ich bitte die Damen und Herren die dem Antrag ...,30,9,1929,-1270339200000,22,22
5045067,32521,266885,266978,Der Herr Abgeordnete Graf v Westarp hat beantr...,30,9,1929,-1270339200000,16,16
5045068,32545,267008,267153,wenn wir morgen die zweite Lesung des Gesetzes...,30,9,1929,-1270339200000,24,24


In [None]:
# Define your evidence and intuition keywords
evidence_keywords = df_dicts.iloc[0:38]["intuition"].tolist()
intuition_keywords = df_dicts["evidence"].tolist()

# Compute similarity scores
df_new = compute_similarity_scores(df_new, model, evidence_keywords, intuition_keywords)

In [10]:
evidence_keywords = df_dicts.iloc[0:38]["intuition"].tolist()
intuition_keywords = df_dicts["evidence"].tolist()

df_old = compute_similarity_scores(df_old, model, evidence_keywords, intuition_keywords)

100%|██████████| 5041436/5041436 [1:01:21<00:00, 1369.54it/s]
100%|██████████| 5041436/5041436 [00:24<00:00, 201807.57it/s]
100%|██████████| 5041436/5041436 [00:24<00:00, 205203.99it/s]


In [12]:
# Adjust scores based on length
df_new = length_adjustment_bin(df_new)

# Compute evidence minus intuition score
df_new = evidence_minus_intuition_score(df_new)

df_new.to_csv('speeches_new_emi.csv', index=False)

  df['evidence_mean'] = df.groupby(f'{length_column}_bin')['evidence_similarity'].transform('mean')
  df['intuition_mean'] = df.groupby(f'{length_column}_bin')['intuition_similarity'].transform('mean')
100%|██████████| 2/2 [00:00<00:00, 59.29it/s]


2


In [11]:
df_new

Unnamed: 0,id,session,electoralTerm,firstName,lastName,politicianId,speechContent,factionId,documentUrl,positionShort,positionLong,date,length,chunk_length,document_vector,evidence_similarity,intuition_similarity
0,11,4,1,,blücher,11000202,Herr Präsident meine Damen und Herren Es ist m...,13,https://dip21.bundestag.de/dip21/btp/01/01004.pdf,Member of Parliament,,1949-09-15,225,150,"[0.8367558, 0.57555866, -0.24100341, 0.5972533...",-8.102803,-13.485657
1,11,4,1,,blücher,11000202,sich um Entscheidungen handelt die letzten End...,13,https://dip21.bundestag.de/dip21/btp/01/01004.pdf,Member of Parliament,,1949-09-15,225,75,"[0.7974793, 0.50656307, -0.14949527, 0.5590210...",-8.514908,-12.596498
2,13,4,1,,schmid,11001993,Meine Damen und Herren ich habe den Eindruck d...,23,https://dip21.bundestag.de/dip21/btp/01/01004.pdf,Member of Parliament,,1949-09-15,351,150,"[0.8814791, 0.6249713, -0.26745096, 0.6937281,...",-8.070464,-12.601604
3,13,4,1,,schmid,11001993,einen sehr triftigen Grund wir werden für die ...,23,https://dip21.bundestag.de/dip21/btp/01/01004.pdf,Member of Parliament,,1949-09-15,351,150,"[0.87738633, 0.519609, -0.23963648, 0.6217491,...",-7.873611,-13.264590
4,13,4,1,,schmid,11001993,erst am Dienstag verlesen bekommen verlieren w...,23,https://dip21.bundestag.de/dip21/btp/01/01004.pdf,Member of Parliament,,1949-09-15,351,51,"[0.9299589, 0.647783, -0.39267546, 0.7516034, ...",-6.589839,-11.998529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126682,1060955,228,19,Helge,Lindh,11004802,Gut ich beschränke mich auf einen Satz Ich pas...,23,https://dip21.bundestag.de/dip21/btp/19/19228.pdf,Member of Parliament,,2021-05-07,107,107,"[0.9366629, 0.7509955, -0.33833253, 0.69938385...",-8.319621,-13.256542
1126683,1060957,228,19,Tankred,Schipanski,11004143,Vielen Dank Frau Präsidentin Liebe Kolleginnen...,4,https://dip21.bundestag.de/dip21/btp/19/19228.pdf,Member of Parliament,,2021-05-07,646,150,"[0.8032122, 0.5487522, -0.28793874, 0.6605829,...",-7.550496,-12.381764
1126684,1060957,228,19,Tankred,Schipanski,11004143,von Meinungsfreiheit und Demokratie Philipp Am...,4,https://dip21.bundestag.de/dip21/btp/19/19228.pdf,Member of Parliament,,2021-05-07,646,150,"[0.81657255, 0.64838105, -0.25032926, 0.650378...",-8.090214,-12.540377
1126685,1060957,228,19,Tankred,Schipanski,11004143,beides ist ausbaufähig die Beispiele von Anke ...,4,https://dip21.bundestag.de/dip21/btp/19/19228.pdf,Member of Parliament,,2021-05-07,646,150,"[0.823944, 0.54016495, -0.22172444, 0.6115701,...",-8.374198,-12.955227


In [None]:
# Adjust scores based on length
df_old = length_adjustment_bin(df_old)

# Compute evidence minus intuition score
df_old = evidence_minus_intuition_score(df_old)

df_old.to_csv('speeches_old_emi.csv', index=False)

  df['evidence_mean'] = df.groupby(f'{length_column}_bin')['evidence_similarity'].transform('mean')
  df['intuition_mean'] = df.groupby(f'{length_column}_bin')['intuition_similarity'].transform('mean')
100%|██████████| 2/2 [00:00<00:00, 17.57it/s]


In [2]:
from IPython.display import FileLink
FileLink(r'speeches_old_emi.csv')