In [1]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
import json
import random
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
import re
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [2]:
corpus_file = 'knesset_corpus.jsonl'
punctuations = '",./<>?;:\'[]{}\\|`~!@#$%^&*()-_=+'
tokenized_sentences = []

# Load the corpus into a list of tokenized sentences (list of lists)
with open(corpus_file, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        sentence_text = data.get('sentence_text', '')
        if sentence_text:
            tokenized_sentence = [
                ''.join(char for char in word if char not in punctuations)      # Remove punctuation
                for word in sentence_text.split() if not word.isdigit()         # Split by spaces + token is not a number
            ]
            tokenized_sentences.append([word for word in tokenized_sentence if word])

In [3]:
# Fit a model using the sentences we tokenized and save it
model = Word2Vec(sentences=tokenized_sentences, vector_size=75, window=5, min_count=1, workers=4)
model.save("knesset_word2vec.model")

In [4]:
word_vectors = model.wv
print(word_vectors['ישראל'])

[-1.169255   -0.16793862 -2.0645444  -1.804288   -1.3457613   0.64175487
 -2.0476382   0.25816572  1.2085469  -0.503395    0.5615394   0.41585433
  4.2621164   2.3613327   2.3503318   1.3850614  -2.84321    -5.2625837
  5.4438844   0.4587705  -5.7069845  -4.0692124  -2.0300946   0.24550441
  1.0338215   1.4490489  -5.5016985  -1.462781   -2.3541327  -0.56446946
  2.6635463   1.3847193  -0.434488   -3.8004498  -0.7828321   1.9544717
  1.5978425   2.8863485  -4.243268   -3.8361502   1.0992993   0.24070765
  0.01129832  2.3179395  -1.6477921   0.7794569  -0.70139813 -3.9775765
 -0.92505395  5.6658187   3.2278452   1.9135427   2.2281106  -2.7181847
 -0.29920474 -2.4613624   0.4050948  -0.16124117  0.02799423 -3.6544554
  7.354597    2.6827607  -1.7322657  -1.0658965   3.650017   -3.4236054
  2.962219    2.0569234   4.4817      3.7506545  -2.7776325  -0.01020103
 -0.6548142  -3.4621594  -0.63355035]


1. Increasing the vector size allows us to capture more information about each word as it has more dimensionality.
2. PROBLEMS IN THIS SPECIFIC CORPUS

In [5]:
words_to_check = ['ישראל', 'גברת', 'ממשלה', 'חבר', 'בוקר', 'מים', 'אסור', 'רשות', 'זכויות']

# We decided to utilize a dictionary to store the most similar words for each word
similar_words = {}

# Find the most similar words for each word
for word in words_to_check:
    if word in word_vectors:
        # Calculate similarity between the target word and all other words in the vocabulary
        similarity_scores = {other_word: word_vectors.similarity(word, other_word)
                             for other_word in word_vectors.index_to_key if other_word != word}
        
        # Sort by the scores and pick the top 5
        most_similar = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)[:5]
        similar_words[word] = most_similar

with open('knesset_similar_words.txt', 'w', encoding='utf-8') as f:
    for word, similar in similar_words.items():
        # Format the top 5 for the current word
        similar_str = ', '.join([f"({sim_word}, {score:.4f})" for sim_word, score in similar])
        f.write(f"{word}: {similar_str}\n")

In [6]:
def average_sentence_embedding(sentence):
    # Get the word embedding vectors for the sentence
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

sentence_embeddings = []
for sentence in tokenized_sentences:
    sentence_embeddings.append(average_sentence_embedding(sentence))

In [8]:
# The sentences we hand picked for this task, they seem to have a good structure
# and it is possible to infer their meanings, they were also not too long and not too short
selected_indices = [18,     22,     94,     3142,   3220, 
                    9277,   62100,  79100,  93123,  101293]

# Calculate cosine similarity between sentence embeddings
similar_sentences = {}
for idx in selected_indices:
    current_embedding = sentence_embeddings[idx].reshape(1, -1)
    similarities = cosine_similarity(current_embedding, sentence_embeddings)
    most_similar_idx = similarities.argsort()[0][-2]
    similar_sentences[idx] = most_similar_idx

# Load the original sentences with punctuations
original_sentences = []
with open(corpus_file, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        original_sentences.append(data.get('sentence_text', ''))

with open('knesset_similar_sentences.txt', 'w', encoding='utf-8') as f:
    for idx, similar_idx in similar_sentences.items():
        current_sentence = original_sentences[idx]
        similar_sentence = original_sentences[similar_idx]
        f.write(f"{current_sentence}: most similar sentence: {similar_sentence}\n")

### STOPPED HERE

In [9]:
sentences_to_check = {
    r'בעוד מספר דקות נתחיל את הדיון בנושא השבת החטופים .': ['דקות','הדיון'],
    r'בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .': ['הוועדה','אני', 'ההסכם'],
    r'בוקר טוב , אני פותח את הישיבה .': ['בוקר', 'פותח'],
    r'שלום , אנחנו שמחים להודיע שחברינו ה יקר קיבל קידום .': ['שלום', 'שמחים', 'היקר','קידום'],
    r'אין מניעה להמשיך לעסוק ב נושא .': ['מניעה']
}

prompts = {
    r'בעוד מספר דקות נתחיל את הדיון בנושא השבת החטופים .': ['רבים','הרבה', 'מספר'],
    r'בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .': ['הראש','היושבת'],
    r'בוקר טוב , אני פותח את הישיבה .': ['זמן'],
    r'שלום , אנחנו שמחים להודיע שחברינו ה יקר קיבל קידום .': ['פותח'],
    r'אין מניעה להמשיך לעסוק ב נושא .': []
}

def replace_with_similar_words(sentence_dict, model):
    replaced_sentences = []
    for sentence, words in sentence_dict.items():
        new_sentence = sentence
        for word in words:
            if word in model.wv:
                similar_word = model.wv.most_similar(word, topn=1)[0][0]
                new_sentence = new_sentence.replace(word, similar_word)
        replaced_sentences.append(new_sentence)
    return replaced_sentences

    # Replace words in the sentences
replaced_sentences_with_prompts = replace_with_similar_words(sentences_to_check, model)
for sentence in replaced_sentences_with_prompts:
    print(sentence)

בעוד מספר הדקות נתחיל את הנושא בנושא השבת החטופים .
בתור יושבת ראש הקואליציה , ואני מוכנה להאריך את המסמך באותם תנאים .
ולי טוב , אני מפנה את הישיבה .
שמחון , אנחנו משתדלים להודיע שחברינו ה יקר קיבל באגף .
אין אינטגרציה להמשיך לעסוק ב נושא .


### KNN Classifier

Copied code from HW3

In [10]:
class Speaker:
    def __init__(self, file_path, speaker_name=None):
        self.name = speaker_name
        self.df = pd.read_json(file_path, lines=True)
        if speaker_name:
            self.df = self.df[self.df['speaker_name'].apply(self._matches_speaker_name)]
    
    def _matches_speaker_name(self, name_in_data):
        if not self.name:
            return False
        
        name_parts = self.name.split()
        data_parts = name_in_data.split()
        
        # Handle case where name has more than 4 components
        if len(name_parts) > 4 or len(data_parts) > 4:
            return False
        
        # Iterate over all parts of the provided name
        for i, part in enumerate(name_parts):
            if i >= len(data_parts):  # Dataset name has fewer parts
                return False
            
            # If it's an initial, match with any name starting with the same letter
            if re.fullmatch(rf"{re.escape(part[0])}['\"׳`]?", part):
                if not data_parts[i].startswith(part[0]):
                    return False
            # If it's a full name, ensure it matches fully
            elif part != data_parts[i]:
                return False
        return True

corpus_path = 'knesset_corpus.jsonl'
df = pd.read_json(corpus_path, lines=True)

def get_most_frequent_speakers(df):
    speaker_counts = df['speaker_name'].value_counts()
    most_frequent_speaker = speaker_counts.idxmax()
    second_most_frequent_speaker = speaker_counts.index[1] if len(speaker_counts) > 1 else None
    return most_frequent_speaker, second_most_frequent_speaker

most_frequent_speaker, second_most_frequent_speaker = get_most_frequent_speakers(df)

# Binary classification task
speaker1Bin = Speaker(corpus_path, most_frequent_speaker)
speaker2Bin = Speaker(corpus_path, second_most_frequent_speaker)

def balance_dataframes(df1, df2):
    min_len = min(len(df1), len(df2))
    return df1.sample(min_len), df2.sample(min_len)

speaker1Bin.df, speaker2Bin.df = balance_dataframes(speaker1Bin.df, speaker2Bin.df)

In [11]:
# Prepare the data for KNN classifier using word2vec embeddings
def get_sentence_embedding(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Use the sentence embeddings we created
sentence_embeddings = np.array(sentence_embeddings)
labels = np.array([1 if label == most_frequent_speaker else 0 for label in df['speaker_name']])

# Train the KNN classifier using cross-validation
classifier = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(classifier, sentence_embeddings, labels, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy scores: {scores}")
print(f"Mean accuracy: {scores.mean() * 100:.2f}%")

Cross-validated accuracy scores: [0.95883898 0.96753159 0.96834669 0.97038446 0.97065616]
Mean accuracy: 96.72%


In [12]:
print(classification_report(labels, classifier.fit(sentence_embeddings, labels).predict(sentence_embeddings)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99    107270
           1       0.70      0.12      0.20      3146

    accuracy                           0.97    110416
   macro avg       0.84      0.56      0.60    110416
weighted avg       0.97      0.97      0.96    110416



### Bert

In [13]:
original_sentences = []
with open('original_sampled_sents.txt', 'r', encoding='utf-8') as f:
    for line in f:
        original_sentences.append(line)

masked_sentences = []
with open('masked_sampled_sents.txt', 'r', encoding='utf-8') as f:
    for line in f:
        masked_sentences.append(line.replace('*', 'MASK').strip())

tokenized_masked_sentences = [sentence.split() for sentence in masked_sentences]

In [14]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained('dicta-il/dictabert')
model = AutoModelForMaskedLM.from_pretrained('dicta-il/dictabert')

model.eval()

for sentence in masked_sentences:
    tokenized_sentence = sentence.split()
    for i, token in enumerate(tokenized_sentence):
        if token == '[MASK]':
            output = model(tokenizer.encode(' '.join(tokenized_sentence[:i]), return_tensors='pt'))
            most_similar = torch.topk(output.logits[0, i, :], 1)[1]
            tokenized_sentence[i] = tokenizer.convert_ids_to_tokens(most_similar)[0]
    total_sentence = ' '.join(tokenized_sentence)
    print(total_sentence)

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


. מדבר על מקרה שאתה יכול להשעות אותו עד שיחכו .
. אל תיתן 1 , 000 שקל יותר .
הם הם שיש להם תאריך , והם הצביעו על השלבים שהם שהם לעבור עד סוף יוני , כדי שלשיחות היזומות יהיה פתרון סוחף .
. , זה היה שימוש לרעה .
אם אתם אתם לדון כאופוזיציה - - -
התקציב קוצץ מ - - ל - 15 מיליון , למען הדיוק .
אנחנו מסתכלים על זה מעט אחרת , אנחנו מקבלים מקבלים ומתאימים לו מענה .
העובדים שעובדים היום במתנ " ס , נמצא , מנהל המתנ " ס , ס היה גם קודם לכן , יכול להעיד מקבלים את שכרם ב - 1 לחודש , לא ב - 10 - לא ב - 09 - ולא ב - 20 לחודש 20 כך כמו שהיה קודם לכן , מקבלים ב - 1 לחודש את לחודש .
אף אף עדיין לא חותם .
זאת אידיאולוגיה של הפרטה הפרטה .
