In [1]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.sparse import vstack

# Seed both the stdlib and the NumPy global generators from one constant.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### Step 1 - Determining Most Common Speakers

In [2]:
class Speaker:
    """Rows of a JSON-lines corpus, optionally restricted to one speaker.

    Attributes set in ``__init__``:
        name -- the ``speaker_name`` argument exactly as given (may be None).
        df   -- the corpus as a DataFrame; reduced to matching rows when
                ``name`` is truthy.
    """

    def __init__(self, file_path, speaker_name=None):
        """Load ``file_path`` and keep only rows matching ``speaker_name``.

        Args:
            file_path: Path handed to ``pd.read_json(..., lines=True)``.
            speaker_name: Name to filter on; when falsy the whole corpus is kept.
        """
        self.name = speaker_name
        self.df = pd.read_json(file_path, lines=True)
        if speaker_name:
            is_match = self.df['speaker_name'].apply(self._matches_speaker_name)
            self.df = self.df[is_match]

    def _matches_speaker_name(self, name_in_data):
        """Return True when ``name_in_data`` is compatible with ``self.name``.

        Both names are split on whitespace and compared part by part, in order:
        a part of ``self.name`` that is a single character (optionally followed
        by a quote-like mark) is treated as an initial and matches any data part
        starting with that character; every other part must be equal.
        Extra trailing parts in ``name_in_data`` are ignored.

        Args:
            name_in_data: Value from the ``speaker_name`` column.

        Returns:
            bool: False when ``self.name`` is falsy, when ``name_in_data`` is not
            a string (e.g. NaN/None from a missing value), when either name has
            more than four parts, or when any part fails to match.
        """
        # Missing values arrive as NaN/None and have no .split(); treat them as
        # "no match" instead of raising in the middle of DataFrame.apply.
        if not self.name or not isinstance(name_in_data, str):
            return False

        name_parts = self.name.split()
        data_parts = name_in_data.split()

        # Handle case where name has more than 4 components
        if len(name_parts) > 4 or len(data_parts) > 4:
            return False

        # Iterate over all parts of the provided name
        for i, part in enumerate(name_parts):
            if i >= len(data_parts):  # Dataset name has fewer parts
                return False

            # If it's an initial, match with any name starting with the same letter
            if re.fullmatch(rf"{re.escape(part[0])}['\"׳`]?", part):
                if not data_parts[i].startswith(part[0]):
                    return False
            # If it's a full name, ensure it matches fully
            elif part != data_parts[i]:
                return False
        return True

# Path of the JSON-lines corpus; the full frame loaded here is what the
# speaker ranking below is computed from.
corpus_path = 'knesset_corpus.jsonl'
df = pd.read_json(path_or_buf=corpus_path, lines=True)

#### Binary

In [None]:
def get_most_frequent_speakers(df):
    """Return the two most frequent values of the ``speaker_name`` column.

    Args:
        df: DataFrame with a ``speaker_name`` column.

    Returns:
        tuple: ``(most_frequent, second_most_frequent)``. A position is None
        when the column has fewer distinct names than that rank requires, so
        an empty frame yields ``(None, None)`` instead of raising.
    """
    # value_counts() is sorted by descending count, so rank == index position.
    ranked_names = df['speaker_name'].value_counts().index
    most_frequent_speaker = ranked_names[0] if len(ranked_names) > 0 else None
    second_most_frequent_speaker = ranked_names[1] if len(ranked_names) > 1 else None
    return most_frequent_speaker, second_most_frequent_speaker

# Names of the two speakers with the most rows in the corpus.
top_two_speakers = get_most_frequent_speakers(df)
most_frequent_speaker, second_most_frequent_speaker = top_two_speakers

# Binary classification task
speaker1Bin, speaker2Bin = (
    Speaker(corpus_path, speaker_name)
    for speaker_name in (most_frequent_speaker, second_most_frequent_speaker)
)

print(f"Speaker 1: {most_frequent_speaker} ({len(speaker1Bin.df)} speeches)")
print(f"Speaker 2: {second_most_frequent_speaker} ({len(speaker2Bin.df)} speeches)")

#### Multi-Class

In [None]:
# Multi-class classification task
speaker1Mult = Speaker(corpus_path, most_frequent_speaker)
speaker2Mult = Speaker(corpus_path, second_most_frequent_speaker)

# setting 'other' class: every corpus row that belongs to neither top speaker.
# Speaker's matcher accepts more than exact equality (initials, names with extra
# trailing parts), so excluding by exact name alone would leave rows that are
# labelled both as a top speaker and as 'other'. Exclude by row index as well;
# the indexes line up because every Speaker reads the same file.
speakerOtherMult = Speaker(corpus_path)
top_speaker_rows = speaker1Mult.df.index.union(speaker2Mult.df.index)
has_top_name = speakerOtherMult.df['speaker_name'].isin(
    [most_frequent_speaker, second_most_frequent_speaker]
)
is_top_row = speakerOtherMult.df.index.isin(top_speaker_rows)
speaker3_df = speakerOtherMult.df[~(has_top_name | is_top_row)]
speakerOtherMult.df = speaker3_df

print(f"Speaker 1: {most_frequent_speaker} ({len(speaker1Mult.df)} speeches)")
print(f"Speaker 2: {second_most_frequent_speaker} ({len(speaker2Mult.df)} speeches)")
print(f"Speaker 3: Other ({len(speakerOtherMult.df)} speeches)")

### Step 2 - Balance Dataframes

#### Binary

In [5]:
def balance_dataframes(df1, df2, random_state=None):
    """Randomly down-sample two frames to the length of the shorter one.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        random_state: Forwarded to ``DataFrame.sample`` for both frames. The
            default (None) keeps the previous behaviour of drawing from the
            global NumPy random state; pass a seed to make the call repeatable
            regardless of how often the cell is re-run.

    Returns:
        tuple: ``(df1_sample, df2_sample)``, each with ``min(len(df1), len(df2))``
        rows in sampled (shuffled) order.
    """
    min_len = min(len(df1), len(df2))
    return (
        df1.sample(min_len, random_state=random_state),
        df2.sample(min_len, random_state=random_state),
    )

# Replace both binary frames with equally sized random samples of themselves.
balanced_bin_frames = balance_dataframes(speaker1Bin.df, speaker2Bin.df)
speaker1Bin.df, speaker2Bin.df = balanced_bin_frames

#### Multi-Class

In [6]:
def balance_three_dataframes(df1, df2, df3, random_state=None):
    """Randomly down-sample three frames to the length of the shortest one.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.
        df3: Third DataFrame.
        random_state: Forwarded to ``DataFrame.sample`` for every frame. The
            default (None) keeps the previous behaviour of drawing from the
            global NumPy random state; pass a seed for a repeatable result.

    Returns:
        tuple: three samples, each with ``min(len(df1), len(df2), len(df3))``
        rows in sampled (shuffled) order.
    """
    min_len = min(len(df1), len(df2), len(df3))
    return tuple(
        frame.sample(min_len, random_state=random_state)
        for frame in (df1, df2, df3)
    )

# Replace the three multi-class frames with equally sized random samples.
balanced_mult_frames = balance_three_dataframes(
    speaker1Mult.df,
    speaker2Mult.df,
    speakerOtherMult.df,
)
speaker1Mult.df, speaker2Mult.df, speakerOtherMult.df = balanced_mult_frames

### Step 3 - Vectorization

In [7]:
# Sentence texts of both binary speakers plus the 'other' class, stacked into a
# single Series (despite the _df suffix) with a fresh 0..n-1 index.
sentence_columns = [
    speaker.df['sentence_text']
    for speaker in (speaker1Bin, speaker2Bin, speakerOtherMult)
]
combined_sentences_df = pd.concat(sentence_columns, ignore_index=True)

#### Bag of Words

In [None]:
# Learn one unigram vocabulary from all combined sentences so every matrix
# produced with this vectorizer shares the same columns.
vectorizer_bow = CountVectorizer().fit(combined_sentences_df)

In [None]:
def create_bag_of_words(df, vectorizer=None):
    """Transform the ``sentence_text`` column with an already fitted vectorizer.

    Args:
        df: DataFrame with a ``sentence_text`` column.
        vectorizer: Fitted object exposing ``transform`` and
            ``get_feature_names_out``. Required in practice; the None default
            is kept only for signature compatibility.

    Returns:
        tuple: ``(matrix, feature_names)`` as returned by
        ``vectorizer.transform`` and ``vectorizer.get_feature_names_out``.

    Raises:
        ValueError: If ``vectorizer`` is None. Fitting a private vectorizer here
            would give each frame its own column layout, so fail loudly instead.
    """
    if vectorizer is None:
        raise ValueError("create_bag_of_words requires a fitted vectorizer; got None")
    bow_matrix = vectorizer.transform(df['sentence_text'])
    return bow_matrix, vectorizer.get_feature_names_out()

# Create Bag of Words: one (matrix, feature names) pair per speaker frame,
# all produced by the same fitted vectorizer.
(
    (bow_speaker1_bin, feature_names1_bin),
    (bow_speaker2_bin, feature_names2_bin),
    (bow_speaker_other, feature_names_other),
) = (
    create_bag_of_words(speaker.df, vectorizer_bow)
    for speaker in (speaker1Bin, speaker2Bin, speakerOtherMult)
)

# Print shapes of the matrices
print(
    "Bag of Words:",
    f"Speaker 1: {bow_speaker1_bin.shape}",
    f"Speaker 2: {bow_speaker2_bin.shape}",
    f"Other: {bow_speaker_other.shape}\n",
    sep="\n",
)

#### Custom Features

In [None]:
# Learn one vocabulary of 2- to 5-word n-grams from all combined sentences so
# every matrix produced with this vectorizer shares the same columns.
vectorizer_custom = CountVectorizer(ngram_range=(2, 5)).fit(combined_sentences_df)

In [None]:
def create_ngram_bag(df, vectorizer=None):
    """Transform the ``sentence_text`` column with an already fitted n-gram vectorizer.

    Args:
        df: DataFrame with a ``sentence_text`` column.
        vectorizer: Fitted object exposing ``transform`` and
            ``get_feature_names_out``. Required in practice; the None default
            is kept only for signature compatibility.

    Returns:
        tuple: ``(matrix, feature_names)`` as returned by
        ``vectorizer.transform`` and ``vectorizer.get_feature_names_out``.

    Raises:
        ValueError: If ``vectorizer`` is None. Fitting a private vectorizer here
            would give each frame its own column layout, so fail loudly instead.
    """
    if vectorizer is None:
        raise ValueError("create_ngram_bag requires a fitted vectorizer; got None")
    bow_matrix = vectorizer.transform(df['sentence_text'])
    return bow_matrix, vectorizer.get_feature_names_out()

# Create ngram bags: one (matrix, feature names) pair per speaker frame,
# all produced by the same fitted n-gram vectorizer.
(
    (custom_speaker1_bin, custom_feature_names1_bin),
    (custom_speaker2_bin, custom_feature_names2_bin),
    (custom_speaker_other_bin, custom_feature_names_other_bin),
) = (
    create_ngram_bag(speaker.df, vectorizer_custom)
    for speaker in (speaker1Bin, speaker2Bin, speakerOtherMult)
)

# Print shapes of the matrices
print(
    "Custom Vector:",
    f"Speaker 1: {custom_speaker1_bin.shape}",
    f"Speaker 2: {custom_speaker2_bin.shape}",
    f"Other: {custom_speaker_other_bin.shape}\n",
    sep="\n",
)

### Step 4 - Training

#### Bag of Words

In [12]:
def train_BoW(bow_matrix, labels, classifier_type='logistic'):
    """Cross-validate a classifier on a feature matrix, then fit it on all rows.

    Args:
        bow_matrix: Feature matrix, one row per sample.
        labels: Class label for each row of ``bow_matrix``.
        classifier_type: ``'logistic'`` for ``LogisticRegression(max_iter=1000,
            random_state=42)`` or ``'knn'`` for ``KNeighborsClassifier(n_neighbors=5)``.

    Returns:
        The classifier after fitting on the complete ``bow_matrix``.

    Raises:
        ValueError: If ``classifier_type`` is neither ``'logistic'`` nor ``'knn'``.
    """
    if classifier_type == 'logistic':
        classifier = LogisticRegression(max_iter=1000, random_state=42)
    elif classifier_type == 'knn':
        classifier = KNeighborsClassifier(n_neighbors=5)
    else:
        # Without this branch `classifier` would be unbound further down.
        raise ValueError(
            f"Unknown classifier_type {classifier_type!r}; expected 'logistic' or 'knn'"
        )

    # 5-fold cross-validation
    scores = cross_val_score(classifier, bow_matrix, labels, cv=5, scoring='accuracy')

    print(f"{classifier_type} model 5-fold Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%")

    # cross_val_score works on clones, so the returned model is fitted here.
    classifier.fit(bow_matrix, labels)

    return classifier

# Binary classification
combined_bow_matrix_bin = vstack([bow_speaker1_bin, bow_speaker2_bin])
# np.full with an int keeps the labels integer-typed, matching the multi-class
# labels below (np.ones would silently turn the whole array into floats).
labels_bin = np.concatenate([
    np.full(bow_speaker1_bin.shape[0], 1),      # Label speaker1 as 1
    np.full(bow_speaker2_bin.shape[0], 2)       # Label speaker2 as 2
])

# Multi-class classification
# NOTE(review): the speaker 1/2 rows here are the binary-task matrices; the
# separately balanced speaker1Mult/speaker2Mult frames are not vectorized in
# the visible cells - confirm this is intended.
combined_bow_matrix_mult = vstack([bow_speaker1_bin, bow_speaker2_bin, bow_speaker_other])
labels_mult = np.concatenate([
    np.full(bow_speaker1_bin.shape[0], 1),      # Label speaker1 as 1
    np.full(bow_speaker2_bin.shape[0], 2),      # Label speaker2 as 2
    np.full(bow_speaker_other.shape[0], 3)      # Label speakerOther as 3
])

In [None]:
print("Binary Classification BoW:")
# Logistic regression first, then k-NN, on the same binary matrix and labels.
model_logistic_bin, model_knn_bin = (
    train_BoW(combined_bow_matrix_bin, labels_bin, model_kind)
    for model_kind in ('logistic', 'knn')
)

In [None]:
print("\nMulti-class Classification BoW:")
# Logistic regression first, then k-NN, on the same three-class matrix and labels.
model_logistic_mult, model_knn_mult = (
    train_BoW(combined_bow_matrix_mult, labels_mult, model_kind)
    for model_kind in ('logistic', 'knn')
)

#### Custom Features

In [15]:
def train_ngram_bag(ngram_bag_matrix, labels, classifier_type='logistic'):
    """Cross-validate a classifier on an n-gram matrix, then fit it on all rows.

    Args:
        ngram_bag_matrix: Feature matrix, one row per sample.
        labels: Class label for each row of ``ngram_bag_matrix``.
        classifier_type: ``'logistic'`` for ``LogisticRegression(max_iter=1000,
            random_state=42)`` or ``'knn'`` for ``KNeighborsClassifier(n_neighbors=5)``.

    Returns:
        The classifier after fitting on the complete ``ngram_bag_matrix``.

    Raises:
        ValueError: If ``classifier_type`` is neither ``'logistic'`` nor ``'knn'``.
    """
    if classifier_type == 'logistic':
        classifier = LogisticRegression(max_iter=1000, random_state=42)
    elif classifier_type == 'knn':
        classifier = KNeighborsClassifier(n_neighbors=5)
    else:
        # Without this branch `classifier` would be unbound further down.
        raise ValueError(
            f"Unknown classifier_type {classifier_type!r}; expected 'logistic' or 'knn'"
        )

    # 5-fold cross-validation
    scores = cross_val_score(classifier, ngram_bag_matrix, labels, cv=5, scoring='accuracy')

    print(f"{classifier_type} model 5-fold Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%")

    # cross_val_score works on clones, so the returned model is fitted here.
    classifier.fit(ngram_bag_matrix, labels)

    return classifier

# Binary classification
combined_ngram_bag_matrix_bin = vstack([custom_speaker1_bin, custom_speaker2_bin])
# Label counts are taken from the n-gram matrices actually being stacked (not
# from the BoW matrices), and np.full with an int keeps the labels integer-typed.
custom_labels_bin = np.concatenate([
    np.full(custom_speaker1_bin.shape[0], 1),   # Label speaker1 as 1
    np.full(custom_speaker2_bin.shape[0], 2)    # Label speaker2 as 2
])

# Multi-class classification
combined_ngram_bag_matrix_mult = vstack([custom_speaker1_bin, custom_speaker2_bin, custom_speaker_other_bin])
custom_labels_mult = np.concatenate([
    np.full(custom_speaker1_bin.shape[0], 1),       # Label speaker1 as 1
    np.full(custom_speaker2_bin.shape[0], 2),       # Label speaker2 as 2
    np.full(custom_speaker_other_bin.shape[0], 3)   # Label speakerOther as 3
])

In [None]:
print("Binary Classification Custom Vector:")
# Logistic regression first, then k-NN, on the same binary n-gram matrix and labels.
model_logistic_bin_custom, model_knn_bin_custom = (
    train_ngram_bag(combined_ngram_bag_matrix_bin, custom_labels_bin, model_kind)
    for model_kind in ('logistic', 'knn')
)

In [None]:
print("\nMulti-class Classification Custom Vector:")
# Logistic regression first, then k-NN, on the same three-class n-gram matrix and labels.
model_logistic_mult_custom, model_knn_mult_custom = (
    train_ngram_bag(combined_ngram_bag_matrix_mult, custom_labels_mult, model_kind)
    for model_kind in ('logistic', 'knn')
)

### Step 5 - Testing

In [18]:
# File paths
# NOTE(review): output_file_path is defined but not written to in the visible cells.
file_path = 'knesset_sentences.txt'
output_file_path = 'classified_sentences.txt'

# Numeric class label -> name printed for that class.
label_to_class = dict(zip((1, 2, 3), ("first", "second", "other")))

# Load sentences: one list item per line, line endings kept.
with open(file_path, 'r', encoding='utf-8') as f:
    sentences = list(f)

#### Bag of Words

In [None]:
# Vectorize the test sentences with the fitted BoW vocabulary and classify them
# with the multi-class logistic model.
test_bow = vectorizer_bow.transform(sentences)
logistic_predictions_mult = model_logistic_mult.predict(test_bow)

for i, sentence in enumerate(sentences):
    # Lines keep their trailing newline except possibly the last one in the
    # file; normalise so the class name always lands on its own line.
    sentence_text = sentence.rstrip("\n")
    print(f"Sentence {i+1}: {sentence_text}\n{label_to_class[logistic_predictions_mult[i]]}")

#### Custom Features

In [None]:
# Vectorize the test sentences with the fitted n-gram vocabulary and classify
# them with the multi-class logistic model trained on n-gram features.
test_custom = vectorizer_custom.transform(sentences)
logistic_predictions_mult_custom = model_logistic_mult_custom.predict(test_custom)

for i, sentence in enumerate(sentences):
    # Lines keep their trailing newline except possibly the last one in the
    # file; normalise so the class name always lands on its own line.
    sentence_text = sentence.rstrip("\n")
    print(f"Sentence {i+1}: {sentence_text}\n{label_to_class[logistic_predictions_mult_custom[i]]}")