This notebook pulls the Knesset corpus data from the Hugging Face Hub and from Elasticsearch and saves it to Google Drive.

In [None]:
!pip install datasets
!pip install elasticsearch

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [None]:
from google.colab import drive
import pandas as pd
from datasets import load_dataset

# Mount Google Drive at /content/drive; the cells below write their CSV
# batches to paths under '/content/drive/My Drive/'.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Open the committee-sentences configuration as a stream, so rows are read
# one at a time instead of downloading the whole corpus first.
# (Original note on this cell: "2022-10-30 12:30" - meaning not evident here.)
ds = load_dataset(
    "HaifaCLGroup/knessetCorpus",
    name="committees_all_features_sentences",
    split="train",
    streaming=True,
    trust_remote_code=True,
)

# Run parameters
max_iter = 32300000   # number of streamed rows after which the loop stops
batch_size = 100000   # matching rows collected before a CSV file is written
samples = []          # matching rows waiting to be written
total_samples = 0     # matching rows seen so far

# Only these fields of a row are kept in the output
relevant_cols = [
    "knesset_number", "session_name", "protocol_date", "speaker_name",
    "sentence_text", "morphological_fields", "speaker_gender",
    "speaker_religion", "speaker_residence", "faction_general_name",
    "faction_political_orientation",
]

print("Starting the filtering process...")

for i, sample in enumerate(ds):
    # Progress report every 100,000 streamed rows
    if i % 100000 == 0:
        print(f"Processing sample {i}/{max_iter}")

    # Keep only rows whose session name is exactly this committee; the kept
    # fields follow the key order of the incoming row.
    if sample["session_name"] == "ועדת החוקה, חוק ומשפט":
        filtered_sample = {
            key: value for key, value in sample.items() if key in relevant_cols
        }
        samples.append(filtered_sample)
        total_samples += 1

    # A full batch is written to Drive and the buffer is emptied
    if len(samples) >= batch_size:
        df_filtered = pd.DataFrame(samples)
        df_filtered.to_csv(
            f'/content/drive/My Drive/filtered_knesset_corpus_batch_{i//batch_size}.csv',
            index=False,
        )
        samples = []

    # Stop once max_iter rows have been streamed
    if i + 1 >= max_iter:
        break

# Whatever is left in the buffer goes into the "final" file
if len(samples) > 0:
    df_filtered = pd.DataFrame(samples)
    df_filtered.to_csv(
        '/content/drive/My Drive/filtered_knesset_corpus_batch_final.csv',
        index=False,
    )

print(f"Finished processing. Total samples found: {total_samples}")


KeyboardInterrupt: 

## After passing 16.4 million rows the runtime crashed. We will continue pulling the data from there.

In [None]:
import time

# Load the dataset with streaming
ds = load_dataset("HaifaCLGroup/knessetCorpus", name="committees_all_features_sentences", split="train", streaming=True, trust_remote_code=True)

# Parameters
max_iter = 32800000    # absolute row index (exclusive) at which to stop
samples = []           # matching rows waiting to be written
batch_size = 100000    # matching rows collected before a CSV file is written
total_samples = 0      # matching rows found during this resumed run
start_iter = 16400000  # absolute row index to resume from (16.4 million)

# Columns to keep
relevant_cols = ["knesset_number", "session_name", "protocol_date", "speaker_name",
                 "sentence_text", "morphological_fields", "speaker_gender",
                 "speaker_religion", "speaker_residence", "faction_general_name",
                 "faction_political_orientation"]


def get_timestamped_filename(base_name, i):
    """Build a timestamped CSV path on Google Drive for one saved batch.

    Args:
        base_name: File-name prefix.
        i: Either the absolute row index at which the batch was flushed (the
            batch number is then ``i // batch_size``), or a ready-made label
            such as ``'final'`` which is used verbatim.

    Returns:
        A path of the form
        ``/content/drive/My Drive/{base_name}_batch_{label}_{timestamp}.csv``.
    """
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    # A string label such as 'final' cannot be floor-divided; the original
    # code raised TypeError on the final save because of this.
    batch_label = i // batch_size if isinstance(i, int) else i
    return f'/content/drive/My Drive/{base_name}_batch_{batch_label}_{timestamp}.csv'


print("Resuming the filtering process...")

# Actually advance the stream to the resume point. Pairing the untouched
# stream with range(start_iter, max_iter) only relabels the indices: the rows
# yielded would still be the first rows of the dataset, i.e. the ones that
# were already processed before the crash.
resumed_ds = ds.skip(start_iter)

# range() is the first zip argument so that the loop stops at max_iter without
# pulling one extra row from the stream.
for i, sample in zip(range(start_iter, max_iter), resumed_ds):
    if i % 100000 == 0:  # Print progress every 100,000 iterations
        print(f"Processing sample {i}/{max_iter}")

    # Keep only rows whose session name is exactly this committee
    if sample["session_name"] == "ועדת החוקה, חוק ומשפט":
        filtered_sample = {k: v for k, v in sample.items() if k in relevant_cols}
        samples.append(filtered_sample)
        total_samples += 1

    # Save and reset the batch
    if len(samples) >= batch_size:
        df_filtered = pd.DataFrame(samples)
        filename = get_timestamped_filename('filtered_knesset_corpus', i)
        df_filtered.to_csv(filename, index=False)
        samples = []  # Clear the batch

# Save any remaining samples
if samples:
    df_filtered = pd.DataFrame(samples)
    filename = get_timestamped_filename('filtered_knesset_corpus', 'final')
    df_filtered.to_csv(filename, index=False)

print(f"Finished processing. Total samples found: {total_samples}")


## After analyzing the data, we observed that there are no observations between 2016 and 2022. Therefore, we will attempt to pull this specific data again from the Elasticsearch engine of the Knesset corpus.

# 2016-2022 Data


In [None]:
import os
import pandas as pd
from datetime import datetime
from elasticsearch import Elasticsearch

# Initialize Elasticsearch connection.
# NOTE(review): host and credentials were hard-coded in this notebook.
# Environment variables now take precedence; the previous values remain only
# as fallbacks so the cell keeps working unchanged. Prefer setting the
# variables and removing the fallbacks before sharing the notebook.
elastic_ip = os.environ.get('KNESSET_ES_HOST', '34.0.64.248:9200')
es_username = os.environ.get('KNESSET_ES_USER', 'user')
es_password = os.environ.get('KNESSET_ES_PASSWORD', 'knesset')
es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)

# Define relevant columns and query
relevant_cols = [
    "knesset_number", "session_name", "protocol_date", "speaker_name",
    "sentence_text", "speaker_gender", "speaker_religion",
    "speaker_residence", "faction_general_name", "faction_political_orientation"
]

query = {
    "_source": relevant_cols,
    "query": {
        "bool": {
            "must": [
                {"term": {"session_name.keyword": "ועדת החוקה, חוק ומשפט"}},  # Use term query for exact match
                {"range": {"protocol_date": {"gte": "2016-01-01 00:00", "lte": "2022-10-25 23:59"}}}
            ]
        }
    },
    "sort": [
        {"protocol_date": {"order": "desc"}}
    ],
    "size": 10000
}

# Every part file goes to the same Drive folder. Previously full batches were
# written to the working directory and only the remainder to this folder,
# which did not exist and made to_csv raise OSError.
output_dir = '/content/drive/My Drive/data_mining_knesset_final'
os.makedirs(output_dir, exist_ok=True)

rows_per_file = 50000  # observations per CSV part
file_counter = 1       # number of the next part file to write
total_written = 0      # observations written to disk so far
pending_rows = []      # hits collected since the last write

# Execute the initial search query and remember its scroll ID
resp = es.search(index="all_features_sentences", body=query, scroll='2m')
scroll_id = resp['_scroll_id']

# Continue scrolling and saving in batches
while len(resp['hits']['hits']):
    # Collect plain records; one DataFrame is built per file instead of
    # growing a DataFrame with pd.concat on every page.
    pending_rows.extend(hit["_source"] for hit in resp['hits']['hits'])

    if len(pending_rows) >= rows_per_file:
        combined_df = pd.DataFrame(pending_rows)
        combined_df.to_csv(
            os.path.join(output_dir, f'knesset_committee_2016_to_2022_part_{file_counter}.csv'),
            mode='w', index=False, encoding='utf-8-sig')
        total_written += len(combined_df)
        file_counter += 1
        pending_rows = []

    # Fetch the next page, always continuing from the most recent scroll ID
    resp = es.scroll(scroll_id=scroll_id, scroll='2m')
    scroll_id = resp['_scroll_id']

# Save any remaining data that didn't reach a full part
if pending_rows:
    combined_df = pd.DataFrame(pending_rows)
    combined_df.to_csv(
        os.path.join(output_dir, f'knesset_committee_2016_to_2022_part_{file_counter}.csv'),
        mode='w', index=False, encoding='utf-8-sig')
    total_written += len(combined_df)
    file_counter += 1
    pending_rows = []

# Release the server-side scroll context instead of waiting for it to time out
es.clear_scroll(scroll_id=scroll_id)

# file_counter is the number of the *next* part, so one less was written
files_written = file_counter - 1
print(f"CSV files have been saved in batches of 50,000 observations. Total files: {files_written}")
print(f"Total observations written: {total_written}")

  es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)
  es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)


OSError: Cannot save file into a non-existent directory: '/content/drive/My Drive/data_mining_knesset_final'

### Once again, throughout our work, we found that there is no data available between 1992 and January 1999. We attempted to retrieve this data from the Elasticsearch engine, but were disappointed to find that it was also unavailable there.
#### 1992-1999 Data

In [None]:
import os
import pandas as pd
from datetime import datetime
from elasticsearch import Elasticsearch


# Initialize Elasticsearch connection.
# NOTE(review): host and credentials were hard-coded in this notebook.
# Environment variables now take precedence; the previous values remain only
# as fallbacks so the cell keeps working unchanged. Prefer setting the
# variables and removing the fallbacks before sharing the notebook.
elastic_ip = os.environ.get('KNESSET_ES_HOST', '34.0.64.248:9200')
es_username = os.environ.get('KNESSET_ES_USER', 'user')
es_password = os.environ.get('KNESSET_ES_PASSWORD', 'knesset')
es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)

  es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)
  es = Elasticsearch(f'http://{elastic_ip}', http_auth=(es_username, es_password), timeout=100)


In [None]:


import os

# This cell uses the `es` client and `pd` created by the connection cell that
# precedes it in the notebook; run that cell first.

# Define relevant columns and query
relevant_cols = [
    "knesset_number", "session_name", "protocol_date", "speaker_name",
    "sentence_text", "speaker_gender", "speaker_religion",
    "speaker_residence", "faction_general_name", "faction_political_orientation"
]

query = {
    "_source": relevant_cols,
    "query": {
        "bool": {
            "must": [
                {"term": {"session_name.keyword": "ועדת החוקה, חוק ומשפט"}},  # Use term query for exact match
                {"range": {"protocol_date": {"gte": "1992-07-15 00:00", "lte": "1999-07-19 23:59"}}}
            ]
        }
    },
    "sort": [
        {"protocol_date": {"order": "desc"}}
    ],
    "size": 10000
}

# Every part file goes to the same Drive folder. Previously full batches were
# written to the working directory and only the remainder to this folder,
# which to_csv cannot write into unless it already exists.
output_dir = '/content/drive/My Drive/data_mining_knesset_final'
os.makedirs(output_dir, exist_ok=True)

rows_per_file = 50000  # observations per CSV part
file_counter = 1       # number of the next part file to write
total_written = 0      # observations written to disk so far
pending_rows = []      # hits collected since the last write

# Execute the initial search query and remember its scroll ID
resp = es.search(index="all_features_sentences", body=query, scroll='2m')
scroll_id = resp['_scroll_id']

# Continue scrolling and saving in batches
while len(resp['hits']['hits']):
    # Collect plain records; one DataFrame is built per file instead of
    # growing a DataFrame with pd.concat on every page.
    pending_rows.extend(hit["_source"] for hit in resp['hits']['hits'])

    if len(pending_rows) >= rows_per_file:
        combined_df = pd.DataFrame(pending_rows)
        combined_df.to_csv(
            os.path.join(output_dir, f'knesset_committee_1992_to_1999_part_{file_counter}.csv'),
            mode='w', index=False, encoding='utf-8-sig')
        total_written += len(combined_df)
        file_counter += 1
        pending_rows = []

    # Fetch the next page, always continuing from the most recent scroll ID
    resp = es.scroll(scroll_id=scroll_id, scroll='2m')
    scroll_id = resp['_scroll_id']

# Save any remaining data that didn't reach a full part
if pending_rows:
    combined_df = pd.DataFrame(pending_rows)
    combined_df.to_csv(
        os.path.join(output_dir, f'knesset_committee_1992_to_1999_part_{file_counter}.csv'),
        mode='w', index=False, encoding='utf-8-sig')
    total_written += len(combined_df)
    file_counter += 1
    pending_rows = []

# Release the server-side scroll context instead of waiting for it to time out
es.clear_scroll(scroll_id=scroll_id)

# file_counter is the number of the *next* part, so one less was written;
# with no hits at all this now reports 0 files rather than 1.
files_written = file_counter - 1
print(f"CSV files have been saved in batches of 50,000 observations. Total files: {files_written}")
print(f"Total observations written: {total_written}")


CSV files have been saved in batches of 50,000 observations. Total files: 1
