## In this notebook we take the raw data, that have been collected before. Then we tag it by the fine-tuned Hebrew-Bert.

### install and upload needed packages

In [None]:
!pip install transformers datasets

Collecting datasets
  Using cached datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Using cached datasets-2.21.0-py3-none-any.whl (527 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached multiprocess-0.70.16-py310-none-any.whl (134 kB)
[0mInstalling collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.21.0 dill-0.3.8 multiprocess-0.70.16


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from google.colab import drive

In [None]:
import os
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
folder_path = '/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22'
files = os.listdir(folder_path)


### Main_df creation
#### - we unit sentences spooke in a raw and merged them into a speech. this reduice double featurs saving memory and make the offensivess score more reliable since the machine get more context. The to reduce running time and unwanted biases (red squers phenomena) we created text only df - this one will be given as input to the LLM

In [None]:
def unite_columns_in_df(df: pd.DataFrame, group_col: str = "speaker_name", agg_col: str = "sentence_text") -> pd.DataFrame:
    df[group_col] = df[group_col].apply(lambda x: str(x).strip())
    df['Group'] = (df[group_col] != df[group_col].shift()).cumsum()
    agg_dict = {agg_col: lambda x: ''.join(map(str, x))}

    # For all other columns, keep the first value within each group
    for col in df.columns:
        if col not in [group_col, agg_col, 'Group']:
            agg_dict[col] = 'first'

    result = df.groupby(['Group', group_col], as_index=False).agg(agg_dict)
    return result.drop(columns='Group')



In [None]:
# init the first df
main_df = pd.read_csv(os.path.join(folder_path, files[0]), encoding = "utf-8")
main_df = unite_columns_in_df(main_df)
if len(files) == 52:
# iterate over the rest of the files
  for i in range(1,len(files)):
    filename = files[i]
    print(filename)
    full_path = os.path.join(folder_path, filename)
    df_cur = pd.read_csv(full_path, encoding = "utf-8")
    df_cur = unite_columns_in_df(df_cur)
    # add the current df to the entire df
    main_df = pd.concat([main_df, df_cur], axis=0)

main_df.reset_index(drop=True, inplace=True)
main_df

filtered_knesset_corpus_batch_27.csv
filtered_knesset_corpus_batch_28.csv
filtered_knesset_corpus_batch_37.csv
filtered_knesset_corpus_batch_50.csv
filtered_knesset_corpus_batch_56.csv
filtered_knesset_corpus_batch_62.csv
filtered_knesset_corpus_batch_68.csv
filtered_knesset_corpus_batch_73.csv
filtered_knesset_corpus_batch_84.csv
filtered_knesset_corpus_batch_94.csv
filtered_knesset_corpus_batch_103.csv
filtered_knesset_corpus_batch_114.csv
filtered_knesset_corpus_batch_128.csv
filtered_knesset_corpus_batch_142.csv
filtered_knesset_corpus_batch_160.csv
filtered_knesset_corpus_batch_152.csv
filtered_knesset_corpus_batch_173_20240811-173243.csv
filtered_knesset_corpus_batch_175_20240811-174255.csv
filtered_knesset_corpus_batch_188_20240811-193837.csv
filtered_knesset_corpus_batch_191_20240811-201327.csv
filtered_knesset_corpus_batch_192_20240811-202249.csv
filtered_knesset_corpus_batch_278_20240812-085802.csv
filtered_knesset_corpus_batch_292_20240812-110056.csv
filtered_knesset_corpus_

Unnamed: 0,speaker_name,sentence_text,knesset_number,session_name,protocol_date,morphological_fields,speaker_gender,speaker_religion,speaker_residence,faction_general_name,faction_political_orientation
0,אורית אדטו,"שוב, זה רק בגלל שיש כמה מתקנים חדשים.אם מורידי...",15,"ועדת החוקה, חוק ומשפט",2002-07-01 09:00,"{'id': ['1', '2', '3', '4', '5', '6---7', '6',...",,,,,
1,אופיר פינס-פז,"אני מודיע לך, שבבתי המעצר של המשטרה, אני לא מא...",15,"ועדת החוקה, חוק ומשפט",2002-07-01 09:00,"{'id': ['1', '2', '3---4', '3', '4', '5', '6--...",male,יהודי,רעננה,העבודה,שמאל
2,מרדכי ורטהיימר,"נכון, אם אנחנו מצרפים חייבי שב""ס והמשטרה, אנחנ...",15,"ועדת החוקה, חוק ומשפט",2002-07-01 09:00,"{'id': ['1', '2', '3', '4', '5', '6', '7', '8-...",,,,,
3,אופיר פינס-פז,"תודה רבה.אגף התקציבים, אומרים פה כולם שחסרים א...",15,"ועדת החוקה, חוק ומשפט",2002-07-01 09:00,"{'id': ['1', '2', '3'], 'form': ['תודה', 'רבה'...",male,יהודי,רעננה,העבודה,שמאל
4,עמית שפייזמן,"עד כמה שידוע לי מהנתונים שיש אצלי, יש כיום תקן...",15,"ועדת החוקה, חוק ומשפט",2002-07-01 09:00,"{'id': ['1', '2', '3---4', '3', '4', '5---6', ...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...
1626218,<ענת הר אבן,"""תקנות המסים (גביה) (תשלום לצד שלישי שהומצא לו...",20,"ועדת החוקה, חוק ומשפט",2017-05-16 09:00,,,,,,
1626219,אלעזר שטרן,"אני רוצה לשאול לגבי התנאי של 2,500 צווי עיקול....",20,"ועדת החוקה, חוק ומשפט",2017-05-16 09:00,,male,יהודי,מצפה הושעיה,יש עתיד,מרכז
1626220,<ענת הר אבן,"לא, אין פה הבדל בין ההוצאה לפועל למרכז לגביית ...",20,"ועדת החוקה, חוק ומשפט",2017-05-16 09:00,,,,,,
1626221,אלעזר שטרן,"בסדר גמור, אבל למה התנאי של 2,500 צווים?",20,"ועדת החוקה, חוק ומשפט",2017-05-16 09:00,,male,יהודי,מצפה הושעיה,יש עתיד,מרכז


In [None]:
main_df = main_df.drop_duplicates()

(1166075, 11)

#### int the main df we had more then 2.6M setnces - after the speeches merging we hve 1.16M speeches. significant running time and memory save

In [None]:
for i in range(1, 5):
    k = len(main_df) // 4
    h = (i - 1) * k
    if i == 4:
        j = len(main_df)  # Ensure the last part captures all remaining rows
    else:
        j = i * k
    main_df_part = main_df[h:j]
    main_df_part.to_csv(f'/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/main_df_unttaged_part_{i}.csv', index=False)
    print(f"Successful save of part {i}")
    print(main_df_part.shape)


Successful save of part 1
(291518, 11)
Successful save of part 2
(291518, 11)
Successful save of part 3
(291518, 11)
Successful save of part 4
(291521, 11)


In [None]:
text_only_main_df = main_df[['sentence_text']]
text_only_main_df.to_csv('/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/text_only_main_df.csv', index=False)

In [None]:
text_only_main_df

Unnamed: 0,sentence_text
0,"שוב, זה רק בגלל שיש כמה מתקנים חדשים.אם מורידי..."
1,"אני מודיע לך, שבבתי המעצר של המשטרה, אני לא מא..."
2,"נכון, אם אנחנו מצרפים חייבי שב""ס והמשטרה, אנחנ..."
3,"תודה רבה.אגף התקציבים, אומרים פה כולם שחסרים א..."
4,"עד כמה שידוע לי מהנתונים שיש אצלי, יש כיום תקן..."
...,...
1626218,"""תקנות המסים (גביה) (תשלום לצד שלישי שהומצא לו..."
1626219,"אני רוצה לשאול לגבי התנאי של 2,500 צווי עיקול...."
1626220,"לא, אין פה הבדל בין ההוצאה לפועל למרכז לגביית ..."
1626221,"בסדר גמור, אבל למה התנאי של 2,500 צווים?"


##pull the model and run it over the text only df

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("/content/drive/My Drive/fine-tuned-hebert")
tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/fine-tuned-hebert")

In [None]:
def predict_large_dataframe(df, model, tokenizer, batch_size=32):
    model.eval()
    predictions = []

    # Move model to appropriate device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Process in batches
    for i in range(0, len(df), batch_size):
        if (i%50000) == 0:
          print(f"i = {i}")
        batch_texts = df['sentence_text'].iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_labels = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(predicted_labels)

    return predictions


In [None]:
results = predict_large_dataframe(text_only_main_df, model, tokenizer)
text_only_main_df['predicted_label'] = results
text_only_main_df.to_csv('/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/text_only_main_df_with_predicted_labels.csv', index=False)

i = 0
i = 100000
i = 200000
i = 300000
i = 400000
i = 500000
i = 600000
i = 700000
i = 800000
i = 900000
i = 1000000
i = 1100000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_only_main_df['predicted_label'] = results


In [None]:
main_df['predicted_label'] = results


for i in range(1, 5):
    k = len(main_df) // 4
    h = (i - 1) * k
    if i == 4:
        j = len(main_df)  # Ensure the last part captures all remaining rows
    else:
        j = i * k
    main_df_part = main_df[h:j]
    main_df_part.to_csv(f'/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/main_df_ttaged_part_{i}.csv', index=False)
    print(f"Successful save of part {i}")
    print(main_df_part.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df['predicted_label'] = results


Successful save of part 1
(291518, 12)
Successful save of part 2
(291518, 12)
Successful save of part 3
(291518, 12)
Successful save of part 4
(291521, 12)


In [None]:
text_only_main_df['predicted_label'] = results
text_only_main_df.to_csv('/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/text_only_main_df_with_predicted_labels.csv', index=False)

for i in range(1, 5):
    k = len(text_only_main_df) // 4
    h = (i - 1) * k
    if i == 4:
        j = len(text_only_main_df)  # Ensure the last part captures all remaining rows
    else:
        j = i * k

    text_only_main_df_part = text_only_main_df[h:j]
    text_only_main_df_part.to_csv(f'/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22/text_only_main_df_tagged_part_{i}.csv', index=False)


In [None]:
print("if this is printed, code ran till end you can turn off the gpu")

if this is printed, code ran till end you can turn off the gpu


# When finish working on the Data and it's well-parsed we can now transfer it to a relevnt directory and analys it

In [None]:
# move the tagged files into a new folder
input_folder = "/content/drive/MyDrive/data_mining_knesset_final/unlabeled_99to22"
output_folder = "/content/drive/MyDrive/data_mining_knesset_final/tagged_99_22"
os.makedirs(output_folder, exist_ok=True)

# Iterate over all files in the input folder and copy them to the output folder
for filename in os.listdir(input_folder):
  if "ttaged" in filename:
      input_file_path = os.path.join(input_folder, filename)
      output_file_path = os.path.join(output_folder, filename)

      if os.path.isfile(input_file_path):  # Ensure it's a file and not a directory
          shutil.copy(input_file_path, output_file_path)
          print(f"Copied {input_file_path} to {output_file_path}")

print("All files have been copied successfully.")