<a href="https://colab.research.google.com/github/manor-s/knesset_24/blob/main/Data_Science_Course_Project_Regression_Manor_Shpriz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<font size="6">Instructions for Use.</font>

* Upload Knesset Protocol .csv file to Files section.
* Run the cell below.
* Create a new cell, from there you can run the code:
```
run_model("your_file_name.csv")
```
* The predicted number of Likud lines is printed and also returned as a value, for your convenience.
* Thank You.

<br>

<font size="6">Model Building Methodology.</font>

* Grouping by subject.
* Extracting total lines for each subject.
* Calculating sum of words for subject.
* Calculating number of lines by selected parties for each subject.
* Calculating the number of references to Likud members made by the chair (most probably invites).
* Calculating the number of references to Likud members made by formal speakers (most probably dialogue).
* Categorizing the subject (only one category was shown to be significant) and creating an index of total lines per subject.
* Calculating the appearance of important words for each subject (separately); the words were gathered by simple NLP processing and were found to have a high correlation with Likud lines.




In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import ElasticNet
import pickle
import requests


# Download the pickled model bundle and keep the regressor at module level.
url = "https://drive.google.com/uc?id=1QxEkkaGQNVERLeB6O3aj6ES32EgiX6jJ"

# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error instead of handing an error page to pickle.
response.raise_for_status()

# SECURITY NOTE: pickle.loads executes whatever code the payload contains.
# Only keep this as long as the URL above is fully trusted; otherwise switch
# to a safer serialization format or verify a checksum before loading.
metadata = pickle.loads(response.content)  # Load directly from response

reg_model = metadata["model"]


# Entry point: predict the number of Likud lines for one protocol file
def run_model(filename):
    """Read a protocol CSV, build its feature table and predict Likud lines.

    Args:
        filename: Path of the protocol ``.csv`` file, passed to
            ``pd.read_csv``.

    Returns:
        The value produced by ``calculate_likud_lines`` with the
        module-level ``reg_model``; the same value is also printed.
    """
    features = make_research_file(pd.read_csv(filename))
    likud_lines = calculate_likud_lines(features, reg_model)
    print(f"Likud lines: {likud_lines}")
    return likud_lines


# Invitations of Likud members found in the text column
def count_likud_invites(text, mk_data):
    """Count the distinct Likud MKs that ``text`` names.

    For every row of ``mk_data`` whose ``party`` is "הליכוד" the text is
    searched, in this order, for:

    1. a gendered title followed by the first (or middle) name and the
       last name;
    2. the last name on its own, but only when that last name occurs
       exactly once in ``mk_data`` (otherwise it is ambiguous).

    Each last name is counted at most once per text. Names are matched as
    literal substrings, so characters such as "." or "(" inside a name are
    not interpreted as regular-expression syntax.

    Args:
        text: One protocol line. Anything that is not a ``str`` (for
            example the NaN pandas uses for a missing cell) yields 0.
        mk_data: DataFrame with the columns ``first_name``,
            ``middle_name``, ``last_name``, ``party`` and ``gender``.

    Returns:
        The number of matched Likud members as an ``int``.
    """
    # Missing cells arrive as float NaN; searching them would raise.
    if not isinstance(text, str):
        return 0

    count = 0
    seen_mentions = set()  # Last names already counted for this text

    # How often each last name occurs, used to detect ambiguous names
    last_name_counts = mk_data["last_name"].value_counts()

    for _, mk in mk_data.iterrows():
        if mk["party"] != "הליכוד":
            continue

        last_name = mk["last_name"]
        if last_name in seen_mentions:
            continue

        # Titles depend on gender; both forms with and without the "ה" prefix
        titles = ["חבר כנסת", "חבר הכנסת"] if mk["gender"] == "male" else ["חברת כנסת", "חברת הכנסת"]

        # A missing middle name is read from CSV as NaN (which is truthy),
        # so only real, non-empty strings are used.
        given_names = [mk["first_name"]]
        middle_name = mk["middle_name"]
        if isinstance(middle_name, str) and middle_name:
            given_names.append(middle_name)

        # Full-name match takes priority
        full_name_found = any(
            f"{title} {given} {last_name}" in text
            for given in given_names
            for title in titles
        )
        if full_name_found:
            count += 1
            seen_mentions.add(last_name)
            continue

        # Fallback: an unambiguous last name. The titled forms
        # ("<title> <last name>") all contain the bare last name, so with
        # literal matching one containment test covers them.
        if last_name_counts[last_name] == 1 and last_name in text:
            count += 1
            seen_mentions.add(last_name)

    return count


# Looser references to Likud members found in the text column
def count_likud_refs(text, mk_data):
    """Count the distinct Likud MKs that ``text`` refers to, loosely.

    For every row of ``mk_data`` whose ``party`` is "הליכוד" the text is
    searched, in this order, for:

    1. a gendered title followed by the first (or middle) name and the
       last name;
    2. the last name on its own, only when that last name occurs exactly
       once in ``mk_data``;
    3. the first name on its own, only when that first name occurs exactly
       once in ``mk_data``.

    A member is counted at most once per text. Names are matched as
    literal substrings, so characters such as "." or "(" inside a name are
    not interpreted as regular-expression syntax.

    Args:
        text: One protocol line. Anything that is not a ``str`` (for
            example NaN) yields 0.
        mk_data: DataFrame with the columns ``first_name``,
            ``middle_name``, ``last_name``, ``party`` and ``gender``.

    Returns:
        The number of matched Likud members as an ``int``.
    """
    # Handle cases where text is null or NaN
    if not isinstance(text, str):
        return 0

    count = 0
    seen_mentions = set()  # Last names and first names already counted

    # Occurrence counts, used to detect ambiguous names
    last_name_counts = mk_data["last_name"].value_counts()
    first_name_counts = mk_data["first_name"].value_counts()

    for _, mk in mk_data.iterrows():
        if mk["party"] != "הליכוד":
            continue

        first_name = mk["first_name"]
        last_name = mk["last_name"]
        if last_name in seen_mentions:
            continue

        # Titles depend on gender; both forms with and without the "ה" prefix
        titles = ["חבר כנסת", "חבר הכנסת"] if mk["gender"] == "male" else ["חברת כנסת", "חברת הכנסת"]

        # A missing middle name is read from CSV as NaN (which is truthy),
        # so only real, non-empty strings are used.
        given_names = [first_name]
        middle_name = mk["middle_name"]
        if isinstance(middle_name, str) and middle_name:
            given_names.append(middle_name)

        # Full-name match takes priority
        full_name_found = any(
            f"{title} {given} {last_name}" in text
            for given in given_names
            for title in titles
        )
        if full_name_found:
            count += 1
            seen_mentions.add(last_name)
            continue

        # First fallback: an unambiguous last name. The titled forms all
        # contain the bare name, so one containment test covers them.
        if last_name_counts[last_name] == 1 and last_name in text:
            count += 1
            seen_mentions.add(last_name)
            continue

        # Final fallback: an unambiguous first name not counted before
        if (
            first_name not in seen_mentions
            and first_name_counts[first_name] == 1
            and first_name in text
        ):
            count += 1
            seen_mentions.add(first_name)  # Mark first name as matched

    return count


# Subject classifier; per the original author's note, only the one category
# with corr > 0.5 to Likud lines in the train set is kept.
def classify_subject(subject):
    """Return "Defense_Security" when the subject contains a security keyword.

    Args:
        subject: Subject text to inspect (must be a ``str``).

    Returns:
        The string "Defense_Security" if any keyword of the pattern occurs
        anywhere in ``subject``; otherwise ``None``.
    """
    security_keywords = r"חייל|צבא|בטחון|טרור|שב\"כ|עימותים|הטרור|המינהל האזרחי|אלימות"
    is_security = re.search(security_keywords, subject) is not None
    return "Defense_Security" if is_security else None


# Count word occurrences in text
def count_word_occurrences(text, word):
    """Count the non-overlapping literal occurrences of ``word`` in ``text``.

    ``word`` is escaped before searching, so it is matched character for
    character and also when it appears inside a longer word.

    Args:
        text: String to search in.
        word: Literal string to look for.

    Returns:
        The number of matches as an ``int``.
    """
    literal = re.escape(word)
    return sum(1 for _ in re.finditer(literal, text))


# Build the per-subject research (feature) table from a raw protocol file
def make_research_file(protocols):
    """Aggregate raw protocol lines into one feature row per subject.

    The MK reference table is downloaded on every call. Missing subjects
    are filled with the most common subject (or the string "null" when
    every subject is missing). The input frame itself is not modified.

    Args:
        protocols: DataFrame of protocol lines with at least the columns
            ``subject``, ``text``, ``party`` and ``role``.

    Returns:
        DataFrame with one row per subject and, in this order, the columns
        ``subject``, ``filtered_lines``, ``word_count``, one line-count
        column per selected party, ``likud_invites``, ``likud_refs``,
        the ``subject_category_*`` columns and one ``count_<word>`` column
        per tracked word. Missing values are replaced by 0.
    """
    chair_roles = ['יו"ר', 'היו"ר', 'יו"ר הכנסת']

    mk_data = pd.read_csv("https://drive.google.com/uc?id=1cfjQNsuYScMnyqyeJ6y1MtJ7rE87wm-2")

    # Work on a copy so the caller's DataFrame is left untouched.
    filtered_protocols = protocols.copy()

    if filtered_protocols["subject"].notna().any():
        # Fill gaps with the most common subject
        most_common_subject = filtered_protocols["subject"].mode().iloc[0]
        filtered_protocols["subject"] = filtered_protocols["subject"].fillna(most_common_subject)
    else:
        # If all values are missing, fill with "null"
        filtered_protocols["subject"] = "null"

    research_data = pd.DataFrame(filtered_protocols["subject"].unique(), columns=["subject"])

    # Total lines and total words per subject
    filtered_protocols["word_count"] = filtered_protocols["text"].apply(lambda x: len(str(x).split()))
    lines_per_subject = filtered_protocols.groupby("subject").size().reset_index(name="filtered_lines")
    research_data = research_data.merge(lines_per_subject, on="subject", how="left")
    words_per_subject = filtered_protocols.groupby("subject")["word_count"].sum().reset_index(name="word_count")
    research_data = research_data.merge(words_per_subject, on="subject", how="left")
    research_data["filtered_lines"] = research_data["filtered_lines"].fillna(0).astype(int)

    # Lines by party

    # List of parties with >0.5 corr with likud lines (per the author's note)
    parties = ['הציונות הדתית', 'הרשימה המשותפת', 'יש עתיד', 'ישראל ביתנו', 'כחול לבן', 'ש"ס']

    # Group data by subject and count occurrences of each party
    grouped_data = (
        filtered_protocols.groupby("subject")["party"]
        .value_counts()
        .unstack(fill_value=0)
        .reset_index()
    )

    # Ensure all parties are in the columns, even if they don't appear in the data
    for party in parties:
        if party not in grouped_data.columns:
            grouped_data[party] = 0

    # Keep only the selected parties, in a fixed column order
    grouped_data = grouped_data[["subject"] + parties]

    research_data = research_data.merge(grouped_data, on="subject", how="left")

    # Likud members invites by chairperson

    data = filtered_protocols[filtered_protocols["role"].isin(chair_roles)].copy()

    if data.empty:
        # No chair lines at all: merge an empty frame with the same structure
        grouped_data = pd.DataFrame(columns=["subject", "likud_invites"])
    else:
        # Missing text is turned into "" first so the counter always
        # receives a string.
        data["likud_invites"] = (
            data["text"].fillna("").astype(str).apply(lambda x: count_likud_invites(x, mk_data))
        )

        # Aggregate only when the column exists; doing this for the empty
        # case raised a KeyError and discarded the fallback frame.
        grouped_data = data.groupby("subject", as_index=False).agg({"likud_invites": "sum"})

    research_data = research_data.merge(grouped_data, on="subject", how="left")

    # Same normalisation as for likud_refs below
    research_data["likud_invites"] = research_data["likud_invites"].fillna(0).astype(int)

    # Mentions of Likud MKs by formal speaker (any non-chair role)

    data = filtered_protocols[
        (filtered_protocols["role"].notnull()) &
        (~filtered_protocols["role"].isin(chair_roles))
    ].copy()

    if data.empty:
        # Create an empty grouped_data DataFrame with the required structure
        grouped_data = pd.DataFrame(columns=["subject", "likud_refs"])
    else:
        # Apply the function to count Likud references
        data["likud_refs"] = data["text"].apply(lambda x: count_likud_refs(x, mk_data))

        # Group by subject and calculate Likud references
        grouped_data = (
            data.groupby("subject", as_index=False)
            .agg(likud_refs=("likud_refs", "sum"))
        )

    research_data = research_data.merge(grouped_data, on="subject", how="left")

    research_data["likud_refs"] = research_data["likud_refs"].fillna(0).astype(int)

    # Categorize subjects and compute category-specific scores based on the number of lines

    data = filtered_protocols.copy()
    data["category"] = data["subject"].apply(classify_subject)
    data["lines"] = 1

    pivot_table = data.pivot_table(
        index="subject",
        columns="category",
        values="lines",
        aggfunc="sum",
        fill_value=0
    )

    expected_categories = ["Defense_Security"]  # handle missing categories
    for category in expected_categories:
        if category not in pivot_table.columns:
            pivot_table[category] = 0

    pivot_table.columns = [f"subject_category_{col}" for col in pivot_table.columns]
    category_counts = pivot_table.reset_index()

    research_data = research_data.merge(category_counts, on="subject", how="left")

    # Count base word occurrences (every line not spoken by the chair)

    word_list = [
        "חבר", "ישראל", "מדינה", "ממשלה", "שר",
        "חשוב", "אזרח", "ערבי", "כבוד", "חייל", "משפט", "ביטחון",
        "קואליציה"
        ]

    data = filtered_protocols[~filtered_protocols["role"].isin(chair_roles)].copy()

    if data.empty:
        # Create an empty grouped_data DataFrame with the required structure
        grouped_data = pd.DataFrame(columns=["subject"] + [f"count_{word}" for word in word_list])
    else:
        # Ensure text column is non-null and of string type
        data["text"] = data["text"].fillna("").astype(str)

        # Group by subject and aggregate text
        grouped_data = data.groupby("subject")["text"].apply(" ".join).reset_index()

        # Create a column for each word and count its occurrences in the aggregated "text" column
        for word in word_list:
            grouped_data[f"count_{word}"] = grouped_data["text"].apply(lambda x: count_word_occurrences(x, word))

        # Drop the aggregated text column
        grouped_data = grouped_data.drop(columns=["text"])

    research_data = research_data.merge(grouped_data, on="subject", how="left")

    # Subjects absent from any of the merged tables get 0 for that feature
    research_data = research_data.fillna(0)

    return research_data



def calculate_likud_lines(research_data, model):
    """Predict Likud lines per subject and return their non-negative total.

    Args:
        research_data: Feature table with a ``subject`` column; every
            other column is passed to the model.
        model: Object exposing ``predict(features)``.

    Returns:
        The sum of the predictions after negative values are set to 0.
    """
    features = research_data.drop(columns=["subject"])

    # Copy into a fresh array so the clamp never touches the model's output
    predictions = np.array(model.predict(features))

    # A negative number of lines is meaningless, so clamp it to zero
    predictions[predictions < 0] = 0

    return predictions.sum()





