# Configuration

In [1]:
import pyspark
import numpy as np
from pyspark.sql import SQLContext, SparkSession
import sys, os
import pandas as pd
import re
import nltk
import warnings
# Root folder of the local Knesset data dump. It can be overridden with the
# KNESSET_DATA_ROOT environment variable so the notebook is not tied to one
# machine; when the variable is unset the original hard-coded path is used.
PROJECT_ROOT = os.environ.get('KNESSET_DATA_ROOT', r'C:\Users\lotem\knesset_data')
# CSV with one row per committee session (read into `meta`).
META_SOURCE = os.path.join(PROJECT_ROOT, 'data_committees_kns_committeesession_kns_committeesession.csv')
# Folder named 'meeting_protocols_parts' under the project root.
# NOTE(review): presumably holds the parsed protocol-part CSV files - confirm.
COMMITTEESS_ROOT = os.path.join(PROJECT_ROOT, 'meeting_protocols_parts')
# CSV with one row per person (read into `ppldf`).
PEOPLE_SOURCE = os.path.join(PROJECT_ROOT, 'data_members_kns_person_kns_person.csv')
# NOTE(review): looks like an output location for word counts - confirm with the cells that use it.
WORD_COUNT_TARGET = os.path.join(PROJECT_ROOT, 'word_count')

# Load metadata

In [2]:
# Load the session metadata and keep only the sessions that have a parsed
# "parts" file, i.e. a non-null 'parts_parsed_filename'.
meta = pd.read_csv(META_SOURCE).dropna(subset=['parts_parsed_filename'])
meta.head(2)

Unnamed: 0,CommitteeSessionID,Number,KnessetNum,TypeID,TypeDesc,CommitteeID,Location,SessionUrl,BroadcastUrl,StartDate,...,download_filename,download_filesize,parts_crc32c,parts_filesize,parts_parsed_filename,text_crc32c,text_filesize,text_parsed_filename,topics,committee_name
9,64550,4.0,16,161,פתוחה,24,"חדר הוועדה, באגף הוועדות (קדמה), קומה 1, חדר 1820",http://main.knesset.gov.il/Activity/committees...,,2003-03-17 09:30:00,...,files/23/3/6/367838.DOC,64814.0,4MXPig==,96793.0,files/6/4/64550.csv,lSu4Ug==,97645.0,files/6/4/64550.txt,"[""שדה התעופה בהרצליה - כפר שמריהו""]",הפנים ואיכות הסביבה
10,64551,2.0,16,161,פתוחה,24,"חדר הוועדה, באגף הוועדות (קדמה), קומה 1, חדר 1820",http://main.knesset.gov.il/Activity/committees...,,2003-03-11 10:30:00,...,files/23/3/6/367839.DOC,68247.0,o5iVPg==,115462.0,files/6/4/64551.csv,eHpjug==,116107.0,files/6/4/64551.txt,"[""הזרמת מי שופכין לים תיכון ""]",הפנים ואיכות הסביבה


In [3]:
# Load the people table; its PersonID / FirstName / LastName columns are used
# later to match speaker names to person ids.
ppldf = pd.read_csv(PEOPLE_SOURCE)
ppldf.head()

Unnamed: 0,PersonID,LastName,FirstName,GenderID,GenderDesc,Email,IsCurrent,LastUpdatedDate
0,30299,נתונים,אין,251,זכר,,True,2000-01-01 00:00:00
1,1026,אברהם-בלילא,רוחמה,250,נקבה,,False,2015-03-20 12:03:08
2,1029,רצון,מיכאל,251,זכר,,False,2015-03-20 12:03:08
3,1030,והבה,מגלי,251,זכר,,False,2015-03-20 12:03:08
4,1031,אדרי,יעקב,251,זכר,,False,2015-03-20 12:03:08


# Inspect the first file

In [4]:
def get_full_url(metadf, root_folder):
    """Build the path(s) of the parsed protocol-parts file(s) for session metadata.

    Parameters
    ----------
    metadf : pandas.DataFrame or row-like
        Either a frame of sessions or a single row (for example a
        ``pandas.Series``). Both must provide a ``'parts_parsed_filename'`` entry.
    root_folder : str
        Folder that contains the ``meeting_protocols_parts`` directory.

    Returns
    -------
    list of str or str
        One path per row when ``metadf`` is a DataFrame, otherwise a single path.
    """
    parts_dir = os.path.join(root_folder, 'meeting_protocols_parts')
    # isinstance (not an exact type check) so DataFrame subclasses are handled
    # as frames instead of falling through to the single-row branch.
    if isinstance(metadf, pd.DataFrame):
        # Read the column directly; iterrows() would build a Series per row
        # only to look up this one value.
        return [os.path.join(parts_dir, name) for name in metadf['parts_parsed_filename']]
    return os.path.join(parts_dir, metadf['parts_parsed_filename'])
    

def remove_special_character(strg):
    """Strip punctuation from a string and turn tabs/newlines into spaces.

    The characters ``. , " ( ) ; : ?`` are deleted; every tab or newline is
    replaced by a single space.

    Parameters
    ----------
    strg : object
        Value to clean. Anything that is not a string (for example a NaN cell
        from pandas) is replaced by an empty string.

    Returns
    -------
    str
        The cleaned string, or ``''`` for non-string input.
    """
    # Raw-string character classes: the previous non-raw patterns relied on
    # invalid string escapes such as '\.' and '\(', which newer Python
    # versions warn about. The set of matched characters is unchanged.
    chars_to_filter = r'[.,"();:?]'
    chars_to_white = r'[\t\n]'
    # isinstance so that str subclasses are cleaned rather than blanked.
    if not isinstance(strg, str):
        return ''
    without_punctuation = re.sub(chars_to_filter, '', strg)
    return re.sub(chars_to_white, ' ', without_punctuation)

def remove_person_title(strng):
    """Return ``strng`` without a leading "היור " prefix, if it has one.

    Only a single occurrence at the very start of the string is removed;
    strings that do not start with the prefix are returned unchanged.
    """
    title_prefix = "היור "
    return strng[len(title_prefix):] if strng.startswith(title_prefix) else strng

def remove_calls(df, **kwargs):
    """Drop every row whose ``header`` equals 'קריאה'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``header`` column.
    **kwargs
        Forwarded unchanged to ``DataFrame.drop``.

    Returns
    -------
    Whatever ``DataFrame.drop`` returns for the given keyword arguments
    (a new frame by default).
    """
    is_call = df['header'] == 'קריאה'
    call_labels = df.index[is_call]
    return df.drop(call_labels, **kwargs)
    
def fix_header(df, meta):
    """Remove the leading preamble rows and append session metadata rows.

    The row labelled 0 is dropped, and the row labelled 1 is dropped as well
    when its ``header`` is 'נכחו'. Four rows are then appended whose
    ``header`` values are ``_topics``, ``_committee_name``, ``_number`` and
    ``_KnessetNum`` and whose ``body`` values come from ``meta``.

    Parameters
    ----------
    df : pandas.DataFrame
        Protocol parts with ``header`` and ``body`` columns. It is not modified.
    meta : mapping-like
        Must provide ``'topics'``, ``'committee_name'``, ``'Number'`` and
        ``'KnessetNum'`` (for example one row of the metadata table).

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index.
    """
    # Build new frames instead of dropping in place, so re-running a cell on
    # the same input gives the same result. errors='ignore' tolerates a label
    # that an earlier step (e.g. remove_calls) has already removed.
    df = df.drop(index=0, errors='ignore')
    if 1 in df.index and df.loc[1, 'header'] == 'נכחו':
        df = df.drop(index=1)

    others = pd.DataFrame({
        'header': ['_topics', '_committee_name', '_number', '_KnessetNum'],
        'body': [meta['topics'], meta['committee_name'], meta['Number'], meta['KnessetNum']],
    })

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    combined = pd.concat([df, others], sort=False)
    return combined.reset_index(drop=True)

def fix_speakers_names(df):
    """Unify near-identical speaker spellings and record the speakers seen.

    Every distinct ``header`` value that does not start with ``'_'`` is
    treated as a speaker name. Names are visited in order of first
    appearance: a name whose edit distance to every already accepted name is
    greater than 3 becomes a new canonical name; otherwise it is mapped to the
    closest accepted name. The ``header`` column is rewritten with the
    canonical names, and one ``_speakers`` row per original spelling is
    appended (``body`` holds the spelling).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string ``header`` and ``body`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame. The index is not reset, so the appended rows carry their
        own 0..k-1 labels.
    """
    # Spellings within this edit distance of an accepted name are merged into it.
    max_merge_distance = 3

    is_speaker_row = ~df['header'].str.startswith('_')
    all_names = df['header'][is_speaker_row].unique()

    base_names = []
    names_dict = {}
    for nm in all_names:
        dists = np.array([nltk.edit_distance(nm, x) for x in base_names])
        # np.all of an empty array is True, so the first name is always accepted.
        if np.all(dists > max_merge_distance):
            base_names.append(nm)
            names_dict[nm] = nm
        else:
            names_dict[nm] = base_names[np.argmin(dists)]

    # Previously the mapped column was computed and then thrown away, so the
    # names were never actually fixed; assign it back (on a new frame).
    df = df.assign(header=df['header'].map(lambda x: names_dict.get(x, x)))

    speakers = pd.DataFrame({'header': '_speakers', 'body': all_names})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([df, speakers])

def add_speaker_id_to_comittee_session(df, ppldf):
    """Add a ``PersonID`` column by matching speaker names to the people table.

    Candidate speaker names are the ``body`` values of the rows whose
    ``header`` is ``'_speakers'``. Each one is compared with
    ``FirstName + ' ' + LastName`` of every person: an exact match wins
    (first matching row); otherwise the closest name by edit distance is used
    when that distance is below 3. Rows whose ``header`` has no matched
    speaker get an empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Session frame with ``header`` and ``body`` columns. It is not modified.
    ppldf : pandas.DataFrame
        People table with ``PersonID``, ``FirstName`` and ``LastName``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra ``PersonID`` column.
    """
    # A fuzzy match is accepted only when the edit distance is strictly below this.
    max_distance_exclusive = 3

    # Build the full names locally rather than adding a 'conc' column to the
    # caller's people table. Missing name parts become '' so the edit-distance
    # call always receives strings.
    full_names = (ppldf['FirstName'].fillna('').astype(str) + ' '
                  + ppldf['LastName'].fillna('').astype(str)).values
    person_ids = ppldf['PersonID'].values

    # setdefault keeps the first person for a repeated name, which matches
    # taking element [0] of the matching rows.
    exact_lookup = {}
    for name, person_id in zip(full_names, person_ids):
        exact_lookup.setdefault(name, person_id)

    speaker2id = {}
    for speaker in df['body'][df['header'] == '_speakers'].values:
        if speaker in exact_lookup:
            speaker2id[speaker] = exact_lookup[speaker]
        elif len(full_names):  # argmin is undefined for an empty people table
            dists = np.array([nltk.edit_distance(speaker, x) for x in full_names])
            closest = int(np.argmin(dists))
            if dists[closest] < max_distance_exclusive:
                speaker2id[speaker] = person_ids[closest]

    return df.assign(PersonID=df['header'].map(lambda x: speaker2id.get(x, '')))
    
def load_and_preprocess(meta_row, root_folder, peopleDF):
    """Load one session's protocol-parts CSV and run the full clean-up pipeline.

    Steps, in order: read the CSV located by ``get_full_url``; clean every
    cell with ``remove_special_character`` then ``remove_person_title``; drop
    call rows (``remove_calls``); fix the preamble and append metadata rows
    (``fix_header``); unify speaker names (``fix_speakers_names``); attach
    person ids (``add_speaker_id_to_comittee_session``).

    Parameters
    ----------
    meta_row : row-like
        One row of the session metadata table.
    root_folder : str
        Project root folder passed on to ``get_full_url``.
    peopleDF : pandas.DataFrame
        People table used for the person-id lookup.

    Returns
    -------
    pandas.DataFrame
        The preprocessed session frame.
    """
    filepath = get_full_url(meta_row, root_folder)
    fileContent = pd.read_csv(filepath)

    def _clean_cell(value):
        # Same order as two successive element-wise passes: punctuation is
        # stripped first, then the title prefix is removed.
        return remove_person_title(remove_special_character(value))

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. Look the method up on the class (not the instance) so a
    # CSV column that happens to be called 'map' cannot shadow it.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    fileContent = elementwise(fileContent, _clean_cell)

    fileContent = remove_calls(fileContent)
    fileContent = fix_header(fileContent, meta_row)
    fileContent = fix_speakers_names(fileContent)
    fileContent = add_speaker_id_to_comittee_session(fileContent, peopleDF)
    return fileContent

In [8]:
# Preprocess the selected sessions (currently only the first one), save each
# result under <PROJECT_ROOT>/speakers/<parts_parsed_filename>, and collect
# the distinct PersonID values per CommitteeSessionID in `res`.
file = meta.head(1)
res = {}
display('ccc')  # NOTE(review): looks like a leftover debug marker - confirm it can be removed
for f in file.iterrows():
    session = f[1]  # iterrows() yields (index, row) pairs
    cs = session['CommitteeSessionID']
    try:
        tres = load_and_preprocess(session, root_folder=PROJECT_ROOT, peopleDF=ppldf)
        display(tres)
        fulladdress = os.path.join(PROJECT_ROOT, 'speakers', session['parts_parsed_filename'])
        # exist_ok=True replaces the separate exists() check, whose makedirs
        # call used a misspelled variable name and therefore always failed.
        os.makedirs(os.path.dirname(fulladdress), exist_ok=True)
        tres.to_csv(fulladdress)
        res[cs] = tres['PersonID'].unique()
    except Exception as exc:
        # Best effort: record the session as having no speakers and keep
        # going, but report what went wrong instead of hiding it.
        res[cs] = []
        warnings.warn(f'error in {cs}: {exc!r}')
print('finished')

'ccc'

Unnamed: 0,header,body,PersonID
0,יורי שטרן,שלום לכולם לא תיארתי לי עד כמה הנושא שעל סדר ה...,430
1,אהוד יתום,אני מבקש לומר משהו אם הנושא נמצא בדיון בבגץ – ...,1032
2,יורי שטרן,יש הפרדת רשויות ובין היתר היא מתבטאת בכך ששום ...,430
3,אהוד יתום,מה יקבע בעניין הזה החלטה שלנו או החלטה של בגץ,1032
4,אילן שלגי,אני אשיב על כך אתחיל בתשובה לשאלתו של חבר-הכנס...,1046
5,לאה נס,כמה שנים שדה התעופה קיים,1041
6,שמואל בן טובים,הוא קיים למעלה מ-50 שנה,
7,אילן שלגי,הוא קיים מאז 1950 אני מבקש להקריא רק מכתבים ...,1046
8,יורי שטרן,למה הכוונה,430
9,אילן שלגי,צריך לשמוע יותר פרטים על כך מראשי הרשויות מתוך...,1046


finished


In [None]:
import pickle

# Persist the session -> speaker-ids mapping. The context manager makes sure
# the file is flushed and closed even if pickling fails.
with open(os.path.join(PROJECT_ROOT, 'speakers.pkl'), "wb") as fh:
    pickle.dump(res, fh)

# Last expression of the cell, so the mapping is displayed.
res