Check whether the main fields of all JSON files have a consistent structure:

In [151]:
import os, json
import pandas as pd

path_to_json = '../webscraping/json_exports/'
# os.listdir() returns entries in arbitrary order; sort the file names so that
# every cell iterating over json_files sees the sessions in the same,
# reproducible order.
json_files = sorted(
    pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json'))

# Presumably the keys expected at the top level of every session file and
# inside its "sessionInfo" object.
# NOTE(review): both lists are only declared here; this cell does not validate
# the files against them.
main_fields = ["sessionInfo", "mashrooh"]
sessionInfo_fields = [
    "term",
    "ejlasie",
    "present_mps",  # was misspelled "preset_mps"; the session dataframe uses "present_mps"
    "session",
    "date"]

# Find the session with the largest number of entries under "mashrooh".
max_orders = 0
max_orders_session = 0
for json_file in json_files:
    # The context manager closes each file as soon as it has been parsed,
    # instead of leaking one open handle per session file.
    with open(path_to_json + json_file, 'r') as json_handle:
        # json.load returns the JSON object as a dictionary
        session = json.load(json_handle)

    # Strict '>' keeps the first session encountered when several tie.
    if len(session["mashrooh"]) > max_orders:
        max_orders = len(session["mashrooh"])
        max_orders_session = session["sessionInfo"]["session"]

print("By now, a session (session " + str(max_orders_session) +") might have up to " + str(max_orders) + " number of orders.")

By now, a session (session 363) might have up to 19 number of orders.


Transform the session information of every JSON file into the dataframe `session_information_df`:

In [161]:
import pandas as pd

# Column layout of the per-session table. The keys are also the keys picked
# out of each file's "sessionInfo" object.
session_fields = {
    "date": [],
    "term" : [],
    "ejlasie": [],
    "present_mps": [],
    "session": [],
    "url": []}

# Collect one "sessionInfo" dict per file and build the dataframe in a single
# call. Growing a dataframe with .loc[len(df)] copies it on every insert.
session_records = []
for json_file in json_files:
    # The context manager closes the file right after parsing
    with open(path_to_json + json_file, 'r') as json_handle:
        # json.load returns the JSON object as a dictionary
        dict_data = json.load(json_handle)

    session_records.append(dict_data["sessionInfo"])

# columns= fixes the column order; keys missing from a record become NaN and
# keys not listed in session_fields are dropped.
session_information_df = pd.DataFrame(session_records, columns=list(session_fields))

# Extract year, month and day from the date string by character position
# (the printed dates look like "1397-02-19")
session_information_df['date_year'] = session_information_df["date"].str.slice(0, 4)
session_information_df['date_month'] = session_information_df["date"].str.slice(5, 7)
session_information_df['date_day'] = session_information_df["date"].str.slice(8, 10)

session_information_df = session_information_df.sort_values(["date", "session"])
print(session_information_df.head(40))

session_information_df.to_csv("export_dataframes/session_information.csv")

          date term ejlasie present_mps session  \
0   1397-02-19   10       2         194     204   
1   1397-12-19   10       3         196     300   
2   1397-12-21   10       3         196     302   
3   1397-12-26   10       3         194     303   
4   1398-01-18   10       3         203     304   
5   1398-01-19   10       3         198     305   
6   1398-01-20   10       3         208     306   
7   1398-01-20   10       3         197     307   
8   1398-01-25   10       3         219     308   
9   1398-01-26   10       3         195     309   
10  1398-01-27   10       3         205     310   
11  1398-01-27   10       4         194     311   
12  1398-02-02   10       3         225     312   
13  1398-02-03   10       3         194     314   
14  1398-02-16   10       3         249     316   
15  1398-02-24   10       3         231     321   
16  1398-02-24   10       3         195     322   
17  1398-02-29   10       3         204     323   
18  1398-02-30   10       3    

Extract the orders of every session and save them in the dataframe `session_order_df`.

In [162]:
import re

# Column layout of the per-order table.
session_order_fields = {
    "session_id": [],
    "order_id": [],
    "order_title": []}

# Pattern used to strip a leading run of digits followed by CR LF (the
# numbering left over from the scraped HTML) from order titles.
pattern = re.compile("^[0-9]*\r\n")

# Collect all rows first and build the dataframe once; inserting rows one by
# one with .loc copies the dataframe on every insert.
session_order_rows = []
for json_file in json_files:
    # The context manager closes the file right after parsing
    with open(path_to_json + json_file, 'r') as json_handle:
        # json.load returns the JSON object as a dictionary
        dict_data = json.load(json_handle)

    for order in dict_data["mashrooh"]:

        # Remove the numbering prefix from the order title.
        order_title = pattern.sub('', dict_data["mashrooh"][order]["title"])

        session_order_rows.append([
            dict_data["sessionInfo"]["session"],
            # Drop the first five characters of the key; presumably an
            # "order" prefix in front of the order number (TODO confirm).
            order[5:],
            order_title])

session_order_df = pd.DataFrame(session_order_rows, columns=list(session_order_fields))

print(session_order_df.head(40))
session_order_df.to_csv("export_dataframes/session_orders.csv")

   session_id order_id                                        order_title
0         204        1                     اعلام رسميت جلسه و قرائت دستور
1         204        2                           تلاوت آياتي از قرآن مجيد
2         204        3  بيانات رئيس محترم مجلس شوراي اسلامي درخصوص خرو...
3         204        4  ارجاع طرح تشكيل وزارت ميراث فرهنگي، گردشگري و ...
4         204        5  ادامه رسيدگي به لايحه اصلاح قانون مبارزه با پو...
5         204        6  قرائت بيانيه نمايندگان مجلس شوراي اسلامي درخصو...
6         204        7  ناطقين جلسه آقايان: احمد همتي، سيدجواد حسيني‌ك...
7         204        8  تذكرات كتبي نمايندگان مجلس به مسئولان اجرايي كشور
8         204        9  تذكر آيين‌نامه‌اي و اخطار قانون اساسي نمايندگا...
9         204       10            اعلام ختم جلسه و تاريخ تشكيل جلسه آينده
10        300        1                     اعلام رسميت جلسه و قرائت دستور
11        300        2                           تلاوت آياتي از قرآن مجيد
12        300        3  بيانات رئيس مح

Extract the discussions of every order and save them in the dataframe `order_discussions_df`.

In [166]:
# Column layout of the per-discussion table.
order_discussions_fields = {
    "session": [],
    "order_id": [],
    "discussion_id": [],
    "orator_name": [],
    "discussion_type": [],
    "discussion_content": []}

# Collect all rows first and build the dataframe once; inserting rows one by
# one with .loc copies the dataframe on every insert.
order_discussion_rows = []
for json_file in json_files:
    # The context manager closes the file right after parsing
    with open(path_to_json + json_file, 'r') as json_handle:
        # json.load returns the JSON object as a dictionary
        dict_data = json.load(json_handle)

    for order in dict_data["mashrooh"]:
        # discussion_indx is the zero-based position of the discussion inside its order
        for discussion_indx, discussion in enumerate(dict_data["mashrooh"][order]["discussions"]):
            order_discussion_rows.append([
                dict_data["sessionInfo"]["session"],
                # Drop the first five characters of the key; presumably an
                # "order" prefix in front of the order number (TODO confirm).
                order[5:],
                discussion_indx,
                discussion[0],
                # Was discussion[0], which copied the orator name into the
                # discussion_type column.
                # NOTE(review): assumes each entry is [orator, type, content]; confirm.
                discussion[1],
                discussion[2]])

order_discussions_df = pd.DataFrame(
    order_discussion_rows, columns=list(order_discussions_fields))

print(order_discussions_df["orator_name"].head(80))
# order_discussions_df.to_csv("export_dataframes/order_discussions.csv")

0              علی اردشیرلاریجانی
1     سیدامیرحسین قاضی زاده هاشمی
2              علی اردشیرلاریجانی
3            محمد اسماعیل خورشیدی
4              علی اردشیرلاریجانی
                 ...             
75             علی اردشیرلاریجانی
76                   غلامرضا کاتب
77                   غلامرضا کاتب
78             علی اردشیرلاریجانی
79    سیدامیرحسین قاضی زاده هاشمی
Name: orator_name, Length: 80, dtype: object
