In [1]:
import json
import pandas as pd
import numpy as np
import mwparserfromhell
def find_matching_closed_rfc_tags(text):
    """
    Locate the comments that open and close a closed-RfC section.

    Each comment's 'text' value is lower-cased and scanned for '{{closed rfc top'
    and '{{closed rfc bottom}}'. Top tags are pushed on a stack, bottom tags pop it,
    so nested pairs are balanced before a result is reported.

    :param text: A sequence of comment dicts, each with a string under the 'text' key.
    :return: A tuple (top_index, bottom_index) of comment indices as soon as a bottom
    tag balances every top tag seen so far; a one-element list holding the comment
    index of the top tag when exactly one top tag is still unmatched at the end;
    None otherwise.
    """
    opening_marker = '{{closed rfc top'
    closing_marker = '{{closed rfc bottom}}'
    pending_tops = []  # stack of comment indices whose top tag is still unmatched

    for comment_index, comment in enumerate(text):
        haystack = comment['text'].lower()
        # Both markers start with '{', so only those positions need checking.
        brace_at = haystack.find('{')
        while brace_at != -1:
            if haystack.startswith(opening_marker, brace_at):
                pending_tops.append(comment_index)
            elif haystack.startswith(closing_marker, brace_at) and pending_tops:
                matched_top = pending_tops.pop()
                if not pending_tops:
                    # The stack just emptied: matched_top is the outermost top tag.
                    return (matched_top, comment_index)
            brace_at = haystack.find('{', brace_at + 1)

    if len(pending_tops) == 1:
        return pending_tops
    return None

In [2]:
# Path to the parsed Wikipedia JSON file that the rest of the notebook works on.
wikipedia_parsed = "../json_files/grawitas_output/wikipedia_parsed.json"
# Read as UTF-8 explicitly: JSON interchange files are UTF-8, and relying on the
# platform default encoding can mis-decode or fail on non-ASCII text.
with open(wikipedia_parsed, encoding="utf-8") as f:
    wikipedia_list_of_dicts = json.load(f)

In [8]:
# Flatten the pages: the entries of each page's "page_text" become rows, with the
# page's "page_title" and "page_id" carried along as extra columns.
df = pd.json_normalize(
    wikipedia_list_of_dicts,
    record_path="page_text",
    meta=["page_title", "page_id"],
)

In [14]:
# Drop pages that have no text and pages that duplicate an earlier
# (page_id, page_text) pair.
#
# The kept pages are collected in a new list and written back with slice
# assignment: removing items from a list while iterating over it skips the element
# that follows every removal, so some duplicates / empty pages would survive.
# The set of seen keys is created here so the cell does not depend on (possibly
# stale) state left behind by another cell.
unique_objs = set()
pages_to_keep = []
for page in wikipedia_list_of_dicts:
    # Debug trace for one specific page id.
    if page['page_id'] == 61466044:
        print(True)
    if page['page_text'] is None:
        continue
    # Hashable identity of the page: its id plus the string form of its text.
    page_key = (page['page_id'], str(page['page_text']))
    if page_key in unique_objs:
        continue
    unique_objs.add(page_key)
    pages_to_keep.append(page)
# Replace the contents in place so the list object itself stays the same.
wikipedia_list_of_dicts[:] = pages_to_keep

In [8]:
# Removing duplicates.
# A set stores the (page_id, page_text-as-string) keys that were already seen.
unique_objs = set()

# Walk the list of json objects and collect the ones to keep. The list is rebuilt
# instead of calling pop(i) inside the loop: popping while enumerating shifts the
# remaining items left, so the element after every removal is never examined.
pages_to_keep = []
for obj in wikipedia_list_of_dicts:
    # Debug trace for one specific page id (checks the current object, not a
    # variable left over from another loop).
    if obj['page_id'] == 61466044:
        print(True)
    if obj['page_text'] is None:
        continue
    # Convert the json object to a hashable tuple.
    page_id = (obj['page_id'], str(obj['page_text']))
    if page_id in unique_objs:
        # Duplicate object: leave it out.
        continue
    unique_objs.add(page_id)
    pages_to_keep.append(obj)
# Replace the contents in place so the list object itself stays the same.
wikipedia_list_of_dicts[:] = pages_to_keep

In [None]:
"""
Checking if comments on page are in the closed rfc
"""
# For every page, keep only the comments inside the closed RfC, record one
# rf_table entry per RfC, and renumber the comments with ids that run
# sequentially across all pages.
rf_table = {}
loc = None
count = 0            # running RfC id
comment_counter = 0  # running comment id across all pages
# Pages are collected in a new list and written back at the end: calling
# remove() on the list being iterated skips the page after every removal, which
# would leave unprocessed pages (no rfc_id, untrimmed text) in the data.
pages_with_rfc = []
for page in wikipedia_list_of_dicts:
    # Debug trace for one specific page id.
    if page['page_id'] == 61466044:
        print('yes')
    # A page without any comments cannot contain an RfC.
    if not page['page_text']:
        continue
    loc = find_matching_closed_rfc_tags(page['page_text'])
    if loc is None:
        continue
    rfc_comments = page['page_text']
    if len(loc) == 1:
        # Only an opening tag was found: keep everything from it onwards.
        rfc_comments = page['page_text'][loc[0]:]
    elif len(loc) == 2:
        # Matching pair: keep from the opening comment up to, but excluding,
        # the comment that holds the closing tag.
        rfc_comments = page['page_text'][loc[0]:loc[1]]
    if not rfc_comments:
        # Opening and closing tag sit in the same comment, so the slice is
        # empty and there is nothing to index below; drop the page.
        continue
    page['page_text'] = rfc_comments

    rf_table[count] = {"discussion_title" : page['page_text'][0]['section'], "discussion_result_comment_id" : comment_counter, "discussion_input_comment" : comment_counter+1}

    # Offset between the original id of the first kept comment and its new id;
    # used to shift parent references by the same amount.
    dif = page['page_text'][0]['id'] - comment_counter
    for comment in page['page_text']:
        comment['rfc_id'] = int(count)
        comment['id'] = comment_counter
        comment['project'] = 'wikipedia'

        # parent_id 0 is left untouched; every other parent id is shifted.
        if comment['parent_id'] != 0:
            comment['parent_id'] = comment['parent_id'] - dif
        comment_counter = comment_counter + 1
    count = count + 1
    pages_with_rfc.append(page)
# Replace the contents in place so the list object itself stays the same.
wikipedia_list_of_dicts[:] = pages_with_rfc
    

In [None]:
# Rebuild the flat comment table from the processed pages: the entries of each
# page's "page_text" become rows, with "page_title" and "page_id" carried along.
df = pd.json_normalize(
    wikipedia_list_of_dicts,
    record_path="page_text",
    meta=["page_title", "page_id"],
)

In [None]:
# Turn the RfC lookup table into a DataFrame and preview it.
# NOTE(review): rf_table is a dict of dicts, so its outer keys (the RfC counters)
# become columns here; if one row per RfC is wanted, the frame needs transposing
# - confirm the intended orientation.
rfc_df = pd.DataFrame(data=rf_table)
rfc_df.head(n=5)

In [None]:
import requests

from bs4 import BeautifulSoup

def getTemplateVisualText(template):
    """
    Expand a wiki template through the English Wikipedia API and return it as plain text.

    The template is sent to the ``expandtemplates`` action, the expanded wikitext
    from the response is run through BeautifulSoup's ``html.parser`` and the
    resulting text content is returned.

    This is best-effort: if the request fails, times out, returns an unsuccessful
    status, or the response lacks the expected ``expandtemplates.wikitext`` field,
    the template itself is returned unchanged as a string.

    :param template: The template source (anything convertible with ``str``).
    :return: The expanded template as text, or ``str(template)`` on any failure.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "expandtemplates",
        "text": str(template),
        "prop": "wikitext",
        "format": "json",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # requests.get also closes its underlying session when done.
        response = requests.get(url, params=params, timeout=30)
        if not response.ok:
            return str(template)
        # API-level errors come back as JSON without the "expandtemplates" key.
        expanded_wikitext = response.json()["expandtemplates"]["wikitext"]
    except (requests.RequestException, ValueError, KeyError):
        return str(template)

    return BeautifulSoup(expanded_wikitext, "html.parser").get_text()

In [None]:
# Work on an independent copy: the template-expansion cell rewrites the 'text'
# column in place, and a plain alias would silently rewrite `df` as well.
df1 = df.copy()

In [None]:
# Cache mapping a template's source string to its expanded text.
template_dic = dict()

In [None]:
from tqdm import tqdm
# For every row: replace each template found in the text with its expanded form
# (fetched once per distinct template and cached in template_dic), then strip the
# remaining wiki markup and write the result back.
for i in tqdm(range(len(df1))):
    text = df1.at[i, 'text']
    for template in mwparserfromhell.parse(text).filter_templates():
        string = str(template)
        if string not in template_dic:
            # First time this exact template is seen: expand it and remember it.
            template_dic[string] = getTemplateVisualText(string)
        text = text.replace(string, template_dic[string])
    parsed_text = mwparserfromhell.parse(text)
    df1.at[i, 'text'] = parsed_text.strip_code(collapse=True, keep_template_params=False)

In [None]:
# Preview the first five rows after template expansion.
df1.head(n=5)

In [None]:
# Start the labelling table from an independent copy so the columns added to it
# in the following cells do not also appear on the already-displayed df1.
df_labelling = df1.copy()

In [None]:
# Build the Wikipedia URL that addresses a page by its page ID
def get_wikipedia_url(page_id):
    """Return the English Wikipedia ``curid`` URL for the given page ID."""
    return 'https://en.wikipedia.org/wiki?curid={}'.format(page_id)

# Derive a page_url column by mapping every page_id through get_wikipedia_url
page_urls = df_labelling['page_id'].map(get_wikipedia_url)
df_labelling['page_url'] = page_urls

In [None]:
# Names of the extra columns to create
new_cols = [
    'narrative',
    'question',
    'response',
    'advocacy',
    'public_interest',
    'respect',
    'explanation',
    'causal_reasoning',
    'counterarguments',
    'constructive_proposal',
]

# Create each of them on the DataFrame, filled with empty strings
for column_name in new_cols:
    df_labelling[column_name] = ''

In [None]:
# Remove the columns that are not kept in the exported table.
columns_to_drop = ["date", "section", "id", "page_id"]
df_labelling = df_labelling.drop(columns=columns_to_drop)

In [None]:
# Save the DataFrame as a CSV file.
# to_csv returns None when it is given a path, so there is no CSV string to
# print; report where the file was written instead.
output_csv_path = 'original_rfc_statements.csv'
df_labelling.to_csv(output_csv_path, index=False)
print(f'Saved {len(df_labelling)} rows to {output_csv_path}')