In [1]:
import json
import re
from datetime import datetime

import mwparserfromhell as mw
import pandas as pd
from langdetect import detect
from tqdm import tqdm

import data_cleaning

In [2]:
# Parsed JSON exports under grawitas_output, one file per wiki.
wikipedia_parsed = "../../data/json_files/grawitas_output/wikipedia_parsed.json"
wikidata_parsed = "../../data/json_files/grawitas_output/wikidata_parsed.json"
meta_parsed = "../../data/json_files/grawitas_output/meta_parsed.json"

# Each `with` block closes its file on exit, so no explicit close() is needed.
with open(wikipedia_parsed) as f:
    wikipedia_list_of_dicts = json.load(f)

with open(wikidata_parsed) as f:
    wikidata_list_of_dicts = json.load(f)

with open(meta_parsed) as f:
    meta_list_of_dicts = json.load(f)

In [3]:
# Keep only pages whose "page_text" is exactly a list; None and every other
# type are dropped by the same check.
meta_list_of_dicts = [
    page
    for page in meta_list_of_dicts
    if type(page["page_text"]) is list
]

In [4]:
# Keep only pages whose "page_text" is exactly a list; None and every other
# type are dropped by the same check.
wikipedia_list_of_dicts = [
    page
    for page in wikipedia_list_of_dicts
    if type(page["page_text"]) is list
]

In [5]:
# Keep only pages whose "page_text" is exactly a list; None and every other
# type are dropped by the same check.
wikidata_list_of_dicts = [
    page
    for page in wikidata_list_of_dicts
    if type(page["page_text"]) is list
]

In [6]:
# Build the comment records and the RFC records from the three page lists
# (Wikipedia, Wikidata, Meta — passed in that order).
# NOTE(review): get_RFC_Comment_Table lives in the project's data_cleaning
# module; the exact structure of its two return values is not visible here —
# confirm there.
wiki_comments, wiki_rfc = data_cleaning.get_RFC_Comment_Table(wikipedia_list_of_dicts, wikidata_list_of_dicts, meta_list_of_dicts)

In [7]:
# One row per entry of each page's "page_text" list, with the owning page's
# title and id carried along as extra columns.
comment_df = pd.json_normalize(
    wiki_comments,
    record_path="page_text",
    meta=["page_title", "page_id"],
)

In [8]:
# Flatten the RFC records into a DataFrame (one row per record).
rfc_df = pd.json_normalize(wiki_rfc)

## Get Closing Date Info

In [9]:
# Raw Meta RFC pages; their "page_text" is parsed as wikitext further down.
with open("../../data/json_files/rfc_pages/meta.json") as meta_file:
    meta = json.load(meta_file)

In [10]:
# Copy the "date" parameter of each Meta page's {{rfc subpage}} template into
# rfc_df['closing_date'] for the row(s) with the matching page_id.
for page in meta:
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("rfc subpage")
    )
    # Pages without the template have nothing to extract.
    if not rfc_templates:
        continue
    rfc_template = rfc_templates[0]
    # Template.has() looks for a parameter named "date". The `in` operator only
    # performs a substring test on the template's text, so it can be true when
    # no such parameter exists and make .get("date") raise ValueError.
    if not rfc_template.has("date"):
        continue
    date_value = rfc_template.get("date").value.strip()
    rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = date_value


In [11]:
# Raw Wikidata RFC pages; their "page_text" is parsed as wikitext further down.
with open("../../data/json_files/rfc_pages/wikidata.json") as wikidata_file:
    wikidata = json.load(wikidata_file)

In [12]:
# Recover each Wikidata RFC's closing timestamp from the first parameter of its
# {{discussion top}} template and store it in rfc_df['closing_date'] as an
# ISO-8601 "Z" string.
for page in wikidata:
    # Parse the page currently being iterated, not a fixed element of the list.
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("discussion top")
    )
    # Skip pages with no matching template or a template without parameters.
    if not rfc_templates or not rfc_templates[0].params:
        continue
    closing_param = rfc_templates[0].params[0]

    # Signature-style timestamp, e.g. "12:34, 5 March 2021 (UTC)".
    date = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(closing_param))
    if not date:
        continue
    try:
        input_datetime = datetime.strptime(date.group(), "%H:%M, %d %B %Y (%Z)")
    except ValueError:
        # \w+ accepts any word, so a match is not guaranteed to hold a month
        # name strptime can parse; skip such pages instead of aborting the run.
        continue

    closing_date = input_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = closing_date

In [13]:
# Raw Wikipedia RFC pages; their "page_text" is parsed as wikitext further down.
with open("../../data/json_files/rfc_pages/wikipedia.json") as wikipedia_file:
    wikipedia = json.load(wikipedia_file)

In [14]:
# Recover each Wikipedia RFC's closing timestamp from the first parameter of
# its {{closed rfc top}} template and store it in rfc_df['closing_date'] as an
# ISO-8601 "Z" string.
for page in wikipedia:
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("closed rfc top")
    )
    # Skip pages with no matching template or a template without parameters.
    if not rfc_templates or not rfc_templates[0].params:
        continue
    closing_param = rfc_templates[0].params[0]

    # Signature-style timestamp, e.g. "12:34, 5 March 2021 (UTC)".
    date = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(closing_param))
    if not date:
        continue
    try:
        input_datetime = datetime.strptime(date.group(), "%H:%M, %d %B %Y (%Z)")
    except ValueError:
        # \w+ accepts any word, so a match is not guaranteed to hold a month
        # name strptime can parse; skip such pages instead of aborting the run.
        continue

    closing_date = input_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = closing_date
    

## Convert Templates to readable text

In [15]:
# NOTE(review): the return value is discarded, so this presumably rewrites
# comment_df in place — confirm in data_cleaning.
# The recorded run below took about 49 minutes for 98,839 items.
data_cleaning.templatesToReadableText(comment_df)

100%|██████████| 98839/98839 [49:28<00:00, 33.30it/s]   


## Remove all non-English and non-text comments

In [16]:
# URL matcher used as a fallback when language detection fails.
_URL_PATTERN = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)


def detectLanguage(text):
    """Return the language code that ``detect`` reports for ``text``.

    When detection raises, text that contains a URL is labelled ``"en"``;
    any other undetectable text yields ``None``.

    Args:
        text: The comment text to classify.

    Returns:
        The value returned by ``detect(text)``, ``"en"`` for undetectable
        text containing a URL, or ``None`` otherwise (including non-string
        input such as ``None`` or NaN).
    """
    # Non-string cells cannot be detected or regex-searched.
    if not isinstance(text, str):
        return None
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit can still stop a long-running apply().
        if _URL_PATTERN.search(text):
            return "en"
        return None

In [17]:
# Matches text that is empty or made up solely of non-word characters.
non_word_pattern = re.compile(r'^\W*$')

# Matches non-empty text that contains no ASCII letter at all.
non_alpha_pattern = re.compile(r'^[^a-zA-Z]+$')

# "15px" residue, optionally followed by up to three "|name=" / "|name" fragments.
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
pattern = r"\s*15px(?:\|[a-zA-Z]+=\s*)?(?:\|[a-zA-Z]+=)?(?:\|[a-zA-Z]+\s*)?"

# regex=True must be explicit: pandas >= 2.0 defaults to a literal replacement,
# which would silently leave the residue in place.
comment_df['text'] = comment_df['text'].str.replace(pattern, '', regex=True)

# Drop rows that contain only non-alpha characters.
comment_df = comment_df[~comment_df['text'].str.contains(non_alpha_pattern)]

# Drop rows that contain only non-word characters (or nothing at all).
# .copy() makes the result an independent frame so later column assignments
# do not trigger SettingWithCopyWarning.
comment_df = comment_df[~comment_df['text'].str.contains(non_word_pattern)].copy()

  comment_df['text'] = comment_df['text'].str.replace(pattern, '')


In [18]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Label every comment with its detected language, then keep the English ones.
comment_df['language'] = comment_df['text'].progress_apply(detectLanguage)
is_english = comment_df['language'].eq('en')
comment_df = comment_df[is_english]

100%|██████████| 97690/97690 [04:52<00:00, 333.51it/s]


## Saving data to csv files

In [19]:
# DataFrame.to_csv returns None when it is given a path, so there is no CSV
# string to capture or print; just write the file.
rfc_df.to_csv('../../data/rfc.csv', index=False)


CSV String:
 None


In [20]:
#comment_df.loc[60187,'date'] = '2022-03-22T23:36:00Z'
# DataFrame.to_csv returns None when it is given a path, so there is no CSV
# string to capture or print; just write the file.
comment_df.to_csv('../../data/rfc_comments.csv', index=False)


CSV String:
 None
