In [26]:
import json
import re
from datetime import datetime

import mwparserfromhell as mw
import pandas as pd
from langdetect import detect
from tqdm import tqdm

import data_cleaning

In [27]:
# Paths to the parsed page dumps, one per wiki.
wikipedia_parsed = "../../data/json_files/grawitas_output/wikipedia_parsed.json"
wikidata_parsed = "../../data/json_files/grawitas_output/wikidata_parsed.json"
meta_parsed = "../../data/json_files/grawitas_output/meta_parsed.json"

# Each `with` block closes its file on exit, so no explicit close() is needed.
# The encoding is pinned to UTF-8 so loading does not depend on the platform's
# default locale encoding.
with open(wikipedia_parsed, encoding="utf-8") as f:
    wikipedia_list_of_dicts = json.load(f)

with open(wikidata_parsed, encoding="utf-8") as f:
    wikidata_list_of_dicts = json.load(f)

with open(meta_parsed, encoding="utf-8") as f:
    meta_list_of_dicts = json.load(f)

In [28]:
# Keep only Meta pages whose "page_text" is present and stored as a list.
meta_list_of_dicts = [
    entry
    for entry in meta_list_of_dicts
    if entry["page_text"] is not None and type(entry["page_text"]) is list
]

In [29]:
# Keep only Wikipedia pages whose "page_text" is present and stored as a list.
wikipedia_list_of_dicts = [
    entry
    for entry in wikipedia_list_of_dicts
    if entry["page_text"] is not None and type(entry["page_text"]) is list
]

In [30]:
# Keep only Wikidata pages whose "page_text" is present and stored as a list.
wikidata_list_of_dicts = [
    entry
    for entry in wikidata_list_of_dicts
    if entry["page_text"] is not None and type(entry["page_text"]) is list
]

In [31]:
# Combine the three filtered page lists into comment-level and RFC-level records.
wiki_comments, wiki_rfc = data_cleaning.get_RFC_Comment_Table(
    wikipedia_list_of_dicts,
    wikidata_list_of_dicts,
    meta_list_of_dicts,
)

In [32]:
# One row per entry of each page's "page_text" list, with the owning page's
# title and id carried along as extra columns.
comment_df = pd.json_normalize(
    wiki_comments,
    record_path="page_text",
    meta=["page_title", "page_id"],
)

In [33]:
# Flatten the RFC-level records into a DataFrame; later cells match its rows
# on the 'page_id' column to attach a 'closing_date'.
rfc_df = pd.json_normalize(data=wiki_rfc)

## Get Closing Date Info

In [34]:
# Raw Meta RFC pages; UTF-8 is explicit so loading does not depend on the
# platform's default locale encoding.
with open("../../data/json_files/rfc_pages/meta.json", encoding="utf-8") as f:
    meta = json.load(f)

In [35]:
# Copy the "date" parameter of each Meta page's first {{rfc subpage}} template
# into rfc_df['closing_date'] for the row(s) sharing that page's page_id.
for page in meta:
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("rfc subpage")
    )
    # Pages without the template previously crashed on rfc_templates[0].
    if not rfc_templates:
        continue
    rfc_template = rfc_templates[0]
    # Template.has() looks a parameter up by name. `"date" in template` is a
    # substring test on the template's wikitext, so it could be true with no
    # such parameter and make .get("date") raise ValueError.
    if rfc_template.has("date"):
        date_value = rfc_template.get("date").value.strip()
        rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = date_value


In [36]:
# Raw Wikidata RFC pages; UTF-8 is explicit so loading does not depend on the
# platform's default locale encoding.
with open("../../data/json_files/rfc_pages/wikidata.json", encoding="utf-8") as f:
    wikidata = json.load(f)

In [37]:
# For each Wikidata RFC page, read the closing statement from the first
# parameter of a {{discussion top}} template, pull the signature timestamp out
# of it, and store it in rfc_df['closing_date'] as an ISO-8601 UTC string.
for page in wikidata:
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("discussion top")
    )
    if not rfc_templates:
        continue

    if rfc_templates[0].params:
        closing_note = rfc_templates[0].params[0]
    elif len(rfc_templates) == 2 and rfc_templates[1].params:
        closing_note = rfc_templates[1].params[0]
    else:
        # No closing statement to read. Previously, two parameterless templates
        # fell through with the whole page text still in hand, so the first
        # signature anywhere on the page was recorded as the closing date.
        continue

    stamp = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(closing_note))
    if not stamp:
        continue

    try:
        # Parse the signature timestamp, e.g. "16:08, 1 February 2013 (UTC)".
        closed_at = datetime.strptime(stamp.group(), "%H:%M, %d %B %Y (%Z)")
    except ValueError:
        # The regex accepts any word as the month; skip text that only looks
        # like a timestamp instead of aborting the whole loop.
        continue

    rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = (
        closed_at.strftime("%Y-%m-%dT%H:%M:%SZ")
    )

["{{discussion top|'''I belive that we have reach now a strong concensus: pages of all namespaces exept User: are allowed into Wikidata''' [[User:Tpt|Tpt]] ([[User talk:Tpt|talk]]) 16:08, 1 February 2013 (UTC) }}"]
[]
['{{Discussion top|Archived, see [[Wikidata:Project chat]] for a summary of what has come out of this. [[User:Ajraddatz|Ajraddatz]] <small>([[User Talk:Ajraddatz|Talk]])</small> 18:42, 22 January 2013 (UTC)}}']
['{{discussion top|Duplicate of [[Wikidata:Requests for comment/Items for Wikimedia projects besides Wikipedia]]. Please discuss this there. [[User:Legoktm|Legoktm]] ([[User talk:Legoktm|talk]]) 20:42, 1 May 2013 (UTC)}}']
["{{discussion top| This RFC has generated no comment, and the proposer also did not receive a response to a similar query/suggestion on Project Chat ([[Wikidata:Project_chat/Archive/2013/02|February archive]]). Clearly the community has had little response to the suggestion, which may have been viewed as outside the scope of Wikidata, and/or not

In [38]:
# Raw Wikipedia RFC pages; UTF-8 is explicit so loading does not depend on the
# platform's default locale encoding.
with open("../../data/json_files/rfc_pages/wikipedia.json", encoding="utf-8") as f:
    wikipedia = json.load(f)

In [39]:
# For each Wikipedia RFC page, read the first parameter of the first
# {{closed rfc top}} template, pull the signature timestamp out of it, and
# store it in rfc_df['closing_date'] as an ISO-8601 UTC string.
for page in wikipedia:
    wikicode = mw.parse(page['page_text'])
    rfc_templates = wikicode.filter_templates(
        matches=lambda template: template.name.matches("closed rfc top")
    )
    # Skip pages with no closing template or one without any parameters.
    if not rfc_templates or not rfc_templates[0].params:
        continue

    closing_note = rfc_templates[0].params[0]
    stamp = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(closing_note))
    if not stamp:
        continue

    try:
        # Parse the signature timestamp, e.g. "20:42, 1 May 2013 (UTC)".
        closed_at = datetime.strptime(stamp.group(), "%H:%M, %d %B %Y (%Z)")
    except ValueError:
        # The regex accepts any word as the month; skip text that only looks
        # like a timestamp instead of aborting the whole loop.
        continue

    rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = (
        closed_at.strftime("%Y-%m-%dT%H:%M:%SZ")
    )
    

## Convert Templates to readable text

In [40]:
# NOTE(review): the return value is discarded, so this presumably rewrites
# comment_df in place — confirm against data_cleaning.templatesToReadableText.
data_cleaning.templatesToReadableText(comment_df)

100%|██████████| 98839/98839 [01:34<00:00, 1048.89it/s]


## Remove all non-English comments and non-text comments

In [41]:
# URL matcher used as a fallback when language detection fails; compiled once
# rather than on every failed detection.
_URL_REGEX = re.compile(
    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
)


def detectLanguage(text):
    """Return the language code detected for ``text``.

    Args:
        text: The comment text to classify.

    Returns:
        Whatever ``langdetect.detect`` returns for ``text``. If detection
        raises, returns ``"en"`` when ``text`` contains a URL and ``None``
        otherwise.
    """
    try:
        return detect(text)
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # can still stop the long-running apply over the whole frame.
        if _URL_REGEX.search(text):
            # Text on which detection failed but which carries a link is
            # treated as English.
            return "en"
        return None

In [42]:
# define regular expression pattern for matching non-word characters
non_word_pattern = re.compile(r'^\W*$')

non_alpha_pattern = re.compile(r'^[^a-zA-Z]+$')

# Define regex pattern
pattern = "\s*15px(?:\|[a-zA-Z]+=\s*)?(?:\|[a-zA-Z]+=)?(?:\|[a-zA-Z]+\s*)?"

comment_df['text'] = comment_df['text'].str.replace(pattern, '')

# filter dataframe to remove rows that contain only non-alpha characters
comment_df = comment_df[~comment_df['text'].str.contains(non_alpha_pattern)]

# filter dataframe to remove rows that contain only non-word characters
comment_df = comment_df[~comment_df['text'].str.contains(non_word_pattern)]

  comment_df['text'] = comment_df['text'].str.replace(pattern, '')


In [43]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Tag every comment with its detected language. assign() builds a new frame
# instead of writing a column into a filtered slice, which avoids
# SettingWithCopyWarning.
comment_df = comment_df.assign(
    language=comment_df['text'].progress_apply(detectLanguage)
)

# Keep only comments detected as English; copy so later edits act on an
# independent frame.
comment_df = comment_df[comment_df['language'] == 'en'].copy()

100%|██████████| 97690/97690 [13:18<00:00, 122.28it/s]


## Saving data to csv files

In [44]:
# Write the RFC table to disk. to_csv() returns None when given a path, so
# there is no CSV string to capture or print.
rfc_df.to_csv('../../data/rfc.csv', index=False)


CSV String:
 None


In [45]:
# Manual date correction for the comment with index label 50754.
# TODO: record why this row needs a hand-set date.
# Guarded because .loc assignment with a missing label would append a new row
# that is empty apart from 'date'.
if 50754 in comment_df.index:
    comment_df.loc[50754, 'date'] = '2022-03-22T23:36:00Z'

# Write the comment table to disk. to_csv() returns None when given a path, so
# there is no CSV string to capture or print.
comment_df.to_csv('../../data/rfc_comments.csv', index=False)


CSV String:
 None
