In [84]:
import data_cleaning
import json
import pandas as pd
from langdetect import detect
import re
from tqdm import tqdm
import mwparserfromhell as mw

In [85]:
wikipedia_parsed = "../../data/json_files/grawitas_output/wikipedia_parsed.json"
wikidata_parsed = "../../data/json_files/grawitas_output/wikidata_parsed.json"
meta_parsed = "../../data/json_files/grawitas_output/meta_parsed.json"
with open(wikipedia_parsed) as f:
        wikipedia_list_of_dicts = json.load(f)  

with open(wikidata_parsed) as f:
        wikidata_list_of_dicts = json.load(f) 
        
with open(meta_parsed) as f:
        meta_list_of_dicts = json.load(f)  

In [86]:
wiki_comments, wiki_rfc = data_cleaning.get_RFC_Comment_Table(wikipedia_list_of_dicts, wikidata_list_of_dicts, meta_list_of_dicts)

In [87]:
comment_df = pd.json_normalize(wiki_comments, "page_text", ["page_title","page_id"])

In [88]:
rfc_df = pd.json_normalize(wiki_rfc)

In [89]:
with open("../../data/json_files/rfc_pages/meta.json") as f:
    meta = json.load(f)

In [90]:
for page in meta:
    text = page['page_text']
    wikicode = mw.parse(text)
    rfc_templates = wikicode.filter_templates(matches=lambda template: template.name.matches("rfc subpage"))
    #comment_value = rfc_templates[0].get("comment").value.strip()
    if "date" in rfc_templates[0]:
        date_value = rfc_templates[0].get("date").value.strip()
        rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = date_value


In [91]:
with open("../../data/json_files/rfc_pages/wikidata.json") as f:
    wikidata = json.load(f)

In [92]:
from datetime import datetime
for page in wikidata:
    text = wikidata[0]['page_text']
    wikicode = mw.parse(text)
    rfc_templates = wikicode.filter_templates(matches=lambda template: template.name.matches("discussion top"))
    #comment_value = rfc_templates[0].get("comment").value.strip()
    text = rfc_templates[0].params[0]
    date = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(text))
    # parse the input string into a datetime object
    input_datetime = datetime.strptime(date.group(), "%H:%M, %d %B %Y (%Z)")

    # format the datetime object in the desired output format
    output_str = input_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    if output_str:
        rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = output_str

In [93]:
with open("../../data/json_files/rfc_pages/wikipedia.json") as f:
    wikipedia = json.load(f)

In [94]:
from datetime import datetime
for page in wikipedia:
    text = page['page_text']
    wikicode = mw.parse(text)
    rfc_templates = wikicode.filter_templates(matches=lambda template: template.name.matches("closed rfc top"))
    #comment_value = rfc_templates[0].get("comment").value.strip()
    if not rfc_templates:
        continue
    if not rfc_templates[0].params:
        continue
    text = rfc_templates[0].params[0]
    date = re.search(r'\d{2}:\d{2}, \d{1,2} \w+ \d{4} \(UTC\)', str(text))
    # parse the input string into a datetime object
    if not date:
        continue
    input_datetime = datetime.strptime(date.group(), "%H:%M, %d %B %Y (%Z)")

    # format the datetime object in the desired output format
    output_str = input_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    if output_str:
        rfc_df.loc[rfc_df['page_id'] == page['page_id'], 'closing_date'] = output_str
    

In [95]:
data_cleaning.templatesToReadableText(comment_df)

100%|██████████| 102474/102474 [01:48<00:00, 940.44it/s] 


In [96]:
def detectLanguage(text):
    try:
        return detect(text)
    except:
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        url = re.findall(regex, text)
        if url:
            return "en"

In [97]:
# define regular expression pattern for matching non-word characters
non_word_pattern = re.compile(r'^\W*$')

non_alpha_pattern = re.compile(r'^[^a-zA-Z]+$')

# Define regex pattern
pattern = "\s*15px(?:\|[a-zA-Z]+=\s*)?(?:\|[a-zA-Z]+=)?(?:\|[a-zA-Z]+\s*)?"

comment_df['text'] = comment_df['text'].str.replace(pattern, '')

# filter dataframe to remove rows that contain only non-alpha characters
comment_df = comment_df[~comment_df['text'].str.contains(non_alpha_pattern)]

# filter dataframe to remove rows that contain only non-word characters
comment_df = comment_df[~comment_df['text'].str.contains(non_word_pattern)]

  comment_df['text'] = comment_df['text'].str.replace(pattern, '')


In [98]:
tqdm.pandas()
# Add a new column to the DataFrame indicating the language of the text
comment_df['language'] = comment_df['text'].progress_apply(detectLanguage)

comment_df = comment_df[comment_df['language'] == 'en']

100%|██████████| 101449/101449 [13:50<00:00, 122.13it/s]


In [99]:
gfg_csv_data = rfc_df.to_csv('../../data/rfc.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 


CSV String:
 None


In [100]:
#comment_df.loc[60187,'date'] = '2022-03-22T23:36:00Z'
gfg_csv_data = comment_df.to_csv('../../data/rfc_comments.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 


CSV String:
 None


In [83]:
comment_df = pd.read_csv('../../data/rfc_comments.csv')
def getcleanDataFrame(df):
    #remove IP-Adresses from userArray
    df = df[~df['user'].str.contains(re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'))]
    return list(df["user"].unique())

user_list = getcleanDataFrame(comment_df)

In [84]:
print(len(user_list))

9265


In [85]:
from userinformation import getUserInfoToJSON
getUserInfoToJSON(user_list, "../../data/json_files/user_info/users.json")

9265it [3:14:14,  1.26s/it]


In [77]:
# Group the dataframe by project type and rfc_id
grouped = comment_df.groupby(['project', 'rfc_id'])

# Define a function to remove the first 2 comments per different rfc_id
def remove_comments(group):
    if len(group) > 2:
        return group.iloc[2:]
    else:
        return pd.DataFrame()

# Apply the function to each group and concatenate the results
filtered = grouped.apply(remove_comments).reset_index(drop=True)

In [78]:
# group the dataframe by class and sample 65 rows from each group
df_labelling = filtered.groupby('project').apply(lambda x: x.sample(65)).reset_index(drop=True)

In [79]:
# Define a function to generate URLs based on page IDs
def get_wikipedia_url(row):
    page_id = row['page_id']
    project = row['project']
    return f'https://{project}.org/wiki?curid={page_id}'

# Apply the function to the page_id column and assign the result to a new column
df_labelling['page_url'] = df_labelling.apply(get_wikipedia_url, axis = 1)

In [80]:
# Create a list of strings to add as new columns
new_cols = [ 'disrespect','respect','explanation','causal_reasoning','narrative', 'question', 'response', 'advocacy', 'public_interest','counterarguments', 'constructive_proposal']

# Add the new columns to the DataFrame
for col in new_cols:
    df_labelling[col] = ''

In [81]:
df_labelling = df_labelling.drop(["date","section","page_id", "rfc_id", "parent_id", "language"], axis=1)

In [82]:
# saving the DataFrame as a CSV file
gfg_csv_data = df_labelling.to_csv('label_rfc_statements.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 


CSV String:
 None
