In [1]:
import data_cleaning
import json
import pandas as pd
from langdetect import detect
import re
from tqdm import tqdm

In [2]:
wikipedia_parsed = "../../data/json_files/grawitas_output/wikipedia_parsed.json"
wikidata_parsed = "../../data/json_files/grawitas_output/wikidata_parsed.json"
meta_parsed = "../../data/json_files/grawitas_output/meta_parsed.json"
with open(wikipedia_parsed) as f:
        wikipedia_list_of_dicts = json.load(f)  

with open(wikidata_parsed) as f:
        wikidata_list_of_dicts = json.load(f) 
        
with open(meta_parsed) as f:
        meta_list_of_dicts = json.load(f)  

In [3]:
wiki_comments, wiki_rfc = data_cleaning.get_RFC_Comment_Table(wikipedia_list_of_dicts, wikidata_list_of_dicts, meta_list_of_dicts)

In [4]:
comment_df = pd.json_normalize(wiki_comments, "page_text", ["page_title","page_id"])

In [5]:
comment_df.head()

Unnamed: 0,text,user,date,section,parent_id,id,project,rfc_id,page_title,page_id
0,"{{closed rfc top|1=This RfC is now redundent, ...",Cinderella157,2019-10-11T09:04:00Z,RfC about use of YouTube video as primary sour...,0,0,wikipedia,0,Talk:Noah Kraft,55480246
1,\nShould a YouTube video [https://www.youtube....,BC1278,2019-09-05T17:34:00Z,RfC about use of YouTube video as primary sour...,0,1,wikipedia,0,Talk:Noah Kraft,55480246
2,"!Vote\n\n* I will not vote because of my COI, ...",BC1278,2019-09-10T15:07:00Z,RfC about use of YouTube video as primary sour...,0,2,wikipedia,0,Talk:Noah Kraft,55480246
3,*:{{ping|BC1278}} I'm still seeing the YouTube...,Buffs,2019-09-10T15:42:00Z,RfC about use of YouTube video as primary sour...,0,3,wikipedia,0,Talk:Noah Kraft,55480246
4,{{ping|Buffs}} You are correct. I have amended...,BC1278,2019-09-10T17:35:00Z,RfC about use of YouTube video as primary sour...,3,4,wikipedia,0,Talk:Noah Kraft,55480246


In [6]:
rfc_df = pd.json_normalize(wiki_rfc)

In [7]:
data_cleaning.templatesToReadableText(comment_df)

100%|██████████| 102474/102474 [00:52<00:00, 1953.99it/s]


In [8]:
def detectLanguage(text):
    try:
        return detect(text)
    except:
        regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        url = re.findall(regex, text)
        if url:
            return "en"

In [9]:
# define regular expression pattern for matching non-word characters
non_word_pattern = re.compile(r'^\W*$')

non_alpha_pattern = re.compile(r'^[^a-zA-Z]+$')

# Define regex pattern
pattern = '\s*15px\|[a-zA-Z]+=\s*\|[a-zA-Z]+=[a-zA-Z]+\s*\|'

In [10]:
tqdm.pandas()

comment_df['text'] = comment_df['text'].str.replace(pattern, '')

# filter dataframe to remove rows that contain only non-alpha characters
comment_df = comment_df[~comment_df['text'].str.contains(non_alpha_pattern)]

# filter dataframe to remove rows that contain only non-word characters
comment_df = comment_df[~comment_df['text'].str.contains(non_word_pattern)]

# Add a new column to the DataFrame indicating the language of the text
comment_df['language'] = comment_df['text'].progress_apply(detectLanguage)

comment_df = comment_df[comment_df['language'] == 'en']

  comment_df['text'] = comment_df['text'].str.replace(pattern, '')
100%|██████████| 101449/101449 [07:15<00:00, 232.72it/s]


In [11]:
gfg_csv_data = rfc_df.to_csv('../../data/rfc.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 


CSV String:
 None


In [12]:
#comment_df.loc[60187,'date'] = '2022-03-22T23:36:00Z'
gfg_csv_data = comment_df.to_csv('../../data/rfc_comments.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 


CSV String:
 None


In [None]:
comment_df = pd.read_csv('../../data/rfc_comments.csv')
def getcleanDataFrame(df):
    #remove IP-Adresses from userArray
    df = df[~df['user'].str.contains(re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b'))]
    return list(df["user"].unique())

user_list = getcleanDataFrame(comment_df)

In [None]:
print(len(user_list))

In [None]:
def getUserInfoToJSON(userArray, output):
    """
    Takes list of users outputs list of JSON objects containing wiki projects, user rights, edit count, registration and first comment dates
    """ 
    with multiprocessing.Pool(processes=8) as pool:
        results = list(tqdm.tqdm(pool.imap(worker, userArray)))
        with open(output, 'w') as file:
            json.dump(list(results), file)
    
    
def worker(user):
    try:
        userDic = getUserInfoAcrossAllReplicaDatabases(user)
        return userDic
    except:
        print(f"failed to get userDic, with {user}")

In [None]:
from userinformation import getUserInfoToJSON
getUserInfoToJSON(user_list, "../json_files/user_info/users.json")

In [None]:
comment_df = pd.read_csv('rfc_comments.csv')

In [None]:
# group the dataframe by class and sample 65 rows from each group
df_labelling = comment_df.groupby('project').apply(lambda x: x.sample(65)).reset_index(drop=True)

In [None]:
# Define a function to generate URLs based on page IDs
def get_wikipedia_url(row):
    page_id = row['page_id']
    project = row['project']
    return f'https://{project}.org/wiki?curid={page_id}'

# Apply the function to the page_id column and assign the result to a new column
df_labelling['page_url'] = df_labelling.apply(get_wikipedia_url, axis = 1)

In [None]:
# Create a list of strings to add as new columns
new_cols = [ 'disrespect','respect','explanation','causal_reasoning','narrative', 'question', 'response', 'advocacy', 'public_interest','counterarguments', 'constructive_proposal']

# Add the new columns to the DataFrame
for col in new_cols:
    df_labelling[col] = ''

In [None]:
df_labelling = df_labelling.drop(["date","section","page_id", "rfc_id", "parent_id", "language"], axis=1)

In [None]:
# saving the DataFrame as a CSV file
gfg_csv_data = df_labelling.to_csv('label_rfc_statements.csv', index = False)
print('\nCSV String:\n', gfg_csv_data) 