## Generate labelling csv file

In [3]:
import pandas as pd

In [4]:
comment_df = pd.read_csv('../data/rfc_comments.csv')
# Group the dataframe by project type and rfc_id
grouped = comment_df.groupby(['project', 'rfc_id'])

# Define a function to remove the first 2 comments per different rfc_id
def remove_comments(group):
    if len(group) > 2:
        return group.iloc[2:]
    else:
        return pd.DataFrame()

# Apply the function to each group and concatenate the results
filtered = grouped.apply(remove_comments).reset_index(drop=True)

In [5]:
# group the dataframe by class and sample 65 rows from each group
df_labelling = filtered.groupby('project').apply(lambda x: x.sample(65)).reset_index(drop=True)

In [6]:
# Define a function to generate URLs based on page IDs
def get_wikipedia_url(row):
    page_id = int(row['page_id'])
    project = row['project']
    return f'https://{project}.org/wiki?curid={page_id}'

# Apply the function to the page_id column and assign the result to a new column
df_labelling['page_url'] = df_labelling.apply(get_wikipedia_url, axis = 1)

In [7]:
# Create a list of strings to add as new columns
new_cols = [ 'disrespect','respect','explanation','causal_reasoning','narrative', 'question', 'response', 'advocacy', 'public_interest','counterarguments', 'constructive_proposal']

# Add the new columns to the DataFrame
for col in new_cols:
    df_labelling[col] = ''

In [8]:
df_labelling = df_labelling.drop(["date","section","page_id", "rfc_id", "parent_id", "language"], axis=1)

In [10]:
df_labelling = pd.read_csv('label_rfc_statements.csv')
df_labelling['id'] = df_labelling['id'].astype(int)

In [11]:
# saving the DataFrame as a CSV file
gfg_csv_data = df_labelling.to_csv('label_rfc_statements.csv', index = False)
print('\nCSV String:\n', gfg_csv_data)


CSV String:
 None


In [26]:
df_comments = pd.read_csv('../data/rfc_comments.csv')

In [13]:
df_comments.head()

Unnamed: 0,date,id,parent_id,section,text,user,project,rfc_id,page_title,page_id,language
0,2020-10-06T13:22:00Z,0,0,,There is a disagreement regarding the defintio...,Robynthehode,wikipedia,0,Talk:Skyscraper,29488,en
1,2020-10-07T06:22:00Z,1,0,==Summary of discussion==,The main contention is whether to include a si...,Robynthehode,wikipedia,0,Talk:Skyscraper,29488,en
2,2020-10-06T16:03:00Z,2,1,==Summary of discussion==,"It is not our purpose to define a term, but to...",Redrose64,wikipedia,0,Talk:Skyscraper,29488,en
3,2020-10-06T16:27:00Z,3,2,==Summary of discussion==,Thanks for your response. All the editors cont...,Robynthehode,wikipedia,0,Talk:Skyscraper,29488,en
4,2020-10-06T18:15:00Z,4,3,==Summary of discussion==,"What, all 32,000+ characters (excluding sigs a...",Redrose64,wikipedia,0,Talk:Skyscraper,29488,en


In [27]:
# sort the dataframe based on id and parent_id
df_out = df_comments.sort_values(['id', 'parent_id'])

In [28]:
df_out = df_out.drop(["date","section","page_id", "id", "rfc_id", "parent_id", "language", "project", "page_title"], axis=1)

In [20]:
df_out.head()

Unnamed: 0,text,user
0,There is a disagreement regarding the defintio...,Robynthehode
1,The main contention is whether to include a si...,Robynthehode
2,"It is not our purpose to define a term, but to...",Redrose64
3,Thanks for your response. All the editors cont...,Robynthehode
4,"What, all 32,000+ characters (excluding sigs a...",Redrose64


In [29]:
df_out = df_out[['user','text']]

In [30]:
# change the name of the 'old_name' column to 'new_name'
df_out = df_out.rename(columns={'user': 'speaker'})
df_out = df_out.rename(columns={'text': 'speech'})

In [31]:
df_out.head()

Unnamed: 0,speaker,speech
0,Robynthehode,There is a disagreement regarding the defintio...
1,Robynthehode,The main contention is whether to include a si...
2,Redrose64,"It is not our purpose to define a term, but to..."
3,Robynthehode,Thanks for your response. All the editors cont...
4,Redrose64,"What, all 32,000+ characters (excluding sigs a..."


In [32]:
# saving the DataFrame as a CSV file
gfg_csv_data = df_out.to_csv('rfc_statements.csv', index = False)
print('\nCSV String:\n', gfg_csv_data)


CSV String:
 None


In [4]:
delib_df = pd.read_csv('../data/rfc_statements_delibanalysis.csv')

In [5]:
delib_df = delib_df.drop('between_2000_and_3000_chars', axis=1)
delib_df = delib_df.drop('between_1000_and_2000_chars', axis=1)
delib_df = delib_df.drop('between_3000_and_4000_chars', axis=1)
delib_df = delib_df.drop('more_than_4000_chars', axis=1)
delib_df = delib_df.drop('less_than_1000_chars', axis=1)
delib_df = delib_df.drop('respect', axis=1)
delib_df = delib_df.drop('cleaned_comment', axis=1)

In [37]:
delib_df.head()

Unnamed: 0,speaker,speech,char_count,has_respect,has_question,has_question_parent,gender,pos,narrative,question,response,advocacy,public_interest,disrespect,explanation,causal_reasoning,counterarguments,constructive_proposal
0,Robynthehode,There is a disagreement regarding the defintio...,409,0,0,0,M,EX VBZ DT NN VBG DT NN IN DT NN NN RB DT NN NN...,0.0,0,0.0,1.0,0.0,0,0.0,0.0,0.0,0.0
1,Robynthehode,The main contention is whether to include a si...,571,0,0,0,M,DT JJ NN VBZ IN TO VB DT JJ NN CC RB NN NN NN ...,0.0,0,1.0,1.0,0.0,0,0.0,1.0,0.0,1.0
2,Redrose64,"It is not our purpose to define a term, but to...",630,0,0,0,M,PRP VBZ RB PRP$ NN TO VB DT NN CC TO VB IN WRB...,0.0,0,0.0,1.0,0.0,0,0.0,1.0,0.0,1.0
3,Robynthehode,Thanks for your response. All the editors cont...,315,1,0,0,M,NNS IN PRP$ NN PDT DT NNS VBG RB RB VBP IN JJ ...,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,Redrose64,"What, all 32,000+ characters (excluding sigs a...",236,0,1,0,M,WP DT NNS VBG NNS CC VB NN IN NN NN DT NN NN I...,0.0,1,1.0,0.0,0.0,0,1.0,0.0,0.0,1.0


In [6]:
df_comments = pd.read_csv('../data/rfc_comments.csv')
# sort the dataframe based on id and parent_id
df_out = df_comments.sort_values(['id', 'parent_id'])

In [7]:
df3 = pd.merge(delib_df, df_out, left_index=True, right_index=True)

In [8]:
df3 = df3.drop('pos', axis=1)
df3 = df3.drop('language', axis=1)

In [9]:
df3 = df3.drop('speech', axis=1)

In [10]:
gfg_csv_data = df3.to_csv('final_rfc_statements.csv', index = False)
print('\nCSV String:\n', gfg_csv_data)


CSV String:
 None
