# Libraries + read function declaration

In [31]:
# For data preprocessing, aggregation.
import xmltodict # XML parser.
import json
import pandas as pd # To manage dataframes.
from bs4 import BeautifulSoup # HTML parser used to strip HTML from Posts.

# For data scraping.
from requests.sessions import Session
import requests
import scrape # Scaper from Wikipedia.

def xml_to_data_frame(file_path: str) -> pd.DataFrame:
    """Read an XML file and return its ``<row>`` elements as a dataframe.

    The file is expected to have one root element whose children are
    ``<row>`` elements. Each row becomes one dataframe row, with one column
    per key that xmltodict produces for that row.

    Args:
        file_path: Path to the XML file; it is read as UTF-8.

    Returns:
        A dataframe with one row per ``<row>`` element. It is empty when the
        root element holds no rows.
    """
    with open(file_path, 'r', encoding="utf8") as f:
        # Named ``parsed`` so the built-in ``dict`` is not shadowed.
        parsed = xmltodict.parse(f.read())

    key = list(parsed)[0] # Get variable key.
    root = parsed[key]
    # The root may be empty (then it is not a mapping) or may lack <row>
    # children; both cases mean "zero rows" rather than an error.
    rows = root.get('row', []) if isinstance(root, dict) else []
    # A lone <row> comes back as a single mapping instead of a list, which
    # pandas rejects as "all scalar values"; wrap it so one row still works.
    if isinstance(rows, dict):
        rows = [rows]
    df = pd.DataFrame(rows)

    print(f"\"{key}\" file successfully read and converted to dataframe.\n" + 
        f"Number of rows:{df.shape[0]}\n************************\n")

    return df

# Manual data cleaning & aggregation
Each dataset has to be manually processed.

## 1/6 - Comments

In [32]:
# Load Datasets/Comments.xml into a dataframe with the helper defined above.
df_comments = xml_to_data_frame('Datasets/Comments.xml')

"comments" file successfully read and converted to dataframe.
Number of rows:174347
************************



In [33]:
# List the available columns before choosing which ones to keep.
df_comments.columns

Index(['@Id', '@PostId', '@Score', '@Text', '@CreationDate', '@UserId',
       '@ContentLicense', '@UserDisplayName'],
      dtype='object')

In [34]:
# We only want to keep Id, PostId, Text and UserId.
# Maps each source column to the name it gets in the cleaned dataframe.
comment_columns = {"@Id": "id", "@PostId": "postId", "@Text": "text", "@UserId": "userId"}

# Rename first and select by the new names afterwards: rename() ignores keys
# that are no longer present, so re-running this cell leaves df_comments
# unchanged instead of re-selecting the "@..." columns and getting all NaN.
df_comments = (
    df_comments
    .rename(columns=comment_columns)
    .reindex(columns=list(comment_columns.values()))
)

df_comments.head(1)

Unnamed: 0,id,postId,text,userId
0,8,2,To show that they have the same expressive pow...,10


In [35]:
# Finally, write into json
# Serialise with pandas, load the result back as plain Python records, then
# pretty-print those records to disk.
df_json = df_comments.to_json(orient="records")
parsed = json.loads(df_json)

with open("Datasets/comments.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(parsed, indent=4))
    print("Comments dataset succesfully converted to JSON!")

## 2/6 - Posts

In [37]:
# Load Datasets/Posts.xml into a dataframe with the helper defined above.
df_posts = xml_to_data_frame('Datasets/Posts.xml')

"posts" file successfully read and converted to dataframe.
Number of rows:92896
************************



In [38]:
# List the available columns before choosing which ones to keep.
df_posts.columns

Index(['@Id', '@PostTypeId', '@AcceptedAnswerId', '@CreationDate', '@Score',
       '@ViewCount', '@Body', '@OwnerUserId', '@LastEditorUserId',
       '@LastEditDate', '@LastActivityDate', '@Title', '@Tags', '@AnswerCount',
       '@CommentCount', '@FavoriteCount', '@ContentLicense', '@ParentId',
       '@OwnerDisplayName', '@ClosedDate', '@LastEditorDisplayName',
       '@CommunityOwnedDate'],
      dtype='object')

In [None]:
""" We want to keep Id, PostTypeI, AcceptedAnswerId, CreationDate, Score,
        ViewCount, Body, OwnerUserId, Title, Tags, AnswerCount, ParentId
"""
# Maps each Posts column we keep to its name in the cleaned dataframe.
# The earlier selection was copied from the Comments cell and asked for
# "@PostId", "@Text" and "@UserId", which Posts does not have, so those
# columns came back filled with NaN.
post_columns = {
    "@Id": "id",
    "@PostTypeId": "postTypeId",
    "@AcceptedAnswerId": "acceptedAnswerId",
    "@CreationDate": "creationDate",
    "@Score": "score",
    "@ViewCount": "viewCount",
    "@Body": "body",
    "@OwnerUserId": "ownerUserId",
    "@Title": "title",
    "@Tags": "tags",
    "@AnswerCount": "answerCount",
    "@ParentId": "parentId",
}

# Rename first and select by the new names afterwards: rename() ignores keys
# that are no longer present, so re-running this cell leaves df_posts unchanged.
df_posts = (
    df_posts
    .rename(columns=post_columns)
    .reindex(columns=list(post_columns.values()))
)

df_posts.head(1)

In [None]:
# Finally, write into json
df_json = df_posts.to_json(orient="records")
parsed = json.loads(df_json)

# Posts get their own output file. This cell used to write to
# "Datasets/comments.json", which overwrote the Comments export.
with open("Datasets/posts.json", "w", encoding="utf-8") as f:
    json.dump(parsed, f, indent=4)
    print("Posts dataset succesfully converted to JSON!")

In [17]:
# Smoke test for the XML reader (for testing purposes only).
# Author's note: works; took 18.6 seconds to execute when it was last run.
def test_xml_reading():
    """Run xml_to_data_frame over each dataset file and discard the results."""
    for xml_path in (
        'Datasets/Comments.xml',
        'Datasets/Posts.xml',
        'Datasets/Tags.xml',
        'Datasets/Users.xml',
        'Datasets/Votes.xml',
    ):
        xml_to_data_frame(xml_path)

test_xml_reading()

"comments" file successfully read and converted to dataframe.
Number of rows:174347
************************

"posts" file successfully read and converted to dataframe.
Number of rows:92896
************************

"tags" file successfully read and converted to dataframe.
Number of rows:637
************************

"users" file successfully read and converted to dataframe.
Number of rows:119115
************************

"votes" file successfully read and converted to dataframe.
Number of rows:399089
************************

