In [1]:
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Base names (without extension) of the XML files read from ./data/.
filenames = ['Badges', 'Comments', 'PostHistory', 'PostLinks', 'Posts', 'Tags', 'Users', 'Votes']

# Maps each base name to the list of <row> elements found in its file.
data = {}
for name in filenames:
    xml_path = f'./data/{name}.xml'
    # Read the whole file as UTF-8 text first, then parse it as XML.
    with open(xml_path, encoding="utf8") as f:
        xml_text = f.read()
    soup = BeautifulSoup(xml_text, 'lxml-xml')
    data[name] = soup.find_all('row')

In [3]:
# Badges: every attribute is read with [] (no fallback value).
badge_rows = [
    [badge['Id'], badge['UserId'], badge['Class'], badge['Name'], badge['TagBased'], badge['Date']]
    for badge in data['Badges']
]
badges_df = pd.DataFrame(badge_rows)

# Write the rows as CSV with neither the index column nor a header line.
badges_df.to_csv("C:/temp/badges.csv", sep=",", index=False, header=None)

In [4]:
# Comments: UserId and Text fall back to pd.NA, Score falls back to 0;
# the remaining attributes are read with [] (no fallback value).
comment_rows = [
    [
        comment['Id'],
        comment['PostId'],
        comment.get('UserId', pd.NA),
        comment.get('Score', 0),
        comment['ContentLicense'],
        comment.get('Text', pd.NA),
        comment['CreationDate'],
    ]
    for comment in data['Comments']
]
comments_df = pd.DataFrame(comment_rows)

# Write the rows as CSV with neither the index column nor a header line.
comments_df.to_csv("C:/temp/comments.csv", sep=",", index=False, header=None)

In [5]:
# Post history: Id, PostId, PostHistoryTypeId and CreationDate are read with []
# (no fallback value); every other attribute falls back to pd.NA when absent.
history_rows = [
    [
        revision['Id'],
        revision['PostId'],
        revision.get('UserId', pd.NA),
        revision['PostHistoryTypeId'],
        revision.get('ContentLicense', pd.NA),
        revision.get('RevisionGUID', pd.NA),
        revision.get('Text', pd.NA),
        revision.get('Comment', pd.NA),
        revision['CreationDate'],
    ]
    for revision in data['PostHistory']
]
history_df = pd.DataFrame(history_rows)

# Write the rows as CSV with neither the index column nor a header line.
history_df.to_csv("C:/temp/history.csv", sep=",", index=False, header=None)

In [6]:
# Post links: only RelatedPostId has a fallback (pd.NA); note that it is
# emitted before PostId in the column order.
link_rows = [
    [
        link['Id'],
        link.get('RelatedPostId', pd.NA),
        link['PostId'],
        link['LinkTypeId'],
        link['CreationDate'],
    ]
    for link in data['PostLinks']
]
links_df = pd.DataFrame(link_rows)

# Write the rows as CSV with neither the index column nor a header line.
links_df.to_csv("C:/temp/links.csv", sep=",", index=False, header=None)

In [7]:
# Build one 20-column row per post, keyed by the integer Id so that gaps in the
# Id sequence can be detected below. Id, PostTypeId and CreationDate are read
# with [] (no fallback value); counters fall back to 0, ContentLicense falls
# back to 'CC BY-SA 2.5', and every other attribute falls back to pd.NA.
posts_dict = {}
for d in data['Posts']:
    posts_dict[int(d['Id'])] = [
        d['Id'], d.get('OwnerUserId', pd.NA), d.get('LastEditorUserId', pd.NA), d['PostTypeId'],
        d.get('AcceptedAnswerId', pd.NA), d.get('Score', 0),
        # The attribute is spelled 'ParentId'. Attribute lookups here are
        # case-sensitive, so the previous 'Parentid' key never matched and the
        # parent column was always written empty.
        d.get('ParentId', pd.NA),
        d.get('ViewCount', 0), d.get('AnswerCount', 0), d.get('CommentCount', 0),
        d.get('Title', pd.NA), d.get('Tags', pd.NA), d.get('ContentLicense', 'CC BY-SA 2.5'),
        d.get('Body', pd.NA), d.get('FavoriteCount', 0), d['CreationDate'],
        d.get('CommunityOwnedDate', pd.NA), d.get('ClosedDate', pd.NA), d.get('LastEditDate', pd.NA),
        d.get('LastActivityDate', pd.NA),
    ]

# Pad in deleted posts for FK constraint: every Id in range(max Id) that has no
# row gets a placeholder whose only populated field is the Id (as a string).
# The range starts at 0, so a placeholder for Id 0 is emitted as well.
# Placeholders are appended after the real rows in the resulting frame.
for post_id in range(max(posts_dict.keys())):
    if post_id not in posts_dict:
        posts_dict[post_id] = [str(post_id)] + [pd.NA] * 19

posts_df = pd.DataFrame(posts_dict.values())  # Convert to DF
# Write the rows as CSV with neither the index column nor a header line.
posts_df.to_csv("C:/temp/posts.csv", sep=",", index=False, header=None)

In [8]:
# Tags: ExcerptPostId and WikiPostId fall back to pd.NA, Count falls back to 0;
# Id and TagName are read with [] (no fallback value).
tag_rows = [
    [
        tag['Id'],
        tag.get('ExcerptPostId', pd.NA),
        tag.get('WikiPostId', pd.NA),
        tag['TagName'],
        tag.get('Count', 0),
    ]
    for tag in data['Tags']
]
tags_df = pd.DataFrame(tag_rows)

# Write the rows as CSV with neither the index column nor a header line.
tags_df.to_csv("C:/temp/tags.csv", sep=",", index=False, header=None)

In [9]:
# Users: Id, Reputation, DisplayName, CreationDate and LastAccessDate are read
# with [] (no fallback value); vote counters fall back to 0 and every other
# attribute falls back to pd.NA.
user_rows = [
    [
        d['Id'], d.get('AccountId', pd.NA), d['Reputation'], d.get('Views', pd.NA),
        # The attributes are spelled 'DownVotes' / 'UpVotes'. Attribute lookups
        # here are case-sensitive, so the previous 'Downvotes' / 'Upvotes' keys
        # never matched and both columns were always written as 0.
        d.get('DownVotes', 0), d.get('UpVotes', 0), d['DisplayName'],
        d.get('Location', pd.NA), d.get('ProfileImageUrl', pd.NA), d.get('WebsiteUrl', pd.NA),
        d.get('AboutMe', pd.NA), d['CreationDate'], d['LastAccessDate'],
    ]
    for d in data['Users']
]
users_df = pd.DataFrame(user_rows)

# Write the rows as CSV with neither the index column nor a header line.
users_df.to_csv("C:/temp/users.csv", sep=",", index=False, header=None)

In [10]:
# Votes: UserId, PostId and BountyAmount fall back to pd.NA; Id, VoteTypeId and
# CreationDate are read with [] (no fallback value).
vote_rows = [
    [
        vote['Id'],
        vote.get('UserId', pd.NA),
        vote.get('PostId', pd.NA),
        vote['VoteTypeId'],
        vote.get('BountyAmount', pd.NA),
        vote['CreationDate'],
    ]
    for vote in data['Votes']
]
votes_df = pd.DataFrame(vote_rows)

# Write the rows as CSV with neither the index column nor a header line.
votes_df.to_csv("C:/temp/votes.csv", sep=",", index=False, header=None)