In [None]:
import re
import ast
import pyspark
import time
import datetime
import html
from pyspark import SparkContext
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp

In [None]:
# Reuse the active SparkContext when this cell is re-executed in a live kernel;
# constructing a second SparkContext() in the same process raises ValueError.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [None]:
def attribute_search(attribute, string):
    """Return the value of the XML attribute ``attribute`` found in ``string``.

    The attribute is matched as ``attribute="value"``. The name is taken
    literally (regex metacharacters are escaped) and must not be the tail of a
    longer attribute name, so looking up ``Id`` never returns the value of
    ``UserId`` or ``PostId``.

    Args:
        attribute: Exact attribute name to look up.
        string: Text of a single XML element, e.g. one ``<row .../>`` line.

    Returns:
        The text between the double quotes (possibly an empty string), or
        ``None`` when the attribute is not present.
    """
    # The negative lookbehind rejects a match whose preceding character could
    # still belong to an XML name (letters, digits, '_', ':', '.', '-').
    pattern = r'(?<![\w:.-])' + re.escape(attribute) + r'="(.*?)"'
    result = re.search(pattern, string)
    if result is None:
        return None
    # The lazy group stops at the first '"', so the value never contains a quote.
    return result.group(1)

In [None]:
def tags_from_xml(line):
    """Build a ``pyspark.Row`` (Id, TagName, Count) from one Tags.xml row line.

    ``Id`` is converted to ``int``; ``TagName`` is kept as the raw attribute
    text; ``Count`` is an ``int``, or ``None`` when the attribute is missing
    or empty.
    """
    attrs = line.replace('<row', '').replace('/>', '')
    raw_count = attribute_search('Count', attrs)
    fields = {
        'Id': int(attribute_search('Id', attrs)),
        'TagName': attribute_search('TagName', attrs),
        'Count': None if not raw_count else int(raw_count),
    }
    return pyspark.Row(**fields)

In [None]:
def badges_from_xml(line):
    """Build a ``pyspark.Row`` from one Badges.xml row line.

    Fields: ``Id``, ``UserId`` and ``Class`` as ``int``, ``Name`` as raw text,
    ``Date`` as a ``datetime`` parsed with fractional seconds, and ``TagBased``
    as whatever Python literal the attribute text evaluates to.
    """
    attrs = line.replace('<row', '').replace('/>', '')

    def value_of(name):
        # Raw attribute text, or None when the attribute is absent.
        return attribute_search(name, attrs)

    return pyspark.Row(
        Id=int(value_of('Id')),
        UserId=int(value_of('UserId')),
        Name=value_of('Name'),
        Date=datetime.datetime.strptime(value_of('Date'), "%Y-%m-%dT%H:%M:%S.%f"),
        Class=int(value_of('Class')),
        TagBased=ast.literal_eval(value_of('TagBased')),
    )

In [None]:
def users_from_xml(line):
    """Build a ``pyspark.Row`` from one Users.xml row line.

    Integer attributes are converted with ``int``; ``CreationDate`` and
    ``LastAccessDate`` are parsed into ``datetime`` objects; ``DisplayName``,
    ``WebsiteUrl`` and ``Location`` are kept as raw attribute text (``None``
    when absent). ``Age`` is ``None`` when the attribute is missing or empty.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S.%f"
    attrs = line.replace('<row', '').replace('/>', '')

    def text(name):
        return attribute_search(name, attrs)

    def number(name):
        return int(text(name))

    def timestamp(name):
        return datetime.datetime.strptime(text(name), timestamp_format)

    def optional_number(name):
        raw = text(name)
        if raw:
            return int(raw)
        return None

    return pyspark.Row(
        Id=number('Id'),
        Reputation=number('Reputation'),
        CreationDate=timestamp('CreationDate'),
        DisplayName=text('DisplayName'),
        LastAccessDate=timestamp('LastAccessDate'),
        WebsiteUrl=text('WebsiteUrl'),
        Location=text('Location'),
        Age=optional_number('Age'),
        Views=number('Views'),
        UpVotes=number('UpVotes'),
        DownVotes=number('DownVotes'),
    )

In [None]:
def posts_from_xml(line):
    """Build a ``pyspark.Row`` from one Posts.xml row line.

    Required attributes (``Id``, ``PostTypeId``, ``CreationDate``, ``Score``)
    are converted directly and raise if absent. ``ParentId``, ``ViewCount``,
    ``OwnerUserId``, ``AnswerCount``, ``CommentCount`` and ``FavoriteCount``
    become ``None`` when the attribute is missing or empty.

    Text handling:
      * ``Body``  - entities are unescaped, then HTML comments and tags are
        removed; ``None`` when the attribute is absent.
      * ``Title`` - entities are unescaped (same treatment as ``Body`` and
        ``Tags``); ``None`` when missing or empty.
      * ``Tags``  - entities are unescaped, every ``<`` is dropped and every
        ``>`` becomes a space, so ``<a><b>`` yields ``'a b '``.
    """
    c = line.replace('<row', '').replace('/>', '')

    def optional_int(name):
        # Missing or empty attribute -> None, otherwise its integer value.
        raw = attribute_search(name, c)
        return int(raw) if raw else None

    row = dict()
    row['Id'] = int(attribute_search('Id', c))
    row['PostTypeId'] = int(attribute_search('PostTypeId', c))
    row['ParentId'] = optional_int('ParentId')
    row['CreationDate'] = datetime.datetime.strptime(
        attribute_search('CreationDate', c), "%Y-%m-%dT%H:%M:%S.%f")
    row['Score'] = int(attribute_search('Score', c))
    row['ViewCount'] = optional_int('ViewCount')

    # html.unescape(None) raises TypeError, so guard against a missing Body.
    body = attribute_search('Body', c)
    if body is None:
        row['Body'] = None
    else:
        row['Body'] = re.sub('(<!--.*?-->|<[^>]*>)', '', html.unescape(body))

    # The title is entity-escaped in the XML just like the body and the tags.
    title = attribute_search('Title', c)
    row['Title'] = html.unescape(title) if title else None

    tags = attribute_search('Tags', c)
    row['Tags'] = html.unescape(tags).replace('<', '').replace('>', ' ') if tags else None

    row['OwnerUserId'] = optional_int('OwnerUserId')
    row['AnswerCount'] = optional_int('AnswerCount')
    row['CommentCount'] = optional_int('CommentCount')
    row['FavoriteCount'] = optional_int('FavoriteCount')

    return pyspark.Row(**row)

In [None]:
def comments_from_xml(line):
    """Build a ``pyspark.Row`` from one Comments.xml row line.

    ``Id``, ``PostId`` and ``Score`` are converted to ``int``. ``Text`` has its
    entities unescaped and then HTML comments and tags removed. ``CreationDate``
    is parsed into a ``datetime``. ``UserId`` is ``None`` when the attribute is
    missing or empty.
    """
    attrs = line.replace('<row', '').replace('/>', '')

    comment_id = int(attribute_search('Id', attrs))
    post_id = int(attribute_search('PostId', attrs))
    score = int(attribute_search('Score', attrs))
    text = re.sub('(<!--.*?-->|<[^>]*>)', '', html.unescape(attribute_search('Text', attrs)))
    created = datetime.datetime.strptime(
        attribute_search('CreationDate', attrs), "%Y-%m-%dT%H:%M:%S.%f")

    raw_user = attribute_search('UserId', attrs)
    if raw_user:
        user_id = int(raw_user)
    else:
        user_id = None

    return pyspark.Row(
        Id=comment_id,
        PostId=post_id,
        Score=score,
        Text=text,
        CreationDate=created,
        UserId=user_id,
    )

In [None]:
# Read Users.xml, keep only the lines containing "row", and parse each one.
users_rdd = (
    sc.textFile('file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Users.xml')
      .filter(lambda line: "row" in line)
      .map(users_from_xml)
)

# Sanity-check a few parsed records, then build and inspect the DataFrame.
print(users_rdd.take(3))
users = sqlContext.createDataFrame(users_rdd)
users.printSchema()
users.show()

In [None]:
# Read Badges.xml, keep only the lines containing "row", and parse each one.
badges_rdd = (
    sc.textFile('file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Badges.xml')
      .filter(lambda line: "row" in line)
      .map(badges_from_xml)
)

# Sanity-check one parsed record, then build and inspect the DataFrame.
print(badges_rdd.take(1))
badges = sqlContext.createDataFrame(badges_rdd)
badges.printSchema()
badges.show()

In [None]:
# Read Posts.xml, keep only the lines containing "row", and parse each one.
posts_rdd = (
    sc.textFile('file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Posts.xml')
      .filter(lambda line: "row" in line)
      .map(posts_from_xml)
)

# Sanity-check a few parsed records, then build and inspect the DataFrame.
print(posts_rdd.take(3))
posts = sqlContext.createDataFrame(posts_rdd)
posts.printSchema()
posts.show()
# Look at the flattened tag strings on their own.
posts.select('Tags').show()

In [None]:
# Read Tags.xml, keep only the lines containing "row", and parse each one.
tags_rdd = (
    sc.textFile('file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Tags.xml')
      .filter(lambda line: "row" in line)
      .map(tags_from_xml)
)

# Sanity-check a few parsed records, then build and inspect the DataFrame.
print(tags_rdd.take(3))
tags = sqlContext.createDataFrame(tags_rdd)
tags.printSchema()
tags.show(100)

In [None]:
# Read Comments.xml, keep only the lines containing "row", and parse each one.
comments_rdd = (
    sc.textFile('file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Comments.xml')
      .filter(lambda line: "row" in line)
      .map(comments_from_xml)
)

# Sanity-check a few parsed records, then build and inspect the DataFrame.
print(comments_rdd.take(3))
comments = sqlContext.createDataFrame(comments_rdd)
comments.printSchema()
comments.show(100)

In [None]:
# Scratch check of the quoted-attribute regex on a hand-written badge line
# (Date, Class and TagBased are unquoted here); evaluates to 'Autobiographer'.
re.compile(r'Name=\"(.*?)\"').search(
    'Id="1" UserId="2" Name="Autobiographer" Date=2010-07-22T18:58:27.867 Class=3 TagBased=False'
).group(1)

In [None]:
# Unparsed lines of Badges.xml, reused by the hand-rolled parsing cells below.
badges_lines = sc.textFile(
    'file:///home/marek/Dokumenty/Notebooks/gis_stack_spark/data/Badges.xml'
)

In [None]:
def _badge_fields(line):
    """Parse one Badges.xml row line into a plain tuple.

    Returns ``(Id, UserId, Name, Date, Class, TagBased)``. Attributes are
    looked up by name from their ``name="value"`` pairs rather than by
    splitting the line on spaces, so a badge name that itself contains a
    space (for example "Nice Answer") no longer shifts the remaining fields.

    Raises:
        KeyError: if one of the six attributes is missing from the line.
    """
    values = dict(re.findall(r'([\w:.-]+)="(.*?)"', line))
    return (
        int(values['Id']),
        int(values['UserId']),
        values['Name'],
        datetime.datetime.strptime(values['Date'], "%Y-%m-%dT%H:%M:%S.%f"),
        int(values['Class']),
        ast.literal_eval(values['TagBased']),
    )


# Tuple-based variant of the badge parser, for use with an explicit schema.
badges_rdd = badges_lines.filter(lambda line: "row" in line).map(_badge_fields)

In [None]:
# Show the first 8 elements of badges_rdd.
badges_rdd.take(8)

In [None]:
# badgesSchema is not defined in any visible cell, so this cell raised
# NameError on a fresh kernel. The field order follows the tuples built for
# badges_rdd: (Id, UserId, Name, Date, Class, TagBased).
badgesSchema = StructType([
    StructField('Id', LongType()),
    StructField('UserId', LongType()),
    StructField('Name', StringType()),
    StructField('Date', TimestampType()),
    StructField('Class', LongType()),
    StructField('TagBased', BooleanType()),
])

sqlContext.createDataFrame(badges_rdd, badgesSchema).collect()

In [None]:
# NOTE(review): depends on badges_rdd and badgesSchema already existing in the
# kernel namespace; this cell defines neither of them.
sqlContext.createDataFrame(badges_rdd, badgesSchema).printSchema()

In [None]:
# Rebind badges_rdd to the bare attribute text of each row line: every '<row',
# '/>' and double quote is removed, then surrounding whitespace is stripped.
badges_rdd = (
    badges_lines
    .filter(lambda line: "row" in line)
    .map(lambda raw: raw.replace('<row', '').replace('/>', '').replace('"', '').strip())
)

In [None]:
# Show the first 3 elements of badges_rdd.
badges_rdd.take(3)