In [1]:
import os

from pyspark.sql import SparkSession

from file_parsers import parse_badges, parse_posts, parse_comments, parse_users,\
    parse_posthistory, parse_postlinks, parse_votes

In [2]:
# Create (or reuse) the Spark session for this notebook. The 'local[*]'
# master runs Spark in-process on the local machine.
spark = (
    SparkSession.builder
    .appName('StackExchange')
    .master('local[*]')
    .getOrCreate()
)
# Keep the underlying SparkContext; the parse_* calls later in the notebook
# take it as their first argument.
sc = spark.sparkContext

In [3]:
# Root directory of the local Stack Exchange data dump. Set the
# STACKEXCHANGE_DIR environment variable to point somewhere else; when it is
# unset the original machine-specific location is used.
stack = os.environ.get(
    'STACKEXCHANGE_DIR',
    '/home/piotr/big_data/archive.org/download/stackexchange/',
)
# Sub-directory of the site being analysed (trailing slash kept on purpose).
subject = 'ai.stackexchange.com/'
# os.path.join copes with a root given with or without a trailing slash and
# preserves the trailing slash of `subject`, so `path + 'Badges.xml'` in the
# loading cell still produces a valid file path.
path = os.path.join(stack, subject)

In [4]:
# Load every table of the dump with its dedicated parser from file_parsers.
# Each parser is given the SparkContext and the path of one XML file inside
# the site directory.
badges = parse_badges(sc, f'{path}Badges.xml')
posts = parse_posts(sc, f'{path}Posts.xml')
comments = parse_comments(sc, f'{path}Comments.xml')
users = parse_users(sc, f'{path}Users.xml')
posthistory = parse_posthistory(sc, f'{path}PostHistory.xml')
postlinks = parse_postlinks(sc, f'{path}PostLinks.xml')
votes = parse_votes(sc, f'{path}Votes.xml')

In [5]:
# Summary statistics (count / mean / stddev / min / max) for the badges table.
(
    badges
    .describe()
    .show()
)

+-------+-----------------+--------------------+--------------------+
|summary|           UserId|                Name|                Date|
+-------+-----------------+--------------------+--------------------+
|  count|            19587|               19587|               19587|
|   mean|10619.38341757288|                null|                null|
| stddev|6829.751794077075|                null|                null|
|    min|              1.0|            Altruist|2016-08-02T15:38:...|
|    max|          22810.0|reinforcement-lea...|2019-03-03T02:45:...|
+-------+-----------------+--------------------+--------------------+



In [6]:
# Summary statistics for the comments table.
# NOTE(review): the recorded output reports a count of 0 for the UserID
# column, i.e. it is null for every row — check how parse_comments fills it.
(
    comments
    .describe()
    .show()
)

+-------+-----------------+------------------+------------------+--------------------+--------------------+------+
|summary|               Id|            PostId|             Score|                Text|        CreationDate|UserID|
+-------+-----------------+------------------+------------------+--------------------+--------------------+------+
|  count|             9722|              9722|              9722|                9722|                9722|     0|
|   mean| 9168.57015017486|5912.5023657683605|0.3480765274634849|                null|                null|  null|
| stddev|4531.902413616199|2836.5003640471705|0.7432519890956761|                null|                null|  null|
|    min|              3.0|               1.0|               0.0|" Fortunately, we...|2016-08-02T15:44:...|  null|
|    max|          16566.0|           10968.0|              16.0|“But then with me...|2019-03-02T23:06:...|  null|
+-------+-----------------+------------------+------------------+---------------

In [7]:
# Summary statistics for the users table.
# NOTE(review): the recorded output reports a count of 0 for both EmailHash
# and Age — verify whether the dump or the parser leaves them empty.
(
    users
    .describe()
    .show()
)

+-------+------------------+------------------+--------------------+-----------+--------------------+--------------------+-----------------+-----------+------------------+------------------+-------------------+---------+----+
|summary|                Id|        Reputation|        CreationDate|DisplayName|      LastAccessDate|          WebsiteUrl|         Location|    AboutMe|             Views|           UpVotes|          DownVotes|EmailHash| Age|
+-------+------------------+------------------+--------------------+-----------+--------------------+--------------------+-----------------+-----------+------------------+------------------+-------------------+---------+----+
|  count|             20419|             20419|               20419|      20419|               20419|                4435|            10375|       7926|             20419|             20419|              20419|        0|   0|
|   mean|11855.315049708604| 43.11053430628336|                null|        NaN|                

In [8]:
# Peek at the first parsed PostHistory record; the value is shown as the
# cell output.
# NOTE(review): unlike the other tables this one is inspected with take(1)
# rather than describe().show() — the reason is not visible here.
posthistory.take(1)

[Row(Id=1.0, PostHistoryTypeId=2.0, PostId=1.0, RevisionGUID='acd11026-43b5-4640-a6dd-dfff1749f5b5', CreationDate='2016-08-02T15:39:14.947', UserId=8.0, UserDisplayName=None, Body=None, Comment=None, Text='What "backprop" does it mean? I\'ve Googled it, but it\'s showing backpropagation.\r\n\r\nIs "backprop" term is basically the same as backpropagation or it has a different meaning?', CloseReasonId=None)]

In [9]:
# Summary statistics for the post-links table.
# NOTE(review): the recorded output reports a count of 0 for PostLinkTypeId,
# i.e. it is null for every row — check how parse_postlinks fills it.
(
    postlinks
    .describe()
    .show()
)

+-------+------------------+--------------------+------------------+------------------+--------------+
|summary|                Id|        CreationDate|            PostId|     RelatedPostId|PostLinkTypeId|
+-------+------------------+--------------------+------------------+------------------+--------------+
|  count|               475|                 475|               475|               475|             0|
|   mean| 53244.56210526316|                null|  5789.34947368421| 4172.305263157895|          null|
| stddev|38291.787812747156|                null|3215.9652559378683|3005.1205170890617|          null|
|    min|             103.0|2016-08-02T19:22:...|              37.0|              10.0|          null|
|    max|          114055.0|2019-02-28T01:18:...|           10922.0|           10644.0|          null|
+-------+------------------+--------------------+------------------+------------------+--------------+



In [10]:
# Summary statistics for the votes table. In the recorded output UserId and
# BountyAmount are populated for only a small share of the rows
# (2599 and 75 out of 31258).
(
    votes
    .describe()
    .show()
)

+-------+------------------+-----------------+------------------+--------------------+-----------------+-----------------+
|summary|                Id|           PostId|        VoteTypeId|        CreationDate|           UserId|     BountyAmount|
+-------+------------------+-----------------+------------------+--------------------+-----------------+-----------------+
|  count|             31258|            31258|             31258|               31258|             2599|               75|
|   mean|17737.286006782262|4716.302994433425|3.5600806193614436|                null|9492.641015775298|67.66666666666667|
| stddev| 9638.090733972806|2957.759421723209| 3.742220256947708|                null|6627.252377994014|62.30013086909454|
|    min|               1.0|              1.0|               1.0|2016-08-02T00:00:...|             -1.0|             25.0|
|    max|           34329.0|          10973.0|              16.0|2019-03-02T00:00:...|          22782.0|            400.0|
+-------+-------