In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from file_parsers import parse_badges, parse_posts, parse_comments, parse_users,\
    parse_posthistory, parse_postlinks, parse_votes, parse_tags

In [2]:
# Create (or reuse) a Spark session named 'StackExchange' with master
# 'local[*]', and keep a handle on its SparkContext for the parsers below.
spark = (
    SparkSession.builder
    .appName('StackExchange')
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

# Data Loading

In [4]:
# Root directory holding the StackExchange dump. Set the
# STACKEXCHANGE_DUMP_DIR environment variable to run the notebook on another
# machine; the original location stays the default.
stack = os.environ.get(
    'STACKEXCHANGE_DUMP_DIR',
    '/home/piotr/big_data/archive.org/download/stackexchange/',
)
# Sub-directory of the site being analysed; the trailing slash is kept so that
# `path + 'Posts.xml'` style concatenation keeps working.
subject = 'gardening.stackexchange.com/'
# os.path.join inserts the separator only when `stack` lacks a trailing one.
path = os.path.join(stack, subject)

In [5]:
def _xml_file(filename):
    """Return the location of one XML dump file inside `path`."""
    return path + filename


# Parse every XML file of the dump with its dedicated parser from file_parsers.
badges = parse_badges(sc, _xml_file('Badges.xml'))
posts = parse_posts(sc, _xml_file('Posts.xml'))
comments = parse_comments(sc, _xml_file('Comments.xml'))
users = parse_users(sc, _xml_file('Users.xml'))
posthistory = parse_posthistory(sc, _xml_file('PostHistory.xml'))
postlinks = parse_postlinks(sc, _xml_file('PostLinks.xml'))
votes = parse_votes(sc, _xml_file('Votes.xml'))
tags = parse_tags(sc, _xml_file('Tags.xml'))

In [None]:
# Column summary of the posts table.
(
    posts
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# Column summary of the comments table.
(
    comments
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# Column summary of the users table.
(
    users
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# Column summary of the post-history table.
(
    posthistory
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# Column summary of the post-links table.
(
    postlinks
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# Column summary of the votes table.
(
    votes
    .toPandas()  # collects the whole table to the driver as a pandas frame
    .info()      # prints the pandas summary of that frame
)

In [None]:
# How many different badge names exist.
# (To rank badges by frequency instead, replace the final count() with
#  .sort('count', ascending=False).)
(
    badges
    .groupBy('Name')  # one group per badge name
    .count()          # number of awards per badge name
    .count()          # number of rows, i.e. distinct badge names
)

In [None]:
# Keep only the rows of the badges table whose Name is 'Nice Answer'.
nice_answers = badges.where(
    col('Name') == 'Nice Answer'
)

In [None]:
# Disabled scratch queries, kept for reference:
#   nice_answers.groupBy('UserId').count().sort('count', ascending=False).show(20)
#   users.select(col('DownVotes')).show(20)
#   posthistory.printSchema()

# Print the first three rows of the comments table.
comments.show(n=3)

# Data Exploration

## Tags

In [6]:
# Fetch the first two rows of the tags table as a list of Row objects.
tags.limit(2).collect()

[Row(Id=1.0, TagName='houseplants', Count=802.0, ExcerptPostId=470.0, WikiPostId=469.0),
 Row(Id=7.0, TagName='rhubarb', Count=14.0, ExcerptPostId=2344.0, WikiPostId=2343.0)]

In [9]:
# Show the tags table ordered from the highest Count to the lowest.
(
    tags
    .orderBy(col('Count').desc())
    .show()
)

+-----+--------------------+------+-------------+----------+
|   Id|             TagName| Count|ExcerptPostId|WikiPostId|
+-----+--------------------+------+-------------+----------+
| 96.0|      identification|2092.0|        251.0|     250.0|
| 81.0|               trees|1071.0|        763.0|     762.0|
|207.0|           diagnosis| 835.0|        410.0|     409.0|
|  1.0|         houseplants| 802.0|        470.0|     469.0|
|  8.0|          vegetables| 589.0|        468.0|     467.0|
|196.0|        plant-health| 572.0|       2202.0|    2201.0|
|124.0|         fruit-trees| 531.0|        491.0|     490.0|
|134.0|             flowers| 475.0|        811.0|     810.0|
| 13.0|                lawn| 456.0|        765.0|     764.0|
|147.0|             indoors| 419.0|        782.0|     781.0|
| 22.0|            watering| 410.0|        702.0|     701.0|
|108.0|            diseases| 397.0|       1049.0|    1048.0|
|267.0|              leaves| 390.0|       2209.0|    2208.0|
| 11.0|                s

In [None]:
# Plotting setup: import matplotlib and ask IPython to render figures inline
# in the cell output.
import matplotlib
%matplotlib inline

In [None]:
# Reduce posts to the creation date (cast to a plain date) plus the answer and
# comment counters.
posts_time = posts.select(
    col('CreationDate').cast('date'),
    col('AnswerCount'),
    col('CommentCount'),
)

In [None]:
# Fetch the first two rows of posts_time as a list of Row objects.
posts_time.limit(2).collect()

In [None]:
from pyspark.sql.functions import month, year

# Number of posts per creation year, collected into a pandas frame.
# NOTE(review): the variable name mentions bitcoin, but the frame is built
# from whatever site `posts` was loaded from.
bitcoin_popularity = (
    posts_time
    .groupBy(year('CreationDate'))
    .count()
    .toPandas()
)

In [None]:
# Cumulative number of posts over the years.
#
# The frame holds the grouping key (the year) as an ordinary first column next
# to 'count', and groupBy() does not guarantee any row order. A bare
# cumsum() would therefore also accumulate the year values and add the counts
# in arbitrary order. Sort by year, use it as the index, and accumulate only
# the per-year post counts.
year_column = bitcoin_popularity.columns[0]  # grouping key precedes 'count'
(
    bitcoin_popularity
    .sort_values(year_column)
    .set_index(year_column)['count']
    .cumsum()
    .plot()
)