In [1]:
import os
# Select the Python interpreter PySpark should launch (must be set before the
# SparkSession is created).
os.environ.update(PYSPARK_PYTHON='/home/hduser/anaconda3/bin/python')

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from file_parsers import parse_badges, parse_posts, parse_comments, parse_users,\
    parse_posthistory, parse_postlinks, parse_votes, parse_tags

In [3]:
# Create (or reuse) the Spark session for this notebook.
# The 'yarn-client' master URL has been deprecated since Spark 2.0 and is
# rejected by Spark 3.x; the supported spelling is master 'yarn' with the
# deploy mode given separately ('client' keeps the driver in this notebook).
spark = (
    SparkSession.builder
    .appName('StackExchange')
    .master('yarn')
    .config('spark.submit.deployMode', 'client')
    .getOrCreate()
)
sc = spark.sparkContext
# Ship the helper module to the executors so code running there can import it.
sc.addPyFile('parser_utils.py')

# Data Loading

In [4]:
# Directory holding the StackExchange XML dump; file names are appended to it
# directly, so the trailing slash is required.
# Earlier local runs built the location as `stack + subject`, e.g.
#   stack = '/home/piotr/big_data/archive.org/download/stackexchange/'
#   subject = 'gardening.stackexchange.com/'   # or 'meta.stackoverflow.com/'
path = 'StackOverflowDump/'

In [5]:
def _in_dump(file_name):
    """Return the location of `file_name` inside the dump directory `path`."""
    return path + file_name


# Parse every table of the dump with its dedicated parser.
badges = parse_badges(sc, _in_dump('Badges.xml'))
posts = parse_posts(sc, _in_dump('Posts.xml'))
comments = parse_comments(sc, _in_dump('Comments.xml'))
users = parse_users(sc, _in_dump('Users.xml'))
posthistory = parse_posthistory(sc, _in_dump('PostHistory.xml'))
postlinks = parse_postlinks(sc, _in_dump('PostLinks.xml'))
votes = parse_votes(sc, _in_dump('Votes.xml'))
tags = parse_tags(sc, _in_dump('Tags.xml'))

# Posts

In [6]:
# Show the column names and types of the parsed posts; the numeric columns
# arrive as floats and are cast to integers in the next step.
posts.printSchema()

root
 |-- Id: float (nullable = true)
 |-- PostTypeId: float (nullable = true)
 |-- ParentId: float (nullable = true)
 |-- AcceptedAnswerId: float (nullable = true)
 |-- CreationDate: string (nullable = true)
 |-- Score: float (nullable = true)
 |-- ViewCount: float (nullable = true)
 |-- Body: string (nullable = true)
 |-- OwnerUserId: float (nullable = true)
 |-- LastEditorUserId: float (nullable = true)
 |-- LastEditorDisplayName: string (nullable = true)
 |-- LastEditDate: string (nullable = true)
 |-- LastActivityDate: string (nullable = true)
 |-- CommunityOwnedDate: string (nullable = true)
 |-- ClosedDate: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- AnswerCount: float (nullable = true)
 |-- CommentCount: float (nullable = true)
 |-- FavoriteCount: float (nullable = true)



In [7]:
# Columns kept for the analysis, in output order, with the type each is cast to.
_posts_column_types = [
    ('Id', 'integer'),
    ('PostTypeId', 'integer'),
    ('ParentId', 'integer'),
    ('CreationDate', 'timestamp'),
    ('AnswerCount', 'integer'),
]
posts_filtered = posts.select(
    [col(name).cast(dtype) for name, dtype in _posts_column_types]
)

In [8]:
# Peek at the first three rows to verify the casts.
posts_filtered.head(3)

[Row(Id=1, PostTypeId=1, ParentId=None, CreationDate=datetime.datetime(2013, 9, 24, 19, 4, 6, 940000), AnswerCount=3),
 Row(Id=2, PostTypeId=1, ParentId=None, CreationDate=datetime.datetime(2013, 9, 24, 19, 5, 22, 893000), AnswerCount=3),
 Row(Id=3, PostTypeId=2, ParentId=2, CreationDate=datetime.datetime(2013, 9, 24, 19, 13, 39, 430000), AnswerCount=None)]

In [9]:
# Number of posts for each PostTypeId value.
posts_filtered.groupBy('PostTypeId').count().show()

+----------+-----+
|PostTypeId|count|
+----------+-----+
|         1|  179|
|         5|    1|
|         4|    1|
|         2|  254|
+----------+-----+



## Number of questions without answers

In [10]:
# Count the posts whose AnswerCount is exactly 0.
has_no_answers = col('AnswerCount') == 0
posts_filtered.where(has_no_answers).count()

30

## Distribution of the time to the first answer

In [11]:
# Split the posts by type: PostTypeId 1 rows are treated as questions, 2 as answers.
QUESTION_TYPE_ID, ANSWER_TYPE_ID = 1, 2
questions = posts_filtered.where(col('PostTypeId') == QUESTION_TYPE_ID)
answers = posts_filtered.where(col('PostTypeId') == ANSWER_TYPE_ID)

In [12]:
# Preview the first five question rows.
questions.show(5)

+---+----------+--------+--------------------+-----------+
| Id|PostTypeId|ParentId|        CreationDate|AnswerCount|
+---+----------+--------+--------------------+-----------+
|  1|         1|    null|2013-09-24 19:04:...|          3|
|  2|         1|    null|2013-09-24 19:05:...|          3|
|  6|         1|    null|2013-09-24 19:46:...|          2|
| 10|         1|    null|2013-09-24 19:56:...|          7|
| 13|         1|    null|2013-09-24 20:22:...|          7|
+---+----------+--------+--------------------+-----------+
only showing top 5 rows



In [13]:
# Preview the first five answer rows (ParentId holds the id of the question answered).
answers.show(5)

+---+----------+--------+--------------------+-----------+
| Id|PostTypeId|ParentId|        CreationDate|AnswerCount|
+---+----------+--------+--------------------+-----------+
|  3|         2|       2|2013-09-24 19:13:...|       null|
|  4|         2|       1|2013-09-24 19:33:...|       null|
|  5|         2|       1|2013-09-24 19:42:...|       null|
|  7|         2|       2|2013-09-24 19:48:...|       null|
|  8|         2|       1|2013-09-24 19:53:...|       null|
+---+----------+--------+--------------------+-----------+
only showing top 5 rows



In [None]:
from pyspark.sql import functions as F
# Earliest answer timestamp per question. The aggregated column keeps Spark's
# default name, `min(CreationDate)`, which the join cell refers to.
earliest_answer = F.min(answers['CreationDate'])
first_answers = answers.groupBy('ParentId').agg(earliest_answer)
first_answers.show(5)

In [None]:
# join the questions with their first answers using DataFrame aliases
from pyspark.sql.functions import unix_timestamp
# Alias both frames so the join condition can use qualified column names.
df_as1 = questions.alias("questions")
df_as2 = first_answers.alias("first_answers")
question_has_answer = col("questions.Id") == col("first_answers.ParentId")
joined = df_as1.join(df_as2, question_has_answer, 'inner')

# Keep the question id, when it was asked, and when its first answer arrived.
questions_time = joined.select('Id', 'CreationDate', 'min(CreationDate)')

# Waiting time in minutes: difference of the two epoch-second timestamps / 60.
asked_at = unix_timestamp(col('CreationDate'))
answered_at = unix_timestamp(col('min(CreationDate)'))
diff = questions_time.withColumn('diff', (answered_at - asked_at) / 60)

first_answer_time = diff.select('Id', 'diff')
# Collect to the driver as a pandas DataFrame for local inspection and plotting.
first_answer_time_pandas = first_answer_time.toPandas()

In [None]:
# Waiting times below one minute (including any negative ones).
under_a_minute = first_answer_time_pandas['diff'] < 1
first_answer_time_pandas.loc[under_a_minute, 'diff']

In [None]:
# `sns` was used without ever being imported, so this cell failed with a
# NameError on a fresh kernel.
import seaborn as sns

# Distribution of the time (in minutes) from a question to its first answer,
# restricted to questions answered within the first day.
MINUTES_PER_DAY = 60 * 24
answered_within_a_day = first_answer_time_pandas['diff'] < MINUTES_PER_DAY
# Pass the values as `x=`: the meaning of the first positional argument of
# boxplot differs between seaborn versions.
sns.boxplot(x=first_answer_time_pandas.loc[answered_within_a_day, 'diff'])


In [None]:
#df_as1.join(df_as2, df_as1['Id'] == df_as2['ParentId'], 'inner').show()

In [None]:
# Display the aliased first-answers frame (quick check of what the join received).
df_as2

# Data Exploration

## Tags

In [None]:
# Peek at the first two parsed tag rows.
tags.take(2)

In [None]:
# Tags ordered from highest to lowest `Count`.
tags.orderBy(col('Count').desc()).show()

In [None]:
import matplotlib
%matplotlib inline

In [None]:
# Creation day of every post (time of day dropped) with its answer and comment counts.
posts_time = posts.select(
    col('CreationDate').cast('date'),
    col('AnswerCount'),
    col('CommentCount'),
)

In [None]:
# Peek at two rows to confirm CreationDate is now a date.
posts_time.take(2)

In [None]:
from pyspark.sql.functions import month, year
# Number of posts created in each calendar year, collected as a pandas DataFrame.
posts_per_year = posts_time.groupBy(year('CreationDate')).count()
bitcoin_popularity = posts_per_year.toPandas()

In [None]:
# Cumulative number of posts over the years.
# The previous version called cumsum() on the whole frame, which (a) also
# accumulated the year column and (b) ran in whatever order the Spark groupBy
# returned the rows. Sort by year, use it as the x axis, and accumulate only
# the post counts.
year_column = bitcoin_popularity.columns[0]  # the grouping key is the first column
(
    bitcoin_popularity
    .sort_values(year_column)
    .set_index(year_column)['count']
    .cumsum()
    .plot()
)