In [1]:
# !rm -rf ../data/df_raw/*
# !gsutil -m cp -r gs://np-training-tmp/stackoverflow/posts_processed/*.parquet ../data/raw/

In [2]:
#!pip install plotly

In [3]:
#import plotly.express as px


In [4]:
from pyspark.sql import SparkSession
import re
import lxml.html
from pyspark.sql.types import StringType, ArrayType
import pyspark.sql.functions as F
import pandas as pd

In [5]:
# Build (or reuse) the Spark session; RDD parallelism and SQL shuffle
# partitions are both pinned to 200.
spark = (
    SparkSession.builder
    .appName('app')
    .config("spark.default.parallelism", 200)
    .config("spark.sql.shuffle.partitions", 200)
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/23 16:16:42 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
22/10/23 16:16:42 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
22/10/23 16:16:42 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
22/10/23 16:16:42 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [6]:
# GCS locations of the two parquet inputs; each is loaded with
# spark.read.parquet in the cells that follow.
posts_path = "gs://np-training-tmp/stackoverflow/output/posts__qn_with_top_answer/"
posts_link_path = "gs://np-training-tmp/stackoverflow/output/posts__links/"



In [7]:
df_posts = spark.read.parquet(posts_path)


                                                                                

In [8]:
df_posts.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+------------------+
|      Id|AcceptedAnswerId|               Title|        QuestionBody|                Tags|ViewCount|AnswerCount|CommentCount|Score|        CreationDate|AnswerId|AcceptedAnswerBody|
+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+------------------+
|24460667|            null|Gnumake rules wit...|<p>I'm looking fo...|<makefile><gnu-ma...|      679|          2|           3|    0|2014-06-27 20:56:...|    null|              null|
|70430813|            null|Multiple statuses...|<p>when i check t...|<formsflow.ai><fo...|       24|          1|           0|    1|2021-12-21 04:51:...|    null|              null|
|71029804|            null|How to mock Amazo...|<p>I'm trying to ...|<java><unit-testi...|     

                                                                                

In [9]:
df_posts_link = spark.read.parquet(posts_link_path)

                                                                                

In [10]:
df_posts_link.show(5)

+------------+--------+-----------+
|component_id|      Id|OtherPostId|
+------------+--------+-----------+
|       11857|48483506|   40872114|
|      111173|26220432|   13252603|
|       41872|24063986|      32332|
|      248533|31704180|    1803627|
|       49609|53596207|   53524127|
+------------+--------+-----------+
only showing top 5 rows



In [11]:
# Verbose-mode pattern (whitespace inside it is ignored) matching one whole
# <pre>...</pre> block. `[^>]*` lets the opening tag carry attributes such as
# <pre class="lang-py">; `\b` keeps tags that merely start with "pre" from matching.
regex = r"""
    <pre\b[^>]*>.*?</pre>
    """

def clean_text(snippet:str):
    """Mask code blocks and strip the remaining HTML markup from a post body.

    Every ``<pre>...</pre>`` block is replaced by the literal ``[CODE]``; the
    result is then parsed with lxml and reduced to its text content.

    Args:
        snippet: HTML fragment, or ``None`` / empty / whitespace-only text.

    Returns:
        The plain-text rendering of ``snippet``. ``None``, ``''`` and
        whitespace-only input are returned unchanged. ``''`` is returned when
        lxml cannot build a document from the input.
    """
    # Whitespace-only text contains no markup, and lxml rejects it with
    # ParserError ("Document is empty"), so it is passed through like None / ''.
    if not snippet or not snippet.strip():
        return snippet

    snippet = re.sub(
        pattern=regex,
        repl='[CODE]',
        string=snippet,
        flags=re.IGNORECASE | re.DOTALL | re.MULTILINE | re.VERBOSE,
    )

    try:
        return str(lxml.html.fromstring(snippet).text_content())
    except lxml.etree.ParserError:
        # One unparsable row should not abort a whole Spark job.
        return ''

def parse_tags(content:str):
    """Split an angle-bracketed tag string into a list of tag names.

    Example: ``'<python><pandas>'`` -> ``['python', 'pandas']``.

    Args:
        content: Tag string, or ``None`` / ``''``.

    Returns:
        The tag names in order of appearance; ``[]`` when ``content`` is
        ``None`` or empty (``re.findall`` would raise ``TypeError`` on ``None``).
    """
    if not content:
        return []
    return re.findall(r'<(.+?)>',content)


# Spark UDF wrappers: clean_text yields a string column, parse_tags an
# array-of-strings column.
udf_clean_text = F.udf(clean_text, returnType=StringType())
udf_parse_tags = F.udf(parse_tags, returnType=ArrayType(StringType()))

In [12]:
clean_text(f"""

<p>I was asked to create a singleton that will..</p>.

<pre><code>KDF </code></pre>

<p> test </p>

""")

'I was asked to create a singleton that will...\n\n[CODE]\n\n test \n\n'

In [13]:
clean_text('')

''

In [14]:
clean_text(None)

In [15]:
# Replace both body columns with their cleaned text, turn Tags into an array,
# and cache the result because several later cells reuse it.
df_posts_processed = (
    df_posts
    .withColumn("QuestionBody", udf_clean_text("QuestionBody"))
    .withColumn("AcceptedAnswerBody", udf_clean_text("AcceptedAnswerBody"))
    .withColumn("Tags", udf_parse_tags("Tags"))
    .cache()
)


In [16]:
df_posts_processed.where(F.col("AcceptedAnswerBody").isNotNull()).show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+--------------------+
|      Id|AcceptedAnswerId|               Title|        QuestionBody|                Tags|ViewCount|AnswerCount|CommentCount|Score|        CreationDate|AnswerId|  AcceptedAnswerBody|
+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+--------------------+
| 3503488|         3608698|Why is Visual Stu...|this could be see...|[visual-studio-20...|      216|          1|           3|    1|2010-08-17 14:27:...| 3608698|The issue that re...|
|64419701|        64419983|Confused by R cod...|Chapt. 6.2 of Hyn...|                 [r]|       51|          1|           3|    0|2020-10-19 00:34:...|64419983|It's a MA of orde...|
|18900705|        18900858|Javascript case s...|I have a problem ...|[javascript, var

                                                                                

In [17]:
# Tag frequencies over posts that have an accepted-answer body, most frequent first.
pdf_top_tags = (
    df_posts_processed
    .filter(F.col("AcceptedAnswerBody").isNotNull())
    .select(F.explode("Tags").alias('tag'))
    .groupBy('tag')
    .count()
    .toPandas()
    .sort_values('count', ascending=False)
)

                                                                                

In [18]:
pdf_top_tags.head(30)

Unnamed: 0,tag,count
24158,javascript,1264162
56543,python,1014931
49960,java,925828
12483,c#,865432
52019,php,748739
3332,android,617115
15506,html,610678
23882,jquery,594328
34774,c++,441122
24495,css,424461


In [19]:
# Use the fully qualified option key: the bare "max_rows" pattern matches more
# than one option on recent pandas versions and raises OptionError.
pd.set_option("display.max_rows", None)


In [20]:
pdf_top_tags.head(500)

Unnamed: 0,tag,count
24158,javascript,1264162
56543,python,1014931
49960,java,925828
12483,c#,865432
52019,php,748739
3332,android,617115
15506,html,610678
23882,jquery,594328
34774,c++,441122
24495,css,424461


In [21]:
df_posts_link

DataFrame[component_id: bigint, Id: bigint, OtherPostId: bigint]

In [22]:
# Tag frequencies over link rows joined to their post (matched on Id),
# most frequent first.
pdf_top_tags_links = (
    df_posts_link
    .join(df_posts_processed, on=df_posts_link.Id == df_posts_processed.Id)
    .select(F.explode("Tags").alias('tag'))
    .groupBy('tag')
    .count()
    .toPandas()
    .sort_values('count', ascending=False)
)

                                                                                

In [23]:
pdf_top_tags_links.head(20)

Unnamed: 0,tag,count
10257,javascript,101869
24191,python,99128
21340,java,93097
22264,php,65189
5279,c#,55303
6580,html,44764
14785,c++,43231
1409,android,32606
10389,css,32405
10136,jquery,28289


In [24]:
# Tags that select the posts kept for the final datasets.
tag_subset = [
    'python', 'python-3.x', 'python-2.7', 'pandas',
    'pyspark', 'machine-learning', 'deep-learning',
    'scikit-learn', 'tensorflow', 'dataframe', 'r',
]

# The same tags as a Spark array literal, for use with array_intersect.
tag_subset_lit = F.array(*(F.lit(tag) for tag in tag_subset))

In [25]:
pdf_top_tags [ pdf_top_tags['tag'].isin(tag_subset) ]

Unnamed: 0,tag,count
56543,python,1014931
50849,r,255448
54981,python-3.x,157417
19104,pandas,149672
48159,dataframe,76190
54983,python-2.7,50598
13436,tensorflow,28618
8099,machine-learning,21499
32607,pyspark,14183
5116,scikit-learn,12053


In [26]:
#df_posts_processed_metada 

In [27]:
# Keep only posts that carry at least one tag from tag_subset.
df_posts_final = df_posts_processed.filter(
    F.size(F.array_intersect(F.col("Tags"), tag_subset_lit)) > 0
)



In [28]:
df_posts_final.show(5)

+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+------------------+
|      Id|AcceptedAnswerId|               Title|        QuestionBody|                Tags|ViewCount|AnswerCount|CommentCount|Score|        CreationDate|AnswerId|AcceptedAnswerBody|
+--------+----------------+--------------------+--------------------+--------------------+---------+-----------+------------+-----+--------------------+--------+------------------+
|33760194|            null|Python How to bur...|I'm writing the p...|[python, event-ha...|      491|          0|           2|    0|2015-11-17 15:02:...|    null|              null|
|15020895|            null|Python int-byte e...|i am currently st...|[python, data-str...|      155|          0|           3|    1|2013-02-22 09:33:...|    null|              null|
|47234657|            null|converting word i...|def translate(str...|[python, python-3.x]|     

In [29]:
# Convert the filtered posts to pandas and write them as one parquet file.
(
    df_posts_final
    .toPandas()
    .to_parquet("gs://np-training-tmp/stackoverflow/final/posts.parquet", index=False)
)

                                                                                

In [30]:
# Attach post details to both ends of every link: alias A is matched on
# link.Id, alias B on link.OtherPostId. Only rows whose B-side tags intersect
# tag_subset are kept.
df_posts_related = (
    df_posts_link
    .join(df_posts_processed.alias("A"), on=df_posts_link.Id == F.col("A.Id"))
    .join(df_posts_processed.alias("B"), on=df_posts_link.OtherPostId == F.col("B.Id"))
    .selectExpr(
        "component_id",
        "B.Id as PostId",
        "B.title as PostTitle",
        "A.title as RelatedPostTitle",
        "A.Id as RelatedPostId",
        "B.Tags",
    )
    .filter(F.size(F.array_intersect(F.col("Tags"), tag_subset_lit)) > 0)
)

In [31]:
df_posts_related

DataFrame[component_id: bigint, PostId: bigint, PostTitle: string, RelatedPostTitle: string, RelatedPostId: bigint, Tags: array<string>]

In [32]:
df_posts_related.printSchema()

root
 |-- component_id: long (nullable = true)
 |-- PostId: long (nullable = true)
 |-- PostTitle: string (nullable = true)
 |-- RelatedPostTitle: string (nullable = true)
 |-- RelatedPostId: long (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [33]:
df_posts_related.show(5)



+------------+-------+--------------------+--------------------+-------------+--------------------+
|component_id| PostId|           PostTitle|    RelatedPostTitle|RelatedPostId|                Tags|
+------------+-------+--------------------+--------------------+-------------+--------------------+
|      350052| 116139|How can I search ...|Open and read eac...|     37130140|[python, ms-word,...|
|      170688| 447117|Django: Increment...|Count number of V...|     46152452|[python, database...|
|      262475| 472575|select single ite...|Python equivalent...|     46886903|[python, iterator...|
|       50426|1444961|Is there a good P...|Generating tokens...|      6843021|       [c++, python]|
|      227083|1506901|Cleanest and most...|Get tommorow's da...|     69364070|[python, datetime...|
+------------+-------+--------------------+--------------------+-------------+--------------------+
only showing top 5 rows



                                                                                

In [34]:
def flatten_related_post(records:pd.DataFrame):
    """Collapse the rows of one group into a single summary record.

    The first row supplies ``PostId`` and ``PostTitle``. Those two values are
    also placed at the front of the id and title lists, ahead of every row's
    ``RelatedPostId`` / ``RelatedPostTitle``.

    Args:
        records: Non-empty frame with the columns ``PostId``, ``PostTitle``,
            ``RelatedPostId`` and ``RelatedPostTitle``.

    Returns:
        A ``pd.Series`` with the keys ``PostId``, ``PostTitle``,
        ``RelatedPostIds``, ``RelatedPostTitles`` and ``num_candidates``
        (number of rows + 1).
    """
    anchor = records.iloc[0]
    anchor_id = anchor['PostId']
    anchor_title = anchor['PostTitle']

    related_ids = [anchor_id]
    related_ids.extend(records['RelatedPostId'])

    related_titles = [anchor_title]
    related_titles.extend(records['RelatedPostTitle'])

    return pd.Series({
        'PostId': anchor_id,
        'PostTitle': anchor_title,
        'RelatedPostIds': related_ids,
        'RelatedPostTitles': related_titles,
        'num_candidates': len(records) + 1,
    })

In [35]:
# One summary row per component_id; the grouping key is dropped afterwards.
pdf_related_posts = (
    df_posts_related
    .toPandas()
    .groupby(['component_id'], as_index=False)
    .apply(flatten_related_post)
    .drop(columns=['component_id'])
)

                                                                                

In [36]:
pdf_related_posts.head(5)

Unnamed: 0,PostId,PostTitle,RelatedPostIds,RelatedPostTitles,num_candidates
0,57348742,How do I simulate a Scrollbar in tkInter Canvas,"[57348742, 68340045]",[How do I simulate a Scrollbar in tkInter Canv...,2
1,3494593,Shading a kernel density plot between two points.,"[3494593, 14863744, 14094644, 16504452, 488531...",[Shading a kernel density plot between two poi...,16
2,37949409,Dictionary in a numpy array?,"[37949409, 47689224, 61517741]","[Dictionary in a numpy array?, How to access t...",3
3,51519086,How to remove tkinter - - - - line's when crea...,"[51519086, 55088055]",[How to remove tkinter - - - - line's when cre...,2
4,63107594,How to deal with multi-level column names down...,"[63107594, 63107603, 62966295, 68674235, 63124...",[How to deal with multi-level column names dow...,6


In [37]:
pdf_related_posts['num_candidates'].value_counts()

2      23992
3       4641
4       1770
5        887
6        538
7        311
8        201
9        160
10       135
11        78
12        60
14        60
13        57
16        44
17        39
15        34
20        28
19        18
18        16
24        14
22        14
23        13
30        13
21        11
26        10
25        10
29         9
27         6
33         6
34         6
31         5
28         5
43         4
40         3
36         3
38         3
41         3
49         2
47         2
32         2
76         2
39         2
61         2
87         1
54         1
57         1
153        1
58         1
60         1
85         1
53         1
77         1
84         1
52         1
128        1
66         1
98         1
35         1
99         1
37         1
70         1
103        1
72         1
42         1
106        1
939        1
44         1
64         1
46         1
112        1
50         1
63         1
Name: num_candidates, dtype: int64

In [38]:
# Write the flattened related-post table as one parquet file.
pdf_related_posts.to_parquet(
    "gs://np-training-tmp/stackoverflow/final/related_posts.parquet",
    index=False,
)

In [39]:
# fig = px.histogram(pdf_related_posts, x="num_candidates")
# fig.show()

In [40]:
# NOTE(review): this repeats the export from two cells earlier (same frame,
# same destination), so running it writes the same file again — confirm
# whether the duplicate cell is still needed.
pdf_related_posts \
.to_parquet("gs://np-training-tmp/stackoverflow/final/related_posts.parquet", index=False)