# Task 4

In [0]:
# Loading modules that we need
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from typing import Any
from pyspark.sql.functions import corr
from pyspark.sql import functions as F


#### Connection to Local spark Cluster

In [None]:
# Connect to the Spark Master running in Docker
spark = SparkSession.builder \
    .appName("MyLocalNotebook4") \
    .master("spark://localhost:7077") \
    .config("spark.driver.host", "localhost") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "1g") \
    .getOrCreate()

# Verify connection
print(spark.version)
print("Spark is running on", spark.sparkContext.master)

In [0]:
# A helper function to load a table (stored in Parquet format) from DBFS as a Spark DataFrame 
def load_df(table_name: "name of the table to load") -> DataFrame:
    return spark.read.format("delta").load(table_name)

users_df = load_df("users")
posts_df = load_df("posts")

# Uncomment if you need
# comments_df = load_df("comments")
# badges_df = load_df("badges")

#### The problem: mining the interests of experts

The primary role of a questions and answering platform such as Stack Exchange is to connect two types of people. Namely, people who have questions in areas such as computer science or data science and knowledgeable people who can answer those questions reliably. Let's call the first category of people' knowledge seekers' and the second one 'expert users' or 'experts' for short.

Here we want to answer a question related to the diversity of topics that experts are interested in using our data. We want to know if expert users only answer questions in a specific set of topics or their interests include a wide variety of topics.

To answer the above question, we will compute the correlation between a user's expertise level and the diversity of topics of questions they have answered. The first step is to define two variables (or measures); first for 'user expertise level' and then for 'user interest diversity'. Then we will use the Pearson correlation coefficient to measure the linear correlation between the two variables. We define the variables as:

   - VariableA (the measure of user expertise level). We will use the 'Reputation' column from 'users' table, which according to Stack Exchange's documentation "is a rough measurement of how much the community trusts you; it is earned by convincing your peers that you know what you're talking about" as an indicator of a user's expertise level on the platform. 

   - VariableB (The measure of user interest diversity). We measure the diversity of a user's interests by computing the total number of distinct tags associated with the questions each user has answered divided by the total number of unique tags which is 638.

Compute the Pearson correlation coefficient between VariableA and VariableB, and based on the result you've got, answer the following question: 

     Do expert users have specif interests or do they have general interests?

Please explain your reasoning on how you reached your answer.

You should use Apache Spark API for your implementation. You can use the Spark implementation of the Pearson correlation coefficient.

In [0]:

df_varA = users_df.select("Id", "Reputation")
df_varA.show()





+---+----------+
| Id|Reputation|
+---+----------+
| -1|         1|
|  1|       101|
|  2|       101|
|  3|       101|
|  4|       101|
|  5|       215|
|  6|       101|
|  7|       101|
|  8|       101|
|  9|      1102|
| 10|       101|
| 11|       213|
| 12|       101|
| 14|      2782|
| 15|       101|
| 16|         1|
| 17|       236|
| 18|       101|
| 19|       101|
| 20|       101|
+---+----------+
only showing top 20 rows



In [0]:

# Find the number of distinct tags associated with a question each user has answered
# divide this number by fetch number of unique tags
def run_query2(query: "a SQL query string", df1: "DataFrame A", df2: "DataFrame B") -> Any:
    df1.createOrReplaceTempView("df1")
    df2.createOrReplaceTempView("df2")
    result = spark.sql(query)
    return result



q1 = "SELECT df2.Id, df1.Tags, df2.OwnerUserId FROM df1 INNER JOIN df2 ON df1.Id = df2.ParentId GROUP BY df1.Tags, df2.Id, df2.OwnerUserId ORDER BY df2.Id ASC"
unique_nb_tags = 638

count_splits = udf(lambda x : len(x) - 1, IntegerType())
VarB = udf(lambda x : x / 638, DoubleType())

result = run_query2(q1, posts_df, posts_df)
result.display()
df_tags = result

df_tags = df_tags.select('*', split("Tags", ">").alias("splits"))
df_tags = df_tags.withColumn("Splits", count_splits(df_tags["splits"]))
df_tags = df_tags.select("OwnerUserId", "Splits")
df_varB = df_tags.withColumn("Splits", VarB(df_tags["splits"]))
df_varB = df_varB.withColumnRenamed("OwnerUserId", "Id")
df_varB = df_varB.filter(df_varB["Id"].isNotNull())
df_varB = df_varB.groupBy("Id").sum("Splits")
df_varB.display()







Id,Tags,OwnerUserId
9,,51.0
10,,22.0
21,,14.0
23,,97.0
24,,14.0
25,,104.0
26,,115.0
27,,108.0
28,,118.0
29,,53.0


Id,sum(Splits)
463,0.0047021943573667
471,0.195924764890282
11141,0.0313479623824451
13285,0.0329153605015673
14450,0.0094043887147335
15790,0.0172413793103448
9465,0.0235109717868338
27760,0.012539184952978
29054,0.006269592476489
29744,0.0031347962382445


In [0]:
df_varA
df_varB
merged_df = df_varA.join(df_varB, on=["Id"])
merged_df.display()

correlation = merged_df.corr("Reputation", "sum(Splits)", "pearson")
print(correlation)

Id,Reputation,sum(Splits)
463,71,0.0047021943573667
471,1842,0.195924764890282
11141,364,0.0313479623824451
13285,188,0.0329153605015673
14450,131,0.0094043887147335
15790,139,0.0172413793103448
9465,1938,0.0235109717868338
27760,11,0.012539184952978
29054,96,0.006269592476489
29744,26,0.0031347962382445


0.7724926022809913



I first selected the reputation of each user with their user_id. Then I joined the Posts to find all the Tags for answers by Users. So I selected all the tags for each question and for each user. I calculated the amount of tags in one question divided by the amount of tags in total (638). Then I grouped all the users together and added the user interest diversity. Then I merged VarA and VarB on their respective user_ids. Then I had a table with Id, Reputation and their user interest diversity. Then I calculated the pearson coefficient on their reputation and their user interest diversity. We found a pearson coefficient of 0.77249 which means there is a high corellation between user interest diversity and their reputation. This means our users with high reputation have a more general interest then specific interests.

