#### Names of people in the group

Please write the names of the people in your group in the next cell.

Marcus Stephan Nordal

Silviu Mihai

In [None]:
# Loading modules that we need
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Add your imports below this line
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, countDistinct, regexp_replace, explode, split

In [None]:
# Helper: read a Parquet-backed table from DBFS into a Spark DataFrame
def load_df(table_name: "name of the table to load") -> DataFrame:
    """Read the Parquet data stored at ``table_name`` and return it as a DataFrame.

    Relies on a ``spark`` session being available as a global; it is not
    created in the visible cells (presumably provided by the notebook
    environment -- confirm when running elsewhere).
    """
    parquet_reader = spark.read
    return parquet_reader.parquet(table_name)

# Tables used by the analysis below, loaded in one step (users first, then posts).
users_df, posts_df = (
    load_df("/user/hive/warehouse/users"),
    load_df("/user/hive/warehouse/posts"),
)

# Optional tables -- uncomment the ones you need
# comments_df = load_df("/user/hive/warehouse/comments")
# badges_df = load_df("/user/hive/warehouse/badges")

#### The Problem: Mining the Interests of Experts

In [None]:
# Pearson correlation between (a) an expert's reputation and (b) the diversity
# of the tags on the questions that expert has answered.
# An "expert" here is any user who owns at least one answer post (PostTypeId == 2).

# Reload the raw tables so this cell can be re-run safely: the rename below
# would otherwise be applied to an already-renamed frame.
users_df = load_df("/user/hive/warehouse/users")
posts_df = load_df("/user/hive/warehouse/posts")

# Give the users' key its own name so it can never be confused with the
# posts' "Id" column after the joins below.
users_df = users_df.withColumnRenamed('Id', 'UserId')

# expert answers: every answer post paired with the user who wrote it
expert_answers_df = posts_df.alias("p").join(
    users_df.alias("u"),
    (col("p.OwnerUserId") == col("u.UserId")) & (col("p.PostTypeId") == 2)
)

# variable a - experts' reputation.
# expert_answers_df has one row per answer, so de-duplicate to get exactly
# one (UserId, Reputation) row per expert.
experts_reputation_df = expert_answers_df.select("u.UserId", "u.Reputation").distinct()

experts_reputation_df.show()

# join each answer with its parent question to find the tags
expert_answers_questions_df = expert_answers_df.alias("ap").join(
    posts_df.alias("qp"),
    (col("ap.ParentId") == col("qp.Id"))
)

# select the expert's id and the question's tags.
# NOTE: this must be "ap.UserId". "ap.Id" is the Id of the *answer post*
# (the users' Id column was renamed above), so selecting it would count tags
# per answer and then join post ids against user ids.
tags_df = expert_answers_questions_df.select(col("ap.UserId"), col("qp.Tags").alias("Tags"))

# process the tags: "<a><b>" -> "a,b," -> one row per tag, dropping the empty
# trailing element and rows without tags
tags_df = tags_df.withColumn("Tags", regexp_replace("Tags", "<(.*?)>", "$1,"))
tags_df = tags_df.withColumn("Tags", explode(split(col("Tags"), ",")))
tags_df = tags_df.filter(col("Tags").isNotNull() & (col("Tags") != ""))

# total number of distinct tags for each expert
user_interests_df = tags_df.groupBy("UserId").agg(countDistinct("Tags").alias("DistinctTags"))

# Hard-coded number of unique tags in the data set (TODO: confirm how 638 was
# obtained). It only rescales variable b; Pearson correlation is unaffected by
# a constant positive scale factor.
total_unique_tags = 638

# variable b - diversity of user's interests
user_interests_df = user_interests_df.withColumn("InterestDiversity", col("DistinctTags") / total_unique_tags)

# join reputation and interest diversity on the expert's id; both sides
# already hold one row per expert, so the result does too
joined_df = experts_reputation_df.join(user_interests_df, on="UserId")

joined_df.show()

# vector assembler for the pearson correlation
va = VectorAssembler(inputCols=["Reputation", "InterestDiversity"], outputCol="features")
data = va.transform(joined_df).select("features")

# Correlation.corr returns a single row holding the 2x2 correlation matrix;
# the off-diagonal entry is the coefficient between the two variables.
correlation_matrix = Correlation.corr(data, "features").collect()[0][0]
pearson_correlation = correlation_matrix[1, 0]

print("Pearson Correlation Coefficient:", pearson_correlation)
# NOTE: an earlier version of this cell grouped by the answer post id instead
# of the user id and reported 0.0065856; re-run the cell to obtain the
# coefficient for the corrected per-expert computation.

Do expert users have specific interests, or do they have general interests?

The Pearson correlation coefficient between the experts' reputation and their interest diversity 
is 0.006585616084188508. A Pearson coefficient of 0 means that there is no linear correlation, 
so a value this close to 0 indicates that the correlation is slim to none: we did not find a 
significant negative or positive sway. The coefficient is very slightly positive, but in general 
the experts with the most reputation are neither the ones with the most diverse interests nor 
the ones with the least diverse interests. Likewise, the experts' interest diversity has hardly 
any bearing on their reputation, apart from this very small positive correlation.