In [3]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at a single, named location.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive/


In [4]:
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
!tar xf spark-2.4.8-bin-hadoop2.7.tgz
!pip install -q findspark
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-3.0.0-py3-none-any.whl (8.5 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-3.0.0


In [5]:
import os

# Tell PySpark where the JDK and the unpacked Spark distribution live.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

import findspark
findspark.init()  # must run before importing pyspark so the package can be found

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# getOrCreate() returns the already-running context when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
spark_conf = SparkConf().setAppName("YourTest").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=spark_conf)
sqlContext = SQLContext(sc)

In [6]:
import jsonlines
from pyspark.sql.functions import explode,col
from pyspark.sql import functions as F
from pyspark.sql.functions import concat_ws
import pandas as pd

In [7]:
# Location of the JSON-lines sample on the mounted Drive
data_path = "/content/drive/My Drive/UWaterloo/Fall 21/CS 631/Project/sample.jsonl"

# Read the records, then order them by `year`, largest value first
papers_unsorted = sqlContext.read.json(data_path)
data = papers_unsorted.orderBy("year", ascending=False)

# Row count of the loaded sample (displayed as the cell output)
data.count()

50000

In [77]:
# Main Tables Used for visuals

# Main authors data: one row per (paper, author) pair.
# Each paper's `authors` array is exploded, then the fields of the resulting
# element are pulled out into flat columns.
exploded_authors = data.select("id", explode("authors").alias("authors"))
author_struct = F.col("authors")

# (field inside the exploded element, output column name)
author_fields = [
    ("name", "name"),
    ("country", "country"),
    ("id", "author_id"),
    ("affiliation", "affiliation"),
    ("affiliationid", "affiliationid"),
    ("order", "order"),
]

authors_data = exploded_authors.select(
    exploded_authors.id.alias("paper_id"),
    *[author_struct.getItem(field).alias(column) for field, column in author_fields]
)

# Topics per Inst
# One row per (institution, topic); `count` is the number of DISTINCT papers.
# authors_data holds one row per (paper, author), so a plain row count would
# count the same paper once for every co-author from the same institution,
# while downstream tables present this value as "Number of Papers".
inst_topics = (
    authors_data
    .join(data.select("id", "topics"), authors_data.paper_id == data.id, "left")
    .select("affiliationid", "paper_id", explode("topics").alias("topics"))
    .groupBy("affiliationid", "topics")
    .agg(F.countDistinct("paper_id").alias("count"))
)

# Author Stats
# TODO: review/add g-index.
from pyspark.sql.window import Window

# Rank each author's papers from most to least cited.
# row_number() gives every paper its own position even when citation counts
# tie. dense_rank() would give tied papers one shared rank and overstate the
# h-index (citations [2, 2, 2] would yield 3 instead of the correct 2).
windowSpec = Window.partitionBy("author_id").orderBy(col("citationcount").desc())
citation_rank = F.row_number().over(windowSpec)

# NOTE(review): F.first() returns whichever row of the group Spark sees first,
# so `last_affiliation` / `last_paper_id` are not guaranteed to come from the
# most recent paper -- confirm whether an explicit ordering by year is needed.
authors_stats = (
    authors_data
    .join(data.select("id", "citationcount", "topics"), authors_data.paper_id == data.id, "left")
    # Flag papers whose rank does not exceed their citation count; because the
    # papers are ranked by descending citations, the per-author sum of these
    # flags is the h-index.
    .withColumn("h_index", F.when(citation_rank <= col("citationcount"), 1).otherwise(0))
    .groupBy("author_id")
    .agg(
        F.first("name").alias("name"),
        F.first("country").alias("country"),
        F.first("affiliation").alias("last_affiliation"),
        F.first("affiliationid").alias("affiliationid"),
        F.count("paper_id").alias("papers_count"),
        F.avg("order").alias("avg_order"),
        F.avg("citationcount").alias("avg_citations"),
        F.sum("citationcount").alias("total_citations"),
        F.sum("h_index").alias("h_index"),
        F.first("paper_id").alias("last_paper_id"),
        # Set of per-paper topic arrays; consumers flatten it for display.
        F.collect_set("topics").alias("all_topics"),
    )
    .sort("h_index", ascending=False)
)

# Inst Stats: per institution, a display name and country (taken from the
# first row of the group) plus distinct paper and author counts.
inst_aggregates = [
    F.first("affiliation").alias("name"),
    F.first("country").alias("country"),
    F.countDistinct("paper_id").alias("papers_count"),
    F.countDistinct("author_id").alias("authors_count"),
]
inst_stats = authors_data.groupBy("affiliationid").agg(*inst_aggregates)
                          

# Co-affiliations: for every institution, the number of distinct papers it
# shares with each affiliation listed on the same paper.
# Every author row is paired with ALL authors of its paper (itself included),
# so each institution also appears as its own `co_inst`.
author_affiliations = authors_data.select("paper_id", "author_id", "affiliation", "affiliationid")
paper_author_lists = data.select("id", explode("authors").alias("authors"))

coinst_data = (
    author_affiliations
    .join(paper_author_lists, authors_data.paper_id == data.id, "left")
    .select(
        "paper_id",
        "affiliation",
        "affiliationid",
        F.col("authors").getItem("affiliation").alias("co_inst"),
    )
    .groupBy("affiliationid", "affiliation", "co_inst")
    .agg(F.countDistinct("paper_id").alias("co_inst_count"))
)

In [78]:
# Filters
# Lookup table of the institutions available for selection (id + name)
Inst = (
    authors_data
    .select("affiliationid", "affiliation")
    .dropDuplicates()
    .toPandas()
)

# Selected institution ID
selected_inst = 151746483  # uwaterloo

In [79]:
# Plot Libraries
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go



In [80]:
# Summary row (name, country, paper and author counts) for the selected institution
inst_stats.where(F.col("affiliationid") == selected_inst).show()


+-------------+--------------------+-------+------------+-------------+
|affiliationid|                name|country|papers_count|authors_count|
+-------------+--------------------+-------+------------+-------------+
|    151746483|University of Wat...| Canada|         113|          197|
+-------------+--------------------+-------+------------+-------------+



In [81]:
def top_20_topics(selected_inst):
  """Show a Plotly table of the 20 topics with the largest count for one institution.

  Args:
    selected_inst: value matched against the `affiliationid` column of the
      notebook-level `inst_topics` DataFrame.

  Returns:
    None. The figure is displayed via `fig.show()`.
  """
  inst_rows = inst_topics.filter(col("affiliationid") == selected_inst)
  top_rows = inst_rows.select("topics", "count").orderBy(col("count").desc()).limit(20)
  topics = top_rows.toPandas()

  table_header = dict(values=["Topic", "Number of Papers"],
                      fill_color='paleturquoise',
                      align='left')
  table_cells = dict(values=[topics["topics"], topics["count"]],
                     fill_color='lavender',
                     align='left')

  fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
  fig.show()

def top_authors_table(selected_inst):
  """Show a Plotly table of up to 30 authors of one institution, highest h-index first.

  Args:
    selected_inst: value matched against the `affiliationid` column of the
      notebook-level `authors_stats` DataFrame.

  Returns:
    None. The figure is displayed via `fig.show()`.
  """
  # Filter BEFORE projecting: `affiliationid` is not one of the displayed
  # columns, so filtering after select() only works because Spark re-resolves
  # the dropped column. The explicit sort makes "top" independent of how
  # `authors_stats` happened to be ordered upstream.
  top_authors = authors_stats.filter(col("affiliationid") == selected_inst)\
                            .sort("h_index", ascending=False)\
                            .select("name", "papers_count", "h_index", "avg_citations", "total_citations", "all_topics")\
                            .limit(30).toPandas()

  def _flatten_topics(topic_lists):
    # `all_topics` holds a collection of per-paper topic lists; merge them into
    # one de-duplicated list. Sorting keeps the displayed order stable across runs.
    return sorted({topic for topics in topic_lists for topic in topics}, key=str)

  top_authors["all_topics"] = top_authors["all_topics"].apply(_flatten_topics)

  fig = go.Figure(data=[go.Table(
      header=dict(values=["Name", "Number of Papers", "H Index", "Avg. Citations", "Total Citations Count", "Topics"],
                  fill_color='paleturquoise',
                  align='left'),
      cells=dict(values=[top_authors["name"],
                         top_authors["papers_count"],
                         top_authors["h_index"],
                         top_authors["avg_citations"].round(2),
                         top_authors["total_citations"],
                         # One comma-separated string per author
                         top_authors["all_topics"].apply(lambda topics: ', '.join(map(str, topics)))],
                 fill_color='lavender',
                 align='left'))
  ])

  fig.show()
  
def _top_n_with_others(counts, top_n):
  """Keep the first `top_n` rows of `counts` and fold the rest into one 'others' row."""
  head = counts.iloc[:top_n]
  remainder_total = counts["co_inst_count"].iloc[top_n:].sum()
  others = pd.DataFrame(data={
      'co_inst': ['others'],
      'co_inst_count': [remainder_total]
  })
  return pd.concat([head, others], ignore_index=True)


def coinst_pie(selected_inst,top_n=8):
  """Show a pie chart of the affiliations that most often share papers with one institution.

  Args:
    selected_inst: value matched against the `affiliationid` column of the
      notebook-level `coinst_data` DataFrame.
    top_n: number of co-affiliations shown individually; all remaining ones
      are summed into a single 'others' slice.

  Returns:
    None. The figure is displayed via `fig.show()`.
  """
  co_insts = coinst_data.filter(col("affiliationid")==selected_inst)\
                        .select("affiliation","co_inst","co_inst_count")\
                        .sort("co_inst_count",ascending=False)\
                        .toPandas()

  # coinst_data pairs every institution with itself as well. Remove that row by
  # comparing names instead of blindly dropping the first row: the self row is
  # not guaranteed to sort first (ties), and an unrelated top row would be lost.
  # NOTE(review): the match is by affiliation NAME; spelling variants recorded
  # under the same affiliationid would not be recognised as "self" -- confirm.
  is_self = co_insts["co_inst"] == co_insts["affiliation"]
  co_insts = co_insts.loc[~is_self, ["co_inst","co_inst_count"]].dropna()

  co_insts = _top_n_with_others(co_insts, top_n)\
      .rename({'co_inst': 'Co-Affiliation', 'co_inst_count': 'Number of Papers'}, axis=1)

  fig = px.pie(co_insts, values="Number of Papers", names='Co-Affiliation', color='Co-Affiliation')
  fig.update_traces(textposition='inside', textinfo='label',hoverinfo='label+percent+value')
  fig.show()


In [82]:
# Tests
# Render each visual for the selected institution, in display order.
for render_visual in (top_authors_table, top_20_topics, coinst_pie):
  render_visual(selected_inst)