In [5]:
%load_ext autoreload
%autoreload 2 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import findspark
findspark.init()

In [3]:
from random import random
from operator import add
from pyspark.sql import SparkSession
import pandas as pd


DATA_DIR = '/home/bob/gtd/Iconic/data/samples/'

In [4]:
spark = SparkSession\
    .builder\
    .appName("IconicSparkPy")\
    .getOrCreate()

# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [180]:
def read_dataframe(path):
    return spark.read.csv(DATA_DIR + path, sep = '\t', header = True, inferSchema = True)

papers = read_dataframe('Papers.txt')
authors = read_dataframe('Authors.txt')
fos = read_dataframe('FieldsOfStudy.txt')
affs = read_dataframe('Affiliations.txt')
paa = read_dataframe('PaperAuthorAffiliations.txt')
pf = read_dataframe('PaperFieldsOfStudy.txt')
pr = read_dataframe('PaperReferences.txt')
fc = read_dataframe('FieldOfStudyChildren.txt')

# Preliminaries
## Get root field of study 

In [149]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col

In [150]:
def computeRootFos( fieldId):
    filtered = fc.filter(col("child") == fieldId)
#     print(filtered.take(1))
    if len(filtered.take(1)) == 0:
        return fieldId
    parent = filtered.take(1)[0][0]
    if parent == fieldId:
#         print(parent)
        return parent
    else:
#         print('rec')
        return computeRootFos( parent)

In [13]:
computeRootFos(8105449)

12843

add the root field of study to each paper

In [14]:
from pyspark.sql import Row
pf.rdd.map(lambda row : Row(paper = row[0], fos = row[1], similarity = row[2])).toDF()
fields = pf.select("fos").distinct().collect()
fields = map(lambda field : {'fos' : field[0], 'root' :computeRootFos(field[0])}, fields)
fields = list(fields)

DataFrame[fos: bigint, paper: bigint, similarity: double]

In [15]:
fr = pd.DataFrame(fields)

define helper function to use the root dataframe for getting the root of a field id

In [17]:
def getRootFos(fos):
    return fr[fr["fos"] == 124952713]["root"]

In [31]:
from pyspark.sql.functions import udf
def getRootFos(fos):
    return fr[fr["fos"] == fos]["root"][0]

rootFos_udf = udf(lambda fos : getRootFos(fos))

In [33]:
from pyspark.sql.functions import lit
rootFos_udf(lit(124952713))
getRootFos(124952713)

Column<b'<lambda>(124952713)'>

124952713

## Aggregated new data

In [151]:
# ego country
authorDetails = paa.join(pf, "paper")\
    .select("author", "paper","affiliation", "fos")\
    .rdd.map(lambda row : Row(author = row[0], paper = row[1], affiliation = row[2], fos = getRootFos(row[3]))).toDF()\
    .groupBy("author", "fos", "affiliation")\
    .agg(F.countDistinct("paper"))\
    .join(MAG.dfAff, col("affiliation") == col("id"))\
    .select("author","fos","countryCode","country")

authorDetails

ValueError: RDD is empty

# Filtered tables

In [181]:
# FILTERED_PAA_YEARS
paa = paa.join(papers, col("id") == col("paper"))\
    .filter( (col("year") >= 2007) & (col("year") <= 2017))\
    .select("paper", "author", "affiliation")
paa

DataFrame[paper: int, author: bigint, affiliation: bigint]

In [182]:
# PF_WITH_ROOT
pf = pf.withColumn("root", rootFos_udf("fos"))
pf

DataFrame[paper: int, fos: bigint, similarity: double, root: string]

In [183]:
fos = 10
# FILTERED_PAA_FOS
paa = paa.join(pf, "paper")\
    .filter(col("root") == fos)\
    .select("paper", "author", "affiliation")
paa

DataFrame[paper: int, author: bigint, affiliation: bigint]

## Add test data

In [168]:
from pyspark.sql import Row

In [185]:
tpaa = [(1, 2, "RO"),(1,3, "RO"),(2,3, "RO"),(2,1, "RO"),(3,2, "EN"),(1,5, "EN"),(3,3, "RO"),(3,5, "EN"),(4,7, "IT")]
rdd = spark.sparkContext.parallelize(tpaa)
rdd = rdd.map(lambda x: Row(paper=int(x[0]), author=int(x[1]), countryCode = x[2]))
tpaa = spark.createDataFrame(rdd)
tpaa

DataFrame[author: bigint, countryCode: string, paper: bigint]

In [186]:
tpr = [(1,2), (3, 2), (3,4), (4, 1)]
rdd = spark.sparkContext.parallelize(paa_rdd)
rdd = rdd.map(lambda x: Row(citing=int(x[0]), cited=int(x[1])))
tpr = spark.createDataFrame(rdd)
tpr

DataFrame[cited: bigint, citing: bigint]

## New table head

In [184]:
# EGO_country
ego_country = paa.join(affs, col("affiliation") == col("id"))\
    .select("author", "paper", "affiliation", "countryCode")

In [187]:
# EGO_papers
paa.groupBy("author").count().withColumnRenamed("count","papers")

DataFrame[author: bigint, papers: bigint]

In [54]:
#EGO_citations
paa.join(pr, col("paper") == col("cited"))\
    .groupBy("author")\
    .count().withColumnRenamed("count","citations")

DataFrame[author: bigint, citations: bigint]

In [55]:
#EGO_coauthors
from pyspark.sql.functions import countDistinct
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.author").alias("coauthors"))

DataFrame[author: bigint, coauthors: bigint]

In [100]:
#EGO_edges - numarul de persoane diferite cu care a colaborat ego ? 
from pyspark.sql.functions import countDistinct
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .filter( col("1.author") != col("2.author"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.author"))

DataFrame[author: bigint, count(DISTINCT 2.author): bigint]

In [112]:
# ALTER_domestic
paa = ego_country
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .filter(col("1.countryCode") == col("2.countryCode"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.author").alias("domestics"))

DataFrame[author: bigint, domestics: bigint]

In [111]:
# ALTER_nondomestic
paa = ego_country
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .filter(col("1.countryCode") == col("2.countryCode"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.author").alias("nondomestics"))

DataFrame[author: bigint, nondomestics: bigint]

In [142]:
# ALTER_citations
from pyspark.sql.functions import count
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .join(paa.alias("3"), col("2.author") == col("3.author"))\
    .join(pr, col("cited") == col("3.paper"))\
    .groupBy(col("1.author"))\
    .agg(F.count("citing").alias("citations"))

DataFrame[author: bigint, citations: bigint]

In [152]:
from pyspark.sql.functions import count, sum
#ALTER_papers - count total number of papers written by coauthors
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .join(paa.alias("3"), col("2.author") == col("3.author"))\
    .filter(col("3.paper") != col("1.author"))\
    .groupBy(col("1.author").alias("ego"), col("2.author").alias("alter"))\
    .agg(F.countDistinct("3.paper").alias("papers"))\
    .groupBy("ego")\
    .agg(F.sum("papers"))

DataFrame[ego: bigint, sum(papers): bigint]

For below, we need a join that takes left elements that do not meet the condition with 0 default value

In [164]:
# ALTER_EGO_papers
paa.alias("1").join(paa.alias("2"),  col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.paper").alias("papers_not_alone"))\

DataFrame[author: bigint, papers_not_alone: bigint]

In [165]:
# ALTER_EGO_citations
paa.alias("1").join(paa.alias("2"),  col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .join(pr, col("1.paper") == col("cited"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("citing").alias("citations"))

DataFrame[author: bigint, citations: bigint]

In [155]:
# ALTER_country
paa.alias("1").join(paa.alias("2"),  col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .groupBy("1.author")\
    .agg(F.countDistinct("2.countryCode").alias("countries"))

DataFrame[author: bigint, countries: bigint]

In [156]:
# ALTER_MAX_papers
paa.alias("1").join(paa.alias("2"), col("1.paper") == col("2.paper"))\
    .join(paa.alias("3"), col("2.author") == col("3.author"))\
    .filter(col("1.paper") != col("3.paper"))\
    .groupBy(col("1.author").alias("ego"), col("2.author").alias("alter"))\
    .agg(F.countDistinct("3.paper").alias("papers"))\
    .groupBy("ego")\
    .agg(F.max("papers"))

DataFrame[ego: bigint, max(papers): bigint]

In [159]:
# ALTER_MAX_citations
paa.alias("1").join(paa.alias("2"),  col("1.paper") == col("2.paper"))\
    .filter(col("1.author") != col("2.author"))\
    .join(paa.alias("3"), col("2.author") == col("3.author"))\
    .filter(col("1.paper") != col("3.paper"))\
    .join(pr, col("1.paper") == col("cited"))\
    .groupBy("1.author", "2.author")\
    .agg(F.countDistinct("citing").alias("citations"))\
    .groupBy("1.author")\
    .agg(F.max("citations"))

DataFrame[author: bigint, max(citations): bigint]