In [None]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

In [None]:
if IN_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
    !tar xf spark-3.3.2-bin-hadoop3.tgz
    !mv spark-3.3.2-bin-hadoop3 spark
    !pip install -q findspark
    import os
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["SPARK_HOME"] = "/content/spark"

mv: cannot move 'spark-3.3.2-bin-hadoop3' to 'spark/spark-3.3.2-bin-hadoop3': Directory not empty


In [None]:
import findspark
findspark.init()

In [None]:
spark_url = 'local'

In [None]:
from pyspark.sql import SparkSession
from itertools import combinations
from pyspark.sql import Row
from pyspark.sql.functions import col, from_json, expr, explode, struct, count
from pyspark.sql.types import ArrayType, StringType
import csv

In [None]:
spark = SparkSession.builder\
        .master(spark_url)\
        .appName('Spark SQL')\
        .getOrCreate()

In [293]:
from pyspark.sql.functions import sum, avg, min, max, count, desc, explode, split, regexp_replace, round, format_number
import re

path = '2023authornet.csv'
df = spark.read.csv(path, header=True, inferSchema=True)

In [294]:
df.printSchema()

root
 |-- authors: string (nullable = true)
 |-- citedby: string (nullable = true)
 |-- author_count: string (nullable = true)



In [295]:
df.describe()

DataFrame[summary: string, authors: string, citedby: string, author_count: string]

In [296]:
df.show(5)

+--------------------+-------+------------+
|             authors|citedby|author_count|
+--------------------+-------+------------+
|['Boobphahom S.',...|    0.0|           2|
|['Chauhan C.', 'K...|    0.0|           7|
|['Satanwat P.', '...|    0.0|           8|
|['Buakaew T.', 'R...|    0.0|           2|
|['Patchaiyappan A...|    0.0|           8|
+--------------------+-------+------------+
only showing top 5 rows



In [297]:
filtered_data = df.withColumn('cited-by', df.citedby.cast('int')).drop('_c0', 'citedby')
df_cleaned = filtered_data[filtered_data['cited-by'] > 1]

In [298]:
df_cleaned.count()

375

In [299]:
df_cleaned.show(5)

+--------------------+------------+--------+
|             authors|author_count|cited-by|
+--------------------+------------+--------+
|['Sereewatthanawu...|           4|       4|
|['Mahardawi B.', ...|           7|       2|
|['Umpreecha C.', ...|           4|       2|
|['Wahyuni D.K.', ...|           8|       2|
|['Nim B.', 'Rahay...|           9|       2|
+--------------------+------------+--------+
only showing top 5 rows



In [300]:
authors_schema = ArrayType(StringType())

df_with_authors_array = df_cleaned.withColumn("authors", from_json(col("authors"), authors_schema))
filtered_rows = df_with_authors_array.filter(expr("size(authors) > 1"))

edges = filtered_rows.rdd.flatMap(
    lambda row: [Row(node1=a, node2=b) for a, b in combinations(row["authors"], 2)]
)

edges_df = spark.createDataFrame(edges)
edges_weighted = edges_df.groupBy("node1", "node2").count().withColumnRenamed("count", "weight")

filtered_edges = edges_weighted.filter(col("weight") > 1)
filtered_edges.printSchema()
filtered_edges.show()

root
 |-- node1: string (nullable = true)
 |-- node2: string (nullable = true)
 |-- weight: long (nullable = false)

+--------------------+------------------+------+
|               node1|             node2|weight|
+--------------------+------------------+------+
|            Ament Z.|          Patki A.|     2|
|Jirawattanasomkul T.|  Likitlersuang S.|     2|
|        Aliyu A.A.A.|         Shinjo J.|     2|
|        Ratchahat S.| Assabumrungrat S.|     2|
|          Bhave V.M.|        Irvin M.R.|     2|
|      Wiriyakijja P.|          Villa A.|     2|
|        Motlagh S.R.|         Khezri R.|     2|
|           Inkong K.|          Linga P.|     2|
|           Jafari S.|       Worobo R.W.|     2|
|              Fan Y.|            Qin J.|     2|
|             Chen Z.|           Wang Q.|     2|
|      Al-Rubaye H.T.|           Maes M.|     3|
|            Kumar V.|          Duhan L.|     2|
|  Kijpaisalratana N.|        Irvin M.R.|     2|
|             Yang C.|          Zhang X.|     3|
|

In [301]:
filtered_edges_without_weight = filtered_edges.drop("weight")
filtered_edges_without_weight.toPandas().to_csv('filtered_authorship_2023.csv', index=False)