# Install dependencies

In [1]:
# Install PySpark via the %pip magic so the rest of the notebook can import it.
# NOTE(review): the version is not pinned, so a later re-run may pull a different release.
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
def jprint(data):
    """Pretty-print ``data`` as JSON with a four-space indent.

    Args:
        data: Any JSON-serializable object (dict, list, str, number, ...).

    Raises:
        TypeError: If ``data`` contains a value ``json.dumps`` cannot serialize.
    """
    # Imported locally because the notebook never imports ``json`` at the top
    # level; without this the first call fails with a NameError.
    import json

    print(json.dumps(data, indent=4))

# Spark for processing

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace

# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("DSDE2024")
    .getOrCreate()
)

# Read the input CSV, treating its first line as the column names.
csv_reader = spark.read.option("header", "true")
df = csv_reader.csv('./output.csv')

# The "Class" column holds a stringified list such as "['MEDI', 'BIOC']".
# Step 1: strip the square brackets and single quotes, leaving "MEDI, BIOC".
# Raw strings keep the backslashes intact for the regex engine and avoid
# Python's invalid-escape-sequence warning (the old ",\s*" literal triggered it).
df = df.withColumn("Class", regexp_replace(col("Class"), r"[\[\]']", ""))

# Step 2: split on commas (plus any whitespace after them) to get an array column.
df = df.withColumn("Class", split(col("Class"), r",\s*"))

# Show the transformed DataFrame to verify the parsing; truncate=False keeps
# long cell values from being cut off.
df.show(truncate=False)

24/05/09 15:49:58 WARN Utils: Your hostname, Pirayans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.203.105.253 instead (on interface en0)
24/05/09 15:49:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/09 15:49:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-----+----------+------+------------+-------------+--------+------+
|id   |publicDate|source|coAuthorship|citationCount|refCount|Class |
+-----+----------+------+------------+-------------+--------+------+
|N0001|14/12/2020|1.5   |19          |2            |44      |[MEDI]|
|N0002|25/11/2020|1.5   |14          |5            |37      |[MEDI]|
|N0003|09/12/2020|1.5   |29          |5            |44      |[MEDI]|
|N0004|26/10/2020|1.5   |6           |1            |23      |[MEDI]|
|N0005|16/11/2020|1.5   |12          |5            |53      |[MEDI]|
|N0006|23/12/2020|1.5   |18          |5            |52      |[MEDI]|
|N0007|15/12/2020|1.5   |11          |5            |70      |[MEDI]|
|N0008|01/12/2020|1.5   |5           |3            |38      |[MEDI]|
|N0009|21/10/2020|1.5   |20          |5            |43      |[MEDI]|
|N0010|14/12/2020|1.5   |9           |33           |29      |[MEDI]|
|N0011|21/12/2018|1.5   |14          |5            |54      |[BIOC]|
|N0012|21/12/2018|1.5   |3        

In [4]:
# Keep a single row per paper id; rows sharing an id are collapsed into one.
# NOTE(review): which of the duplicate rows survives is presumably arbitrary — confirm that is acceptable.
df = df.dropDuplicates(["id"])

In [5]:
# Display how many rows `df` currently holds.
df.count()

1200

24/05/09 15:50:11 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Convert the three count features to integers so they can be aggregated and
# used in arithmetic. Each withColumn replaces the column of the same name.
df = (
    df
    .withColumn('citationCount', col('citationCount').cast('int'))
    .withColumn('coAuthorship', col('coAuthorship').cast('int'))
    .withColumn('refCount', col('refCount').cast('int'))
)

In [7]:
from pyspark.sql.functions import avg, min, max, countDistinct, explode, split, col, round, sum

# Column-wise maxima of the three numeric features.
# NOTE: `max` here is pyspark.sql.functions.max (imported above), not the builtin.
max_exprs = [
    max("citationCount").alias("maxCitation"),
    max("refCount").alias("maxRef"),
    max("coAuthorship").alias("maxCoAuthor"),
]

# A global aggregation yields exactly one row; pull it back to the driver.
max_values = df.agg(*max_exprs).collect()[0]

# Unpack the per-feature maxima; they serve as normalization denominators.
max_citation, max_ref, max_coauthor = (
    max_values["maxCitation"],
    max_values["maxRef"],
    max_values["maxCoAuthor"],
)

In [8]:
# Tally how many rows carry each class label: explode the "Class" array so
# every label gets its own row, then count rows per label.
genre_counts = (
    df.withColumn("Genre", explode(col("Class")))
    .groupBy("Genre")
    .count()
)

# One row per distinct label, so the row count is the number of classes.
print('number of all the class:', genre_counts.count())
genre_counts.show()

number of all the class: 7
+-----+-----+
|Genre|count|
+-----+-----+
| COMP|   60|
| IMMU|   60|
| PHYS|   60|
| BIOC|  120|
| NEUR|   60|
| MEDI|  780|
| CHEM|   60|
+-----+-----+



In [9]:
# Flatten the "Class" array: a row with several labels becomes one row per
# label, with "Class" now holding a single label instead of an array.
single_class = explode(col("Class"))
exploded_df = df.withColumn("Class", single_class)

In [10]:
# Discard every row that has a null in any column (how="any" is the default).
cleaned_df = exploded_df.dropna(how="any")

# Report how many rows survived, then preview a few of them.
remaining_rows = cleaned_df.count()
print(remaining_rows)
cleaned_df.show(5)

1200
+-----+----------+------+------------+-------------+--------+-----+
|   id|publicDate|source|coAuthorship|citationCount|refCount|Class|
+-----+----------+------+------------+-------------+--------+-----+
|N0001|14/12/2020|   1.5|          19|            2|      44| MEDI|
|N0002|25/11/2020|   1.5|          14|            5|      37| MEDI|
|N0003|09/12/2020|   1.5|          29|            5|      44| MEDI|
|N0004|26/10/2020|   1.5|           6|            1|      23| MEDI|
|N0005|16/11/2020|   1.5|          12|            5|      53| MEDI|
+-----+----------+------+------------+-------------+--------+-----+
only showing top 5 rows



In [11]:
# Per-paper score: each feature is divided by its maximum and scaled by 10,
# then weighted (0.4 citations, 0.2 references, 0.1 co-authors). The weighted
# sum is multiplied by the "source" column and rounded to 4 decimal places.
# NOTE: `round` here is pyspark.sql.functions.round, not the builtin.
citation_term = 0.4 * (col("citationCount") / max_citation * 10)
reference_term = 0.2 * (col("refCount") / max_ref * 10)
coauthor_term = 0.1 * (col("coAuthorship") / max_coauthor * 10)

weighted_sum = citation_term + reference_term + coauthor_term
score_column = round(col("source") * weighted_sum, 4)

cleaned_df = cleaned_df.withColumn("Score", score_column)

cleaned_df.show(10)

+-----+----------+------+------------+-------------+--------+-----+------+
|   id|publicDate|source|coAuthorship|citationCount|refCount|Class| Score|
+-----+----------+------+------------+-------------+--------+-----+------+
|N0001|14/12/2020|   1.5|          19|            2|      44| MEDI|1.2426|
|N0002|25/11/2020|   1.5|          14|            5|      37| MEDI|1.6426|
|N0003|09/12/2020|   1.5|          29|            5|      44| MEDI|1.8165|
|N0004|26/10/2020|   1.5|           6|            1|      23| MEDI|0.6301|
|N0005|16/11/2020|   1.5|          12|            5|      53| MEDI|1.9369|
|N0006|23/12/2020|   1.5|          18|            5|      52| MEDI|1.9352|
|N0007|15/12/2020|   1.5|          11|            5|      70| MEDI|2.2528|
|N0008|01/12/2020|   1.5|           5|            3|      38| MEDI|1.2722|
|N0009|21/10/2020|   1.5|          20|            5|      43| MEDI|1.7722|
|N0010|14/12/2020|   1.5|           9|           33|      29| MEDI|6.5693|
+-----+----------+------+

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, year, quarter, col, sum as spark_sum, round as spark_round, count
from pyspark.sql.window import Window

# Relies on `cleaned_df` (with its "Score" column) from the earlier cells.
# Parse publicDate from a "dd/MM/yyyy" string into a date, then derive the
# calendar year and quarter from the parsed value.
cleaned_df = (
    cleaned_df
    .withColumn("publicDate", to_date(col("publicDate"), "dd/MM/yyyy"))
    .withColumn("Year", year(col("publicDate")))
    .withColumn("Quarter", quarter(col("publicDate")))
)

# Aggregations computed for every (Class, Year, Quarter) group.
total_score = spark_round(spark_sum("Score"), 4).alias("Total Score")
paper_count = count("id").alias("Paper Count")  # number of papers in the group

grouped_df = cleaned_df.groupBy("Class", "Year", "Quarter").agg(total_score, paper_count)

# Preview the aggregated table.
grouped_df.show()

+-----+----+-------+-----------+-----------+
|Class|Year|Quarter|Total Score|Paper Count|
+-----+----+-------+-----------+-----------+
| IMMU|2019|      3|     2.3568|          1|
| NEUR|2020|      2|     1.8807|          1|
| MEDI|2019|      4|   270.1514|        122|
| PHYS|2018|      4|    23.7444|         10|
| MEDI|2023|      3|      1.208|          2|
| PHYS|2023|      1|     7.0614|          1|
| CHEM|2021|      4|    30.0819|          9|
| MEDI|2021|      2|      4.196|          3|
| COMP|2022|      4|    32.7025|         10|
| MEDI|2023|      4|   640.2262|        117|
| PHYS|2022|      4|    34.0808|         10|
| NEUR|2021|      4|    32.2437|         10|
| COMP|2020|      4|    22.6319|          8|
| IMMU|2022|      3|     7.7949|          1|
| BIOC|2019|      4|     37.203|         18|
| MEDI|2023|      2|    11.4988|          8|
| IMMU|2023|      4|    48.8768|         10|
| BIOC|2023|      4|    87.5925|         17|
| NEUR|2023|      2|      1.808|          1|
| IMMU|201

In [13]:
# Collapse to a single partition first so the export is written as one part
# file rather than one file per partition.
single_partition_df = grouped_df.coalesce(1)

# Overwrite any previous export at this path and include a header row.
single_partition_df.write.csv(path="./nauture.csv", mode="overwrite", header=True)