# Install libraries

In [21]:
%pip install kafka-python

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
def jprint(data, indent=4):
    """Pretty-print a JSON-serializable object to stdout.

    Args:
        data: Any object accepted by ``json.dumps``.
        indent: Number of spaces per nesting level (defaults to 4).

    Raises:
        TypeError: If ``data`` contains values ``json.dumps`` cannot serialize.
    """
    # Imported locally so the helper works even when no earlier cell
    # has imported ``json`` (e.g. after a kernel restart).
    import json

    print(json.dumps(data, indent=indent))

# Spark for processing

In [145]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, regexp_replace

# Initialize (or reuse) the Spark session for this notebook
spark = SparkSession.builder \
    .appName("DSDE2024") \
    .getOrCreate()

# Load the CSV file. No schema inference is requested here, which is why
# the numeric columns are cast explicitly in a later cell.
df = spark.read.option("header", "true").csv('./2022data.csv')

# "Class" appears to hold a stringified list (judging by the characters the
# regex strips): remove [ ], and ' first, then split on commas into an array.
df = df.withColumn("Class", regexp_replace(col("Class"), "[\\[\\]'']", ""))
# Raw string literal: "\s" inside a normal Python string is an invalid escape
# sequence and makes the interpreter emit a warning when the cell is compiled.
# The regex passed to Spark is unchanged (a comma followed by optional whitespace).
df = df.withColumn("Class", split(col("Class"), r",\s*"))

# Show the transformed DataFrame to verify
df.show(truncate=False)

+---------+----------+------+------------+-------------+--------+------------------------+
|id       |publicDate|source|coAuthorship|citationCount|refCount|Class                   |
+---------+----------+------+------------+-------------+--------+------------------------+
|202200000|31/12/2022|1     |3           |2            |44      |[BIOC, CENG, ENVI, ENGI]|
|202200001|31/12/2022|1     |7           |5            |39      |[BIOC]                  |
|202200002|31/12/2022|1     |6           |7            |51      |[BIOC]                  |
|202200003|31/12/2022|1     |5           |14           |50      |[CHEM, CENG, ENGI]      |
|202200004|30/12/2022|1     |6           |2            |65      |[PHYS, MATE]            |
|202200005|30/12/2022|1     |6           |2            |55      |[ENER, PHYS]            |
|202200006|30/12/2022|1     |6           |17           |141     |[ENER, PHYS]            |
|202200007|30/12/2022|1     |4           |18           |172     |[ENER, PHYS]            |

  df = df.withColumn("Class", split(col("Class"), ",\s*"))  # Split into array


In [146]:
# Keep a single row per distinct `id` value.
df = df.dropDuplicates(["id"])

In [147]:
# Number of rows currently in `df` (displayed as the cell output).
df.count()

3452

In [148]:
# Convert the count-like columns to integers, one column at a time.
for numeric_name in ("citationCount", "coAuthorship", "refCount"):
    df = df.withColumn(numeric_name, df[numeric_name].cast("int"))

In [149]:
from pyspark.sql.functions import avg, min, max, countDistinct, explode, split, col, round, sum

# Per-feature maxima, later used as normalization denominators.
# NOTE: `max` here is pyspark.sql.functions.max (imported in this cell),
# not the Python builtin.
feature_maxima = [
    max("citationCount").alias("maxCitation"),
    max("refCount").alias("maxRef"),
    max("coAuthorship").alias("maxCoAuthor"),
]
max_values = df.agg(*feature_maxima).collect()[0]

# Pull the three maxima out of the single aggregated row by alias.
max_citation, max_ref, max_coauthor = (
    max_values[alias_name]
    for alias_name in ("maxCitation", "maxRef", "maxCoAuthor")
)

In [150]:
# Enumerate every distinct class label and how many rows carry it.
# Each element of the "Class" array becomes its own row under "Genre".
rows_per_label = df.withColumn("Genre", explode(col("Class")))
genre_counts = rows_per_label.groupBy("Genre").count()

print('number of all the class:', genre_counts.count())
genre_counts.show()

number of all the class: 27
+-----+-----+
|Genre|count|
+-----+-----+
| COMP|  201|
| MATE|  284|
| IMMU|  231|
| ARTS|   70|
| PHYS|  295|
| HEAL|   49|
| PSYC|   44|
| BIOC|  435|
| NEUR|   82|
| VETE|  132|
| ENGI|  394|
| PHAR|  189|
| MEDI| 1059|
| ECON|   52|
| MATH|   83|
| MULT|  302|
| ENVI|  332|
| DECI|   18|
| AGRI|  359|
| ENER|  213|
+-----+-----+
only showing top 20 rows



In [151]:
# Explode the "Class" array so each (row, class label) pair becomes its own row;
# the exploded value replaces the original "Class" column.
exploded_df = df.withColumn("Class", explode(col("Class")))
# exploded_df.show(20)

In [152]:
# Discard every row that has a null in any column, then report
# the remaining row count and preview the first few rows.
cleaned_df = exploded_df.na.drop()
print(cleaned_df.count())
cleaned_df.show(5)

5867
+---------+----------+------+------------+-------------+--------+-----+
|       id|publicDate|source|coAuthorship|citationCount|refCount|Class|
+---------+----------+------+------------+-------------+--------+-----+
|202200000|31/12/2022|     1|           3|            2|      44| BIOC|
|202200000|31/12/2022|     1|           3|            2|      44| CENG|
|202200000|31/12/2022|     1|           3|            2|      44| ENVI|
|202200000|31/12/2022|     1|           3|            2|      44| ENGI|
|202200001|31/12/2022|     1|           7|            5|      39| BIOC|
+---------+----------+------+------------+-------------+--------+-----+
only showing top 5 rows



In [153]:
# Compute the score for each row of cleaned_df.
# Every feature is divided by its dataset-wide maximum and multiplied by SCALE,
# the weighted features are summed, and the sum is multiplied by `source`.
CITATION_WEIGHT = 0.4
REF_WEIGHT = 0.2
COAUTHOR_WEIGHT = 0.1
SCALE = 10

# `source` is not among the columns cast to int earlier, so cast it explicitly
# here instead of relying on Spark's implicit string-to-number coercion.
source_factor = col("source").cast("double")

weighted_features = (
    CITATION_WEIGHT * (col("citationCount") / max_citation * SCALE) +
    REF_WEIGHT * (col("refCount") / max_ref * SCALE) +
    COAUTHOR_WEIGHT * (col("coAuthorship") / max_coauthor * SCALE)
)

# `round` is pyspark.sql.functions.round (imported in an earlier cell),
# not the Python builtin; the score is rounded to 4 decimal places.
cleaned_df = cleaned_df.withColumn(
    "Score",
    round(source_factor * weighted_features, 4)
)

cleaned_df.show(10)

+---------+----------+------+------------+-------------+--------+-----+------+
|       id|publicDate|source|coAuthorship|citationCount|refCount|Class| Score|
+---------+----------+------+------------+-------------+--------+-----+------+
|202200000|31/12/2022|     1|           3|            2|      44| BIOC|0.1713|
|202200000|31/12/2022|     1|           3|            2|      44| CENG|0.1713|
|202200000|31/12/2022|     1|           3|            2|      44| ENVI|0.1713|
|202200000|31/12/2022|     1|           3|            2|      44| ENGI|0.1713|
|202200001|31/12/2022|     1|           7|            5|      39| BIOC|0.1881|
|202200002|31/12/2022|     1|           6|            7|      51| BIOC|0.2497|
|202200003|31/12/2022|     1|           5|           14|      50| CHEM|0.3207|
|202200003|31/12/2022|     1|           5|           14|      50| CENG|0.3207|
|202200003|31/12/2022|     1|           5|           14|      50| ENGI|0.3207|
|202200004|30/12/2022|     1|           6|          

In [154]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, year, quarter, col, sum as spark_sum, round as spark_round, count
from pyspark.sql.window import Window

# Relies on the SparkSession and on cleaned_df from the earlier cells.
# Parse publicDate (day/month/year text) into a date, then derive the
# calendar year and quarter from the parsed value.
parsed_public_date = to_date(col("publicDate"), "dd/MM/yyyy")
cleaned_df = (
    cleaned_df
    .withColumn("publicDate", parsed_public_date)
    .withColumn("Year", year(col("publicDate")))
    .withColumn("Quarter", quarter(col("publicDate")))
)

# One output row per (Class, Year, Quarter): summed score rounded to
# 4 decimals, plus the number of ids counted in the group.
group_keys = ["Class", "Year", "Quarter"]
group_aggregates = [
    spark_round(spark_sum("Score"), 4).alias("Total Score"),
    count("id").alias("Paper Count"),
]
grouped_df = cleaned_df.groupBy(*group_keys).agg(*group_aggregates)

# Display the result
grouped_df.show()

+-----+----+-------+-----------+-----------+
|Class|Year|Quarter|Total Score|Paper Count|
+-----+----+-------+-----------+-----------+
| BUSI|2022|      2|     4.3313|         17|
| BUSI|2022|      4|     3.9284|         16|
| MATH|2022|      2|     4.3977|         17|
| COMP|2022|      2|    10.5223|         43|
| EART|2022|      1|     8.9906|         23|
| ECON|2022|      4|     3.3603|         14|
| PSYC|2022|      1|     4.3177|         13|
| PHYS|2022|      2|    46.9506|         78|
| EART|2022|      4|     5.0058|         20|
| PSYC|2022|      2|     2.5913|          8|
| DENT|2022|      3|     2.4832|         16|
| PHAR|2022|      2|    13.8496|         50|
| DENT|2022|      1|     2.8371|         13|
| COMP|2022|      1|     11.558|         35|
| DECI|2022|      2|     0.1027|          1|
| ENER|2022|      1|    27.0945|         50|
| ENVI|2022|      2|    20.7589|         74|
| COMP|2022|      4|    13.0639|         62|
| EART|2022|      2|     5.1816|         19|
| PHYS|202

In [158]:
# Collapse to a single partition first so only one part file is produced,
# then write with a header row, replacing any previous output at this path.
single_partition_df = grouped_df.coalesce(1)
single_partition_df.write.mode("overwrite").option("header", True).csv("./test_output.csv")