# Install lib

In [1]:
!pip install kafka-python
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from kafka import KafkaConsumer
import json
import pandas as pd

In [3]:
data = []

In [4]:
# use this function to print the json with the indent=4
def jprint(data):
  print(json.dumps(data,indent=4))

# Kafka Consumer

In [6]:
consumer = KafkaConsumer(
    '2022',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    value_deserializer=lambda x: json.loads(x.decode('utf-8')),
)

print("Starting the consumer...")

for message in consumer:
    data.append(message.value)
    print(f"Received: {message.value}")

print("get all data from 2022, done")

Starting the consumer...
Received: {'id': '202200213', 'publicDate': '01/12/2022', 'source': 1, 'coAuthorship': 10, 'citationCount': '3', 'refCount': '29', 'Class': ['MEDI']}
Received: {'id': '202200021', 'publicDate': '19/12/2022', 'source': 1, 'coAuthorship': 2, 'citationCount': '1', 'refCount': '11', 'Class': ['COMP']}
Received: {'id': '202200019', 'publicDate': '21/12/2022', 'source': 1, 'coAuthorship': 4, 'citationCount': '1', 'refCount': '64', 'Class': ['EART']}
Received: {'id': '202200026', 'publicDate': '16/12/2022', 'source': 1, 'coAuthorship': 8, 'citationCount': '2', 'refCount': '62', 'Class': ['MEDI', 'IMMU']}
Received: {'id': '202200214', 'publicDate': '01/12/2022', 'source': 1, 'coAuthorship': 4, 'citationCount': '4', 'refCount': '42', 'Class': ['COMP']}
Received: {'id': '202200010', 'publicDate': '28/12/2022', 'source': 1, 'coAuthorship': 8, 'citationCount': '1', 'refCount': '167', 'Class': ['HEAL', 'MEDI']}
Received: {'id': '202200222', 'publicDate': '01/12/2022', 'sour

KeyboardInterrupt: 

In [7]:
jprint(data)

[
    {
        "id": "202200213",
        "publicDate": "01/12/2022",
        "source": 1,
        "coAuthorship": 10,
        "citationCount": "3",
        "refCount": "29",
        "Class": [
            "MEDI"
        ]
    },
    {
        "id": "202200021",
        "publicDate": "19/12/2022",
        "source": 1,
        "coAuthorship": 2,
        "citationCount": "1",
        "refCount": "11",
        "Class": [
            "COMP"
        ]
    },
    {
        "id": "202200019",
        "publicDate": "21/12/2022",
        "source": 1,
        "coAuthorship": 4,
        "citationCount": "1",
        "refCount": "64",
        "Class": [
            "EART"
        ]
    },
    {
        "id": "202200026",
        "publicDate": "16/12/2022",
        "source": 1,
        "coAuthorship": 8,
        "citationCount": "2",
        "refCount": "62",
        "Class": [
            "MEDI",
            "IMMU"
        ]
    },
    {
        "id": "202200214",
        "publicDate": "01/12/202

# Spark for processing

In [9]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data)
df = df.select("id", "publicDate", "source", "citationCount", "coAuthorship", "refCount", "Class") # reordering
df.show(10)

                                                                                

+---------+----------+------+-------------+------------+--------+------------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|       Class|
+---------+----------+------+-------------+------------+--------+------------+
|202200213|01/12/2022|     1|            3|          10|      29|      [MEDI]|
|202200021|19/12/2022|     1|            1|           2|      11|      [COMP]|
|202200019|21/12/2022|     1|            1|           4|      64|      [EART]|
|202200026|16/12/2022|     1|            2|           8|      62|[MEDI, IMMU]|
|202200214|01/12/2022|     1|            4|           4|      42|      [COMP]|
|202200010|28/12/2022|     1|            1|           8|     167|[HEAL, MEDI]|
|202200222|01/12/2022|     1|            4|          10|      76|[CENG, MATE]|
|202200028|16/12/2022|     1|            2|           9|      40|[MEDI, NEUR]|
|202200225|01/12/2022|     1|            0|           8|      31|      [MEDI]|
|202200017|22/12/2022|     1|         NULL|         

In [10]:
df.count()

301

In [11]:
# cast type accordingly
df = df.withColumn('citationCount', df.citationCount.cast('int'))
df = df.withColumn('coAuthorship', df.coAuthorship.cast('int'))
df = df.withColumn('refCount', df.refCount.cast('int'))

In [12]:
from pyspark.sql.functions import avg, min, max, countDistinct, explode, split, col, round, sum

max_values = df.agg(
    max("citationCount").alias("maxCitation"),
    max("refCount").alias("maxRef"),
    max("coAuthorship").alias("maxCoAuthor")
).collect()[0]

# max value for each feature for normalization
max_citation = max_values["maxCitation"]
max_ref = max_values["maxRef"]
max_coauthor = max_values["maxCoAuthor"]

In [13]:
# find all Class
genre_counts = df.withColumn("Genre", explode(col("Class")))\
                 .groupBy("Genre")\
                 .count()  

print('number of all the class:', genre_counts.count())
genre_counts.show()

number of all the class: 27
+-----+-----+
|Genre|count|
+-----+-----+
| COMP|   21|
| IMMU|   16|
| MATE|   23|
| HEAL|    3|
| PHYS|   25|
| BIOC|   32|
| NEUR|    4|
| VETE|   13|
| ENGI|   44|
| MEDI|   86|
| MULT|   51|
| ENVI|   23|
| AGRI|   24|
| ENER|   20|
| NURS|    5|
| CENG|   25|
| SOCI|   19|
| EART|    7|
| CHEM|   31|
| BUSI|    4|
+-----+-----+
only showing top 20 rows



24/05/07 21:12:07 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [14]:
# explode the class
exploded_df = df.withColumn("Class", explode(col("Class")))
# exploded_df.show(20)

In [15]:
# drop na for invalid rows
cleaned_df = exploded_df.dropna()
print(cleaned_df.count())
cleaned_df.show(5)

503
+---------+----------+------+-------------+------------+--------+-----+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class|
+---------+----------+------+-------------+------------+--------+-----+
|202200213|01/12/2022|     1|            3|          10|      29| MEDI|
|202200021|19/12/2022|     1|            1|           2|      11| COMP|
|202200019|21/12/2022|     1|            1|           4|      64| EART|
|202200026|16/12/2022|     1|            2|           8|      62| MEDI|
|202200026|16/12/2022|     1|            2|           8|      62| IMMU|
+---------+----------+------+-------------+------------+--------+-----+
only showing top 5 rows



In [16]:
# compute the score for each paper
cleaned_df = cleaned_df.withColumn(
    "Score",
    round(
        col("source") * (
            0.4 * (col("citationCount") / max_citation * 10) +
            0.2 * (col("refCount") / max_ref * 10) +
            0.1 * (col("coAuthorship") / max_coauthor * 10)
        ), 4
    )
)

cleaned_df.show(10)

+---------+----------+------+-------------+------------+--------+-----+------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class| Score|
+---------+----------+------+-------------+------------+--------+-----+------+
|202200213|01/12/2022|     1|            3|          10|      29| MEDI|0.3335|
|202200021|19/12/2022|     1|            1|           2|      11| COMP|0.1191|
|202200019|21/12/2022|     1|            1|           4|      64| EART|0.4575|
|202200026|16/12/2022|     1|            2|           8|      62| MEDI|0.4947|
|202200026|16/12/2022|     1|            2|           8|      62| IMMU|0.4947|
|202200214|01/12/2022|     1|            4|           4|      42| COMP| 0.462|
|202200010|28/12/2022|     1|            1|           8|     167| HEAL|1.1153|
|202200010|28/12/2022|     1|            1|           8|     167| MEDI|1.1153|
|202200222|01/12/2022|     1|            4|          10|      76| CENG|0.6811|
|202200222|01/12/2022|     1|            4|         

In [17]:
from pyspark.sql.functions import to_date, year, quarter, col, sum, round, count
from pyspark.sql.window import Window

cleaned_df = cleaned_df.withColumn("publicDate", to_date(col("publicDate"), "dd/MM/yyyy"))
cleaned_df = cleaned_df.withColumn("Year", year(col("publicDate")))

grouped_df = cleaned_df.groupBy("Class", "Year").agg(
    round(sum("Score"), 4).alias("Total Score"),
    count("id").alias("Paper Count")  
)

# Display the result
grouped_df.show()

+-----+----+-------+-----------+-----------+----------------------+
|Class|Year|Quarter|Total Score|Paper Count|Cumulative Total Score|
+-----+----+-------+-----------+-----------+----------------------+
| AGRI|2022|      4|    12.5998|         24|               12.5998|
| ARTS|2022|      4|     1.9287|          4|                1.9287|
| BIOC|2022|      4|    16.4471|         31|               16.4471|
| BUSI|2022|      4|     1.8913|          4|                1.8913|
| CENG|2022|      4|    17.6069|         25|               17.6069|
| CHEM|2022|      4|    18.0085|         31|               18.0085|
| COMP|2022|      4|     8.8116|         20|                8.8116|
| DECI|2022|      4|     2.2856|          4|                2.2856|
| DENT|2022|      4|     0.3637|          1|                0.3637|
| EART|2022|      4|     3.1225|          7|                3.1225|
| ECON|2022|      4|     0.6407|          2|                0.6407|
| ENER|2022|      4|    12.6482|         20|    

In [18]:
# Save the DataFrame to a CSV file
cumulative_df.write.csv(path="output_2022.csv", mode="overwrite", header=True)

spark.stop()