# Install lib

In [1]:
!pip install kafka-python
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from kafka import KafkaConsumer
import json
import pandas as pd

In [3]:
data = []

In [4]:
# use this function to print the json with the indent=4
def jprint(data):
  print(json.dumps(data,indent=4))

# Kafka Consumer

In [6]:
consumer = KafkaConsumer(
    '2023',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    value_deserializer=lambda x: json.loads(x.decode('utf-8')),
)

print("Starting the consumer...")

for message in consumer:
    data.append(message.value)
    print(f"Received: {message.value}")

print("get all data from 2023, done")

Starting the consumer...
Received: {'id': '202300198', 'publicDate': '01/11/2023', 'source': 1, 'coAuthorship': 4, 'citationCount': '8', 'refCount': '53', 'Class': ['ENGI']}
Received: {'id': '202300153', 'publicDate': '01/12/2023', 'source': 1, 'coAuthorship': 6, 'citationCount': '1', 'refCount': '67', 'Class': ['PHAR']}
Received: {'id': '202300154', 'publicDate': '01/12/2023', 'source': 1, 'coAuthorship': 9, 'citationCount': '2', 'refCount': '27', 'Class': ['MULT']}
Received: {'id': '202300196', 'publicDate': '05/11/2023', 'source': 1, 'coAuthorship': 2, 'citationCount': '1', 'refCount': '50', 'Class': ['ENER', 'ENGI', 'CENG']}
Received: {'id': '202300162', 'publicDate': '01/12/2023', 'source': 1, 'coAuthorship': 8, 'citationCount': '2', 'refCount': '44', 'Class': ['MULT']}
Received: {'id': '202300165', 'publicDate': '01/12/2023', 'source': 1, 'coAuthorship': 5, 'citationCount': '1', 'refCount': '41', 'Class': ['MULT']}
Received: {'id': '202300191', 'publicDate': '01/12/2023', 'source

KeyboardInterrupt: 

In [7]:
jprint(data)

[
    {
        "id": "202300198",
        "publicDate": "01/11/2023",
        "source": 1,
        "coAuthorship": 4,
        "citationCount": "8",
        "refCount": "53",
        "Class": [
            "ENGI"
        ]
    },
    {
        "id": "202300153",
        "publicDate": "01/12/2023",
        "source": 1,
        "coAuthorship": 6,
        "citationCount": "1",
        "refCount": "67",
        "Class": [
            "PHAR"
        ]
    },
    {
        "id": "202300154",
        "publicDate": "01/12/2023",
        "source": 1,
        "coAuthorship": 9,
        "citationCount": "2",
        "refCount": "27",
        "Class": [
            "MULT"
        ]
    },
    {
        "id": "202300196",
        "publicDate": "05/11/2023",
        "source": 1,
        "coAuthorship": 2,
        "citationCount": "1",
        "refCount": "50",
        "Class": [
            "ENER",
            "ENGI",
            "CENG"
        ]
    },
    {
        "id": "202300162",
        "publ

# Spark for processing

In [8]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data)
df = df.select("id", "publicDate", "source", "citationCount", "coAuthorship", "refCount", "Class") # reordering
df.show(10)

24/05/07 21:23:15 WARN Utils: Your hostname, Pirayans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.57 instead (on interface en0)
24/05/07 21:23:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/07 21:23:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/07 21:23:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

+---------+----------+------+-------------+------------+--------+------------------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|             Class|
+---------+----------+------+-------------+------------+--------+------------------+
|202300198|01/11/2023|     1|            8|           4|      53|            [ENGI]|
|202300153|01/12/2023|     1|            1|           6|      67|            [PHAR]|
|202300154|01/12/2023|     1|            2|           9|      27|            [MULT]|
|202300196|05/11/2023|     1|            1|           2|      50|[ENER, ENGI, CENG]|
|202300162|01/12/2023|     1|            2|           8|      44|            [MULT]|
|202300165|01/12/2023|     1|            1|           5|      41|            [MULT]|
|202300191|01/12/2023|     1|            2|           6|      53|      [CENG, CHEM]|
|202300131|01/12/2023|     1|            3|           5|      46|            [MULT]|
|202300136|01/12/2023|     1|            2|           7|      26|

24/05/07 21:23:26 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [9]:
df.count()

301

In [10]:
# cast type accordingly
df = df.withColumn('citationCount', df.citationCount.cast('int'))
df = df.withColumn('coAuthorship', df.coAuthorship.cast('int'))
df = df.withColumn('refCount', df.refCount.cast('int'))

In [11]:
from pyspark.sql.functions import avg, min, max, countDistinct, explode, split, col, round, sum

max_values = df.agg(
    max("citationCount").alias("maxCitation"),
    max("refCount").alias("maxRef"),
    max("coAuthorship").alias("maxCoAuthor")
).collect()[0]

# max value for each feature for normalization
max_citation = max_values["maxCitation"]
max_ref = max_values["maxRef"]
max_coauthor = max_values["maxCoAuthor"]

In [12]:
# find all Class
genre_counts = df.withColumn("Genre", explode(col("Class")))\
                 .groupBy("Genre")\
                 .count()  

print('number of all the class:', genre_counts.count())
genre_counts.show()

number of all the class: 27
+-----+-----+
|Genre|count|
+-----+-----+
| MATE|   24|
| HEAL|    7|
| BIOC|   25|
| ENGI|   27|
| PHAR|    8|
| MEDI|   59|
| ECON|    3|
| MULT|   93|
| MATH|    4|
| ENVI|   26|
| DECI|    3|
| AGRI|   26|
| ENER|   15|
| CENG|   20|
| CHEM|   33|
| DENT|   20|
| BUSI|    6|
| IMMU|    7|
| ARTS|    1|
| PHYS|   22|
+-----+-----+
only showing top 20 rows



In [13]:
# explode the class
exploded_df = df.withColumn("Class", explode(col("Class")))
# exploded_df.show(20)

In [14]:
# drop na for invalid rows
cleaned_df = exploded_df.dropna()
print(cleaned_df.count())
cleaned_df.show(5)

446
+---------+----------+------+-------------+------------+--------+-----+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class|
+---------+----------+------+-------------+------------+--------+-----+
|202300198|01/11/2023|     1|            8|           4|      53| ENGI|
|202300153|01/12/2023|     1|            1|           6|      67| PHAR|
|202300154|01/12/2023|     1|            2|           9|      27| MULT|
|202300196|05/11/2023|     1|            1|           2|      50| ENER|
|202300196|05/11/2023|     1|            1|           2|      50| ENGI|
+---------+----------+------+-------------+------------+--------+-----+
only showing top 5 rows



In [15]:
# compute the score for each paper
cleaned_df = cleaned_df.withColumn(
    "Score",
    round(
        col("source") * (
            0.4 * (col("citationCount") / max_citation * 10) +
            0.2 * (col("refCount") / max_ref * 10) +
            0.1 * (col("coAuthorship") / max_coauthor * 10)
        ), 4
    )
)

cleaned_df.show(10)

+---------+----------+------+-------------+------------+--------+-----+------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class| Score|
+---------+----------+------+-------------+------------+--------+-----+------+
|202300198|01/11/2023|     1|            8|           4|      53| ENGI| 1.686|
|202300153|01/12/2023|     1|            1|           6|      67| PHAR|0.6129|
|202300154|01/12/2023|     1|            2|           9|      27| MULT| 0.516|
|202300196|05/11/2023|     1|            1|           2|      50| ENER|0.4986|
|202300196|05/11/2023|     1|            1|           2|      50| ENGI|0.4986|
|202300196|05/11/2023|     1|            1|           2|      50| CENG|0.4986|
|202300162|01/12/2023|     1|            2|           8|      44| MULT|0.6281|
|202300165|01/12/2023|     1|            1|           5|      41| MULT|0.4403|
|202300191|01/12/2023|     1|            2|           6|      53| CENG|0.6869|
|202300191|01/12/2023|     1|            2|         

In [16]:
from pyspark.sql.functions import to_date, year, quarter, col, sum, round, count
from pyspark.sql.window import Window

cleaned_df = cleaned_df.withColumn("publicDate", to_date(col("publicDate"), "dd/MM/yyyy"))
cleaned_df = cleaned_df.withColumn("Year", year(col("publicDate")))

grouped_df = cleaned_df.groupBy("Class", "Year").agg(
    round(sum("Score"), 4).alias("Total Score"),
    count("id").alias("Paper Count")  
)

# Display the result
grouped_df.show()

+-----+----+-------+-----------+-----------+----------------------+
|Class|Year|Quarter|Total Score|Paper Count|Cumulative Total Score|
+-----+----+-------+-----------+-----------+----------------------+
| AGRI|2023|      3|     2.1653|          3|                2.1653|
| AGRI|2023|      4|    17.7913|         23|    19.956599999999998|
| ARTS|2023|      3|     0.9772|          1|                0.9772|
| BIOC|2023|      3|      0.807|          3|                 0.807|
| BIOC|2023|      4|    22.2485|         22|               23.0555|
| BUSI|2023|      3|     0.8867|          1|                0.8867|
| BUSI|2023|      4|     3.1622|          5|                4.0489|
| CENG|2023|      4|    17.8755|         20|               17.8755|
| CHEM|2023|      3|     0.5939|          1|                0.5939|
| CHEM|2023|      4|    20.2843|         31|    20.878200000000003|
| COMP|2023|      4|     6.2661|          8|                6.2661|
| DECI|2023|      4|     1.7423|          3|    

In [17]:
# Save the DataFrame to a CSV file
cumulative_df.write.csv(path="output_2023.csv", mode="overwrite", header=True)

spark.stop()