# Install lib

In [1]:
!pip install kafka-python
%pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [2]:
from kafka import KafkaConsumer
import json
import pandas as pd

In [3]:
data = []

In [4]:
# use this function to print the json with the indent=4
def jprint(data):
  print(json.dumps(data,indent=4))

# Kafka Consumer

In [5]:
consumer = KafkaConsumer(
    '2018',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    value_deserializer=lambda x: json.loads(x.decode('utf-8')),
)

print("Starting the consumer...")

for message in consumer:
    data.append(message.value)
    print(f"Received: {message.value}")

print("get all data from 2018, done")

Starting the consumer...
Received: {'id': '201800282', 'source': 1, 'coAuthorship': 3, 'citationCount': '29', 'refCount': '89', 'Class': ['ENER', 'PHYS']}
Received: {'id': '201800276', 'source': 1, 'coAuthorship': 5, 'citationCount': '11', 'refCount': '18', 'Class': ['CHEM', 'BIOC', 'AGRI']}
Received: {'id': '201800044', 'source': 1, 'coAuthorship': 5, 'citationCount': '76', 'refCount': '60', 'Class': ['ENVI', 'CENG', 'MATE', 'ENER']}
Received: {'id': '201800249', 'source': 1, 'coAuthorship': 4, 'citationCount': '9', 'refCount': '39', 'Class': ['EART', 'AGRI']}
Received: {'id': '201800043', 'source': 1, 'coAuthorship': 2303, 'citationCount': '59', 'refCount': '101', 'Class': ['PHYS']}
Received: {'id': '201800271', 'source': 1, 'coAuthorship': 2, 'citationCount': '53', 'refCount': '21', 'Class': ['COMP']}
Received: {'id': '201800285', 'source': 1, 'coAuthorship': 1, 'citationCount': '5', 'refCount': '16', 'Class': ['AGRI', 'ENVI']}
Received: {'id': '201800088', 'source': 1, 'coAuthorshi

KeyboardInterrupt: 

In [6]:
jprint(data)

[
    {
        "id": "201800282",
        "source": 1,
        "coAuthorship": 3,
        "citationCount": "29",
        "refCount": "89",
        "Class": [
            "ENER",
            "PHYS"
        ]
    },
    {
        "id": "201800276",
        "source": 1,
        "coAuthorship": 5,
        "citationCount": "11",
        "refCount": "18",
        "Class": [
            "CHEM",
            "BIOC",
            "AGRI"
        ]
    },
    {
        "id": "201800044",
        "source": 1,
        "coAuthorship": 5,
        "citationCount": "76",
        "refCount": "60",
        "Class": [
            "ENVI",
            "CENG",
            "MATE",
            "ENER"
        ]
    },
    {
        "id": "201800249",
        "source": 1,
        "coAuthorship": 4,
        "citationCount": "9",
        "refCount": "39",
        "Class": [
            "EART",
            "AGRI"
        ]
    },
    {
        "id": "201800043",
        "source": 1,
        "coAuthorship": 2303,
   

# Spark for processing

In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(data)
df = df.select("id", "publicDate", "source", "citationCount", "coAuthorship", "refCount", "Class") # reordering
df.show(10)

24/05/07 22:52:44 WARN Utils: Your hostname, Pirayans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.57 instead (on interface en0)
24/05/07 22:52:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/07 22:52:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+---------+----------+------+-------------+------------+--------+--------------------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|               Class|
+---------+----------+------+-------------+------------+--------+--------------------+
|201800282|      NULL|     1|           29|           3|      89|        [ENER, PHYS]|
|201800276|      NULL|     1|           11|           5|      18|  [CHEM, BIOC, AGRI]|
|201800044|      NULL|     1|           76|           5|      60|[ENVI, CENG, MATE...|
|201800249|      NULL|     1|            9|           4|      39|        [EART, AGRI]|
|201800043|      NULL|     1|           59|        2303|     101|              [PHYS]|
|201800271|      NULL|     1|           53|           2|      21|              [COMP]|
|201800285|      NULL|     1|            5|           1|      16|        [AGRI, ENVI]|
|201800088|      NULL|     1|           23|          32|     107|        [PHYS, EART]|
|201800075|      NULL|     1|         NULL|

In [8]:
df.count()

800

24/05/07 22:52:59 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [9]:
# cast type accordingly
df = df.withColumn('citationCount', df.citationCount.cast('int'))
df = df.withColumn('coAuthorship', df.coAuthorship.cast('int'))
df = df.withColumn('refCount', df.refCount.cast('int'))

In [10]:
from pyspark.sql.functions import avg, min, max, countDistinct, explode, split, col, round, sum

max_values = df.agg(
    max("citationCount").alias("maxCitation"),
    max("refCount").alias("maxRef"),
    max("coAuthorship").alias("maxCoAuthor")
).collect()[0]

# max value for each feature for normalization
max_citation = max_values["maxCitation"]
max_ref = max_values["maxRef"]
max_coauthor = max_values["maxCoAuthor"]

In [11]:
# find all Class
genre_counts = df.withColumn("Genre", explode(col("Class")))\
                 .groupBy("Genre")\
                 .count()  

print('number of all the class:', genre_counts.count())
genre_counts.show()

number of all the class: 26
+-----+-----+
|Genre|count|
+-----+-----+
| COMP|   86|
| MATE|   70|
| IMMU|   39|
| PHYS|  164|
| BIOC|   93|
| NEUR|   17|
| VETE|   49|
| ENGI|  104|
| MEDI|  161|
| ECON|   11|
| MULT|   90|
| MATH|   18|
| ENVI|   60|
| DECI|   18|
| AGRI|   59|
| ENER|   48|
| NURS|   10|
| CENG|   40|
| SOCI|   24|
| EART|   34|
+-----+-----+
only showing top 20 rows



In [12]:
# explode the class
exploded_df = df.withColumn("Class", explode(col("Class")))
# exploded_df.show(20)

In [13]:
# drop na for invalid rows
cleaned_df = exploded_df.dropna()
print(cleaned_df.count())
cleaned_df.show(5)

1096
+---------+----------+------+-------------+------------+--------+-----+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class|
+---------+----------+------+-------------+------------+--------+-----+
|201800282|15/11/2018|     1|           29|           3|      89| ENER|
|201800282|15/11/2018|     1|           29|           3|      89| PHYS|
|201800276|17/11/2018|     1|           11|           5|      18| CHEM|
|201800276|17/11/2018|     1|           11|           5|      18| BIOC|
|201800276|17/11/2018|     1|           11|           5|      18| AGRI|
+---------+----------+------+-------------+------------+--------+-----+
only showing top 5 rows



In [14]:
# compute the score for each paper
cleaned_df = cleaned_df.withColumn(
    "Score",
    round(
        col("source") * (
            0.4 * (col("citationCount") / max_citation * 10) +
            0.2 * (col("refCount") / max_ref * 10) +
            0.1 * (col("coAuthorship") / max_coauthor * 10)
        ), 4
    )
)

cleaned_df.show(10)

+---------+----------+------+-------------+------------+--------+-----+------+
|       id|publicDate|source|citationCount|coAuthorship|refCount|Class| Score|
+---------+----------+------+-------------+------------+--------+-----+------+
|201800282|15/11/2018|     1|           29|           3|      89| ENER|1.2043|
|201800282|15/11/2018|     1|           29|           3|      89| PHYS|1.2043|
|201800276|17/11/2018|     1|           11|           5|      18| CHEM|0.3692|
|201800276|17/11/2018|     1|           11|           5|      18| BIOC|0.3692|
|201800276|17/11/2018|     1|           11|           5|      18| AGRI|0.3692|
|201800044|11/12/2018|     1|           76|           5|      60| ENVI|2.1734|
|201800044|11/12/2018|     1|           76|           5|      60| CENG|2.1734|
|201800044|11/12/2018|     1|           76|           5|      60| MATE|2.1734|
|201800044|11/12/2018|     1|           76|           5|      60| ENER|2.1734|
|201800249|01/12/2018|     1|            9|         

In [15]:
from pyspark.sql.functions import to_date, year, quarter, col, sum, round, count
from pyspark.sql.window import Window

cleaned_df = cleaned_df.withColumn("publicDate", to_date(col("publicDate"), "dd/MM/yyyy"))
cleaned_df = cleaned_df.withColumn("Year", year(col("publicDate")))

grouped_df = cleaned_df.groupBy("Class", "Year").agg(
    round(sum("Score"), 4).alias("Total Score"),
    count("id").alias("Paper Count")  
)

# Display the result
grouped_df.show()

+-----+----+-----------+-----------+
|Class|Year|Total Score|Paper Count|
+-----+----+-----------+-----------+
| ENGI|2018|    47.0288|         91|
| BIOC|2018|    57.6577|         81|
| ECON|2018|     2.7482|          8|
| MEDI|2018|    78.5292|        133|
| MULT|2018|    61.2393|         84|
| CENG|2018|     43.745|         37|
| SOCI|2018|    14.0144|         20|
| AGRI|2018|    31.3078|         53|
| NURS|2018|     5.0844|          9|
| CHEM|2018|    56.0725|         66|
| MATE|2018|    53.9218|         56|
| ENVI|2018|    54.3482|         53|
| COMP|2018|    34.6748|         72|
| NEUR|2018|    26.9109|         15|
| PHYS|2018|   119.5089|        137|
| IMMU|2018|    16.9801|         29|
| DECI|2018|     4.3515|         12|
| BUSI|2018|    11.9487|         17|
| EART|2018|    40.5586|         32|
| VETE|2018|     4.4398|         10|
+-----+----+-----------+-----------+
only showing top 20 rows



In [17]:
# Save the DataFrame to a CSV file
grouped_df.write.csv(path="output_2018.csv", mode="overwrite", header=True)

spark.stop()