<a href="https://colab.research.google.com/github/matthewpecsok/data_engineering/blob/main/tutorials/de_streaming_kafka_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the required kafka packages

In [2]:
output_path = '/content/drive/MyDrive/Colab Notebooks/data_engineering'

In [3]:
!pip install kafka-python

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.5/246.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.0.2


### Import packages

In [4]:
import os
from datetime import datetime
import time
import threading
import json
from kafka import KafkaProducer
from kafka.errors import KafkaError


## Download and setup Kafka and Zookeeper instances

For demo purposes, the following instances are setup locally:

- Kafka (Brokers: 127.0.0.1:9092)
- Zookeeper (Node: 127.0.0.1:2181)


In [5]:
!curl -sSOL https://downloads.apache.org/kafka/3.4.0/kafka_2.12-3.4.0.tgz
!tar -xzf kafka_2.12-3.4.0.tgz

Kafka with defaults

In [6]:
!./kafka_2.12-3.4.0/bin/zookeeper-server-start.sh -daemon ./kafka_2.12-3.4.0/config/zookeeper.properties
!./kafka_2.12-3.4.0/bin/kafka-server-start.sh -daemon ./kafka_2.12-3.4.0/config/server.properties
!echo "Give the processes 10 seconds to start before proceeding."
!sleep 10

Give the processes 10 seconds to start before proceeding.


Is Kafka running?

In [7]:
!ps -ef | grep kafka

root        1956       1 16 03:55 ?        00:00:01 java -Xmx512M -Xms512M -server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -XX:MaxInlineLevel=15 -Djava.awt.headless=true -Xlog:gc*:file=/content/kafka_2.12-3.4.0/bin/../logs/zookeeper-gc.log:time,tags:filecount=10,filesize=100M -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dkafka.logs.dir=/content/kafka_2.12-3.4.0/bin/../logs -Dlog4j.configuration=file:./kafka_2.12-3.4.0/bin/../config/log4j.properties -cp /content/kafka_2.12-3.4.0/bin/../libs/activation-1.1.1.jar:/content/kafka_2.12-3.4.0/bin/../libs/aopalliance-repackaged-2.6.1.jar:/content/kafka_2.12-3.4.0/bin/../libs/argparse4j-0.7.0.jar:/content/kafka_2.12-3.4.0/bin/../libs/audience-annotations-0.5.0.jar:/content/kafka_2.12-3.4.0/bin/../libs/commons-cli-1.4.jar:/content/kafka_2.12-3.4.0/bin/../libs/commons-lang3-3.8.1.jar:/content/kafka_2.12

Create the kafka topics with the following specs:

- sample-streaming-data: partitions=1

In [8]:
!./kafka_2.12-3.4.0/bin/kafka-topics.sh --create --bootstrap-server 127.0.0.1:9092 --replication-factor 1 --partitions 1 --topic sample-streaming-data

Created topic sample-streaming-data.


Describe the topic for details on the configuration

In [9]:
!./kafka_2.12-3.4.0/bin/kafka-topics.sh --describe --bootstrap-server 127.0.0.1:9092 --topic sample-streaming-data

Topic: sample-streaming-data	TopicId: 8Rc6azxPSCKDjGyvufXM1w	PartitionCount: 1	ReplicationFactor: 1	Configs: 
	Topic: sample-streaming-data	Partition: 0	Leader: 0	Replicas: 0	Isr: 0


## generator python script

In [10]:
%%writefile generator.py

import sys
args = sys.argv  # a list of the arguments provided (str)
print("running generator.py", args)
iterations = int(args[1])
print(f'iterations: {iterations}')

def error_callback(exc):
    raise Exception('Error while sendig data to kafka: {0}'.format(str(exc)))

def write_to_kafka(topic_name, items):
  from kafka import KafkaProducer

  count=0
  producer = KafkaProducer(bootstrap_servers=['127.0.0.1:9092'])
  for message, key in items:
    producer.send(topic_name, key=key.encode('utf-8'), value=message.encode('utf-8'), partition=0).add_errback(error_callback)
    count+=1
  producer.flush()
  print("Wrote {0} messages into topic: {1}".format(count, topic_name))

import random
from time import sleep

def generate_data(rows=2):

  for i in range(1,rows):

    index_num = random.randint(0,1000000)
    print(index_num)
    keys = list([f'{index_num}'])
    msg = list([f'hello world!{index_num}'])
    data = zip(keys , msg)

  return data

for i in range(iterations):
  write_to_kafka("sample-streaming-data", generate_data())
  sleep(random.randint(0,10))



Writing generator.py


# retrieve messages

In [11]:
def retrieve_messages(topic='sample-streaming-data',bootstrap_servers = '127.0.0.1:9092',output_path='output_path'):

  from kafka import KafkaConsumer
  from kafka.structs import TopicPartition
  import gspread

  consumer = KafkaConsumer(
      bootstrap_servers=bootstrap_servers,
      auto_offset_reset='earliest',
      group_id='retrieve_last_items_group'
      )###,enable_auto_commit=True)
  # Read the specified partition
  consumer.assign([TopicPartition(topic, 0)])



  for msg in consumer:
      # Open the file in write mode ('a')
      file_path = output_path+'steaming_data.csv'  # Replace with the desired file path
      file = open(file_path, 'a')

      print(f"the msg: {msg.value.decode('utf-8')}")
        # Write content to the file
      file.write(f"{msg.value.decode('utf-8')}\n")
      # Close the file
      file.close()

# write some data

In [19]:
%%script bash --bg

python generator.py 3 

function for retrieve messages. 

In [20]:
retrieve_messages()

the msg: 196997


KeyboardInterrupt: ignored

In [None]:
!cat steaming_data.csv

ConsumerRecord(topic='sample-streaming-data', partition=0, offset=68, timestamp=1685984005605, timestamp_type=0, key=b'hello world!199281', value=b'199281', headers=[], checksum=None, serialized_key_size=18, serialized_value_size=6, serialized_header_size=-1)
ConsumerRecord(topic='sample-streaming-data', partition=0, offset=68, timestamp=1685984005605, timestamp_type=0, key=b'hello world!199281', value=b'199281', headers=[], checksum=None, serialized_key_size=18, serialized_value_size=6, serialized_header_size=-1)
ConsumerRecord(topic='sample-streaming-data', partition=0, offset=69, timestamp=1685984006723, timestamp_type=0, key=b'hello world!383087', value=b'383087', headers=[], checksum=None, serialized_key_size=18, serialized_value_size=6, serialized_header_size=-1)
ConsumerRecord(topic='sample-streaming-data', partition=0, offset=70, timestamp=1685984016847, timestamp_type=0, key=b'hello world!231836', value=b'231836', headers=[], checksum=None, serialized_key_size=18, serialized_v

In [None]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://0ws9l2u0epo-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(9092)"))

https://02zd6cwvt4t3-496ff2e9c6d22116-9092-colab.googleusercontent.com/


In [None]:
#%%write_file recommender.py

import numpy as np

# Sample user-item matrix
user_item_matrix = np.array([
    [4, 5, 0, 5, 0],
    [5, 0, 4, 0, 3],
    [0, 3, 0, 4, 5],
    [4, 0, 5, 0, 4],
    [0, 4, 0, 3, 0]
])

# Calculate item similarities using cosine similarity
def calculate_item_similarities(matrix):
    num_items = matrix.shape[1]
    similarities = np.zeros((num_items, num_items))
    
    for i in range(num_items):
        for j in range(num_items):
            if i != j:
                item_i = matrix[:, i]
                item_j = matrix[:, j]
                similarity = np.dot(item_i, item_j) / (np.linalg.norm(item_i) * np.linalg.norm(item_j))
                similarities[i, j] = similarity
    
    return similarities

# Generate item recommendations for a target user
def generate_item_recommendations(user_id, matrix, similarities, top_k=3):
    user_ratings = matrix[user_id]
    num_items = matrix.shape[1]
    
    item_scores = np.zeros(num_items)
    
    for i in range(num_items):
        if user_ratings[i] == 0:
            item_i_similarity = similarities[i]
            item_i_ratings = matrix[:, i]
            item_scores[i] = np.dot(item_i_similarity, item_i_ratings) / np.sum(item_i_similarity)
            print(item_scores)
    
    #item_scores = #np.argsort(item_scores)[::-1][:top_k]
    print(item_scores)
    
    return item_scores

# Calculate item similarities
item_similarities = calculate_item_similarities(user_item_matrix)

# Generate recommendations for user 2
user_id = 4
recommendations = generate_item_recommendations(user_id, user_item_matrix, item_similarities, top_k=6)

print(f"Recommendations for user {user_id}: {recommendations} with matrix {user_item_matrix[user_id]}")

[1.56287158 0.         0.         0.         0.        ]
[1.56287158 0.         0.         0.         0.        ]
[1.56287158 0.         0.         0.         3.03596272]
[1.56287158 0.         0.         0.         3.03596272]
Recommendations for user 4: [1.56287158 0.         0.         0.         3.03596272] with matrix [0 4 0 3 0]


In [None]:
np.nonzero(recommendations)

(array([0, 4]),)

In [None]:
np.take(recommendations, np.nonzero(recommendations), axis=0)

array([[1.56287158, 3.03596272]])

In [None]:
import numpy as np
arr1 = np.nonzero(recommendations)
arr2 = np.take(recommendations, np.nonzero(recommendations), axis=0)

In [None]:
arr1inds = np.take(recommendations, np.nonzero(recommendations), axis=0).argsort()
sorted_arr1 = arr1[arr1inds[::-1]]
sorted_arr2 = arr2[arr1inds[::-1]]

TypeError: ignored

In [None]:
sorted_arr1

array([ 3.97583144e+00,  3.48842058e+00,  3.01221175e+00,  2.61093858e+00,
        2.43753764e+00,  2.40396995e+00,  2.32398323e+00,  2.19034315e+00,
        2.18064940e+00,  2.16479278e+00,  2.10215695e+00,  2.09097581e+00,
        2.03091065e+00,  2.01582145e+00,  1.97145254e+00,  1.96448400e+00,
        1.88353211e+00,  1.87470075e+00,  1.84159690e+00,  1.79092806e+00,
        1.78035461e+00,  1.74380435e+00,  1.70244858e+00,  1.67513932e+00,
        1.66148327e+00,  1.64881577e+00,  1.62005823e+00,  1.55425777e+00,
        1.53557978e+00,  1.53043486e+00,  1.47431449e+00,  1.39267760e+00,
        1.27604491e+00,  1.27439093e+00,  1.23881365e+00,  1.21070937e+00,
        1.15896704e+00,  1.14355372e+00,  1.03596263e+00,  9.70231796e-01,
        9.46891209e-01,  9.27501260e-01,  9.10105989e-01,  8.60661074e-01,
        8.10277034e-01,  7.81987365e-01,  7.72246716e-01,  6.92631153e-01,
        6.30862217e-01,  6.13289002e-01,  6.11394201e-01,  6.08175330e-01,
        5.37225852e-01,  

In [15]:
from kafka import KafkaAdminClient, KafkaProducer
from kafka.admin import NewTopic

# Kafka broker details
bootstrap_servers = 'localhost:9092'

# Topic details
topic_name = 'my_topic'
num_partitions = 1
replication_factor = 1

# Create Kafka topic
admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
topic = NewTopic(name=topic_name, num_partitions=num_partitions, replication_factor=replication_factor)
admin_client.create_topics([topic])

# Produce messages to Kafka topic
producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
messages = ['Hello', 'World', 'Kafka', 'Python']

for message in messages:
    producer.send(topic_name, value=message.encode('utf-8'))

# Close Kafka producer
producer.flush()
producer.close()

In [17]:
from kafka import KafkaConsumer

# Kafka broker details
bootstrap_servers = 'localhost:9092'

# Topic details
topic_name = 'my_topic'

# Kafka consumer configuration
consumer = KafkaConsumer(
    topic_name,
    bootstrap_servers=bootstrap_servers,
    group_id='my_consumer_group2',
    auto_offset_reset='earliest',
    #enable_auto_commit=True,
    value_deserializer=lambda x: x.decode('utf-8')
)

# Consume and process messages from Kafka topic
for message in consumer:
    print("Received message:", message.value)

# Close Kafka consumer
consumer.close()




KeyboardInterrupt: ignored