In [None]:
!pip install pyspark

In [1]:
import csv
import logging
from json import dumps
from random import uniform
from time import sleep

from kafka import KafkaProducer

# Root logger setup: INFO and above, rendered as " LEVEL timestamp: message".
LOG_FORMAT = " %(levelname)s %(asctime)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the cells of this notebook.
log = logging.getLogger(__name__)

In [None]:
# Request the /v3/clusters path over plain HTTP on localhost:9092.
# NOTE(review): localhost:9092 is the same address the producer cell passes as
# the Kafka bootstrap server; an HTTP request against that port may not get a
# meaningful reply — confirm that this is the intended host/port for this check.
!curl http://localhost:9092/v3/clusters

In [None]:
!curl https://raw.githubusercontent.com/f0xtek/covidcab/master/yellow_tripdata_2020-04.csv -o yellow_tripdata_2020-04.csv

In [8]:
from tqdm import tqdm

# (message key, converter) for every CSV column, in file order.
_TRIP_SCHEMA = (
    ('vendor_id', int),
    ('tpep_pickup_datetime', str),
    ('tpep_dropoff_datetime', str),
    ('passenger_count', int),
    ('trip_distance', float),
    ('ratecode_id', int),
    ('store_and_fwd_flag', str),
    ('pulocation_id', int),
    ('dolocation_id', int),
    ('payment_type', int),
    ('fare_amount', float),
    ('extra', float),
    ('mta_tax', float),
    ('tip_amount', float),
    ('tolls_amount', float),
    ('improvement_surcharge', float),
    ('total_amount', float),
    ('congestion_surcharge', float),
)


def _parse_trip_row(fields):
    """Convert one CSV row (list of strings) into the message dict.

    String columns are passed through unchanged; empty numeric columns become
    ``None`` instead of raising.

    Raises:
        ValueError: if the row has too few columns or a numeric column holds
            a non-numeric value.
    """
    if len(fields) < len(_TRIP_SCHEMA):
        raise ValueError(
            "expected {} columns, got {}".format(len(_TRIP_SCHEMA), len(fields))
        )

    record = {}
    for (name, convert), raw in zip(_TRIP_SCHEMA, fields):
        if convert is str:
            record[name] = raw
        else:
            raw = raw.strip()
            record[name] = convert(raw) if raw else None
    return record


def produce(csv_file: str, bootstrap_servers: str, topic: str,
            min_delay: float = 0.01, max_delay: float = 0.1):
    """Stream the rows of a taxi-trip CSV file to a Kafka topic as JSON messages.

    Each data row is converted to a dict (see ``_TRIP_SCHEMA``), serialized as
    UTF-8 JSON and sent to ``topic``. A random pause between ``min_delay`` and
    ``max_delay`` seconds is inserted before every message to emulate a live
    feed. Blank lines are ignored; rows that cannot be parsed are logged and
    skipped. All buffered messages are flushed and the producer is closed
    before the function returns, including when an error interrupts the run.

    Args:
        csv_file: path to the CSV file; the first line is treated as a header.
        bootstrap_servers: Kafka bootstrap server address, e.g. "host:port".
        topic: name of the topic to send to.
        min_delay: lower bound of the pause before each message, in seconds.
        max_delay: upper bound of the pause before each message, in seconds.
    """
    # Count the data rows first so the progress bar matches the actual file
    # (and a missing file fails before a broker connection is opened).
    with open(csv_file, 'r', newline='') as data_file:
        non_blank = sum(1 for raw_line in data_file if raw_line.strip())
    total_rows = max(non_blank - 1, 0)  # minus the header line

    producer = KafkaProducer(
        security_protocol="PLAINTEXT",
        bootstrap_servers=[bootstrap_servers],
        value_serializer=lambda x: dumps(x).encode('utf-8'),
        acks="all",
        retries=3
    )
    pbar = tqdm(total=total_rows)

    sent = 0
    skipped = 0
    try:
        with open(csv_file, 'r', newline='') as data_file:
            reader = csv.reader(data_file)

            # The first line is the header; it is logged but not sent.
            header = next(reader, None)
            if header is None:
                log.warning("File %s is empty, nothing to send", csv_file)
                return
            log.info('Header is [%s]', ','.join(header))

            for fields in reader:
                # A blank line is not the end of the file: keep reading.
                if not any(field.strip() for field in fields):
                    continue

                sleep(uniform(min_delay, max_delay))  # emulate the interval between events

                try:
                    data = _parse_trip_row(fields)
                except ValueError as err:
                    skipped += 1
                    log.warning("Skipping line %d: %s", reader.line_num, err)
                    pbar.update(1)
                    continue

                producer.send(topic=topic, value=data)
                sent += 1
                pbar.update(1)

            log.info("File ended")
    finally:
        pbar.close()
        # send() only enqueues the message; flush so nothing buffered is lost.
        try:
            producer.flush()
        finally:
            producer.close()

    log.info("Sent %d messages, skipped %d rows", sent, skipped)

In [None]:
# Run parameters: input file, target topic and broker address.
DATA_FILE = 'yellow_tripdata_2020-04.csv'
TOPIC_NAME = 'taxi'
BOOTSTRAP_SERVERS = 'localhost:9092'

# Stream the file to the topic.
produce(
    csv_file=DATA_FILE,
    bootstrap_servers=BOOTSTRAP_SERVERS,
    topic=TOPIC_NAME,
)

 INFO 2023-12-05 19:29:59,429: <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: connecting to localhost:9092 [('::1', 9092, 0, 0) IPv6]
 INFO 2023-12-05 19:29:59,432: Probing node bootstrap-0 broker version
 INFO 2023-12-05 19:29:59,436: <BrokerConnection node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: Connection complete.
 INFO 2023-12-05 19:29:59,547: Broker version identified as 2.5.0
 INFO 2023-12-05 19:29:59,548: Set configuration api_version=(2, 5, 0) to skip auto check_version requests on startup


  0%|                                                 | 0/52750 [00:00<?, ?it/s][A[A INFO 2023-12-05 19:29:59,561: Header is [VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
]
 INFO 2023-12-05 

  0%|▏                                      | 181/52750 [00:11<49:44, 17.61it/s][A[A

  0%|▏                                      | 183/52750 [00:11<53:59, 16.23it/s][A[A

  0%|▏                                      | 186/52750 [00:11<49:20, 17.75it/s][A[A

  0%|▏                                      | 189/52750 [00:11<49:52, 17.57it/s][A[A

  0%|▏                                      | 191/52750 [00:11<49:12, 17.80it/s][A[A

  0%|▏                                      | 194/52750 [00:12<46:00, 19.04it/s][A[A

  0%|▏                                      | 196/52750 [00:12<49:52, 17.56it/s][A[A

  0%|▏                                      | 198/52750 [00:12<52:16, 16.75it/s][A[A

  0%|▏                                      | 200/52750 [00:12<58:35, 14.95it/s][A[A

  0%|▏                                      | 202/52750 [00:12<58:32, 14.96it/s][A[A

  0%|▏                                    | 204/52750 [00:12<1:02:32, 14.00it/s][A[A

  0%|▏                          

  1%|▎                                      | 406/52750 [00:24<44:23, 19.65it/s][A[A

  1%|▎                                      | 409/52750 [00:25<47:58, 18.18it/s][A[A

  1%|▎                                      | 413/52750 [00:25<43:38, 19.99it/s][A[A

  1%|▎                                      | 416/52750 [00:25<55:30, 15.71it/s][A[A

  1%|▎                                      | 419/52750 [00:25<54:20, 16.05it/s][A[A

  1%|▎                                      | 422/52750 [00:25<50:31, 17.26it/s][A[A

  1%|▎                                      | 424/52750 [00:25<50:53, 17.14it/s][A[A

  1%|▎                                      | 426/52750 [00:26<50:52, 17.14it/s][A[A

  1%|▎                                      | 428/52750 [00:26<56:58, 15.30it/s][A[A

  1%|▎                                      | 430/52750 [00:26<55:17, 15.77it/s][A[A

  1%|▎                                      | 433/52750 [00:26<49:14, 17.71it/s][A[A

  1%|▎                          

  1%|▍                                      | 626/52750 [00:38<48:10, 18.03it/s][A[A

  1%|▍                                      | 628/52750 [00:38<51:20, 16.92it/s][A[A

  1%|▍                                      | 630/52750 [00:38<50:35, 17.17it/s][A[A

  1%|▍                                      | 632/52750 [00:38<56:34, 15.36it/s][A[A

  1%|▍                                      | 634/52750 [00:38<57:52, 15.01it/s][A[A

  1%|▍                                      | 636/52750 [00:39<59:14, 14.66it/s][A[A

  1%|▍                                    | 638/52750 [00:39<1:07:03, 12.95it/s][A[A

  1%|▍                                    | 640/52750 [00:39<1:04:12, 13.53it/s][A[A

  1%|▍                                    | 642/52750 [00:39<1:05:15, 13.31it/s][A[A

  1%|▍                                    | 644/52750 [00:39<1:03:25, 13.69it/s][A[A

  1%|▍                                    | 646/52750 [00:39<1:00:18, 14.40it/s][A[A

  1%|▍                          

  2%|▋                                      | 856/52750 [00:51<43:35, 19.84it/s][A[A

  2%|▋                                      | 859/52750 [00:52<49:20, 17.53it/s][A[A

  2%|▋                                      | 862/52750 [00:52<47:32, 18.19it/s][A[A

  2%|▋                                      | 864/52750 [00:52<48:23, 17.87it/s][A[A

  2%|▋                                      | 866/52750 [00:52<51:54, 16.66it/s][A[A

  2%|▋                                      | 868/52750 [00:52<52:56, 16.33it/s][A[A

  2%|▋                                      | 871/52750 [00:52<45:37, 18.95it/s][A[A

  2%|▋                                      | 873/52750 [00:52<49:05, 17.61it/s][A[A

  2%|▋                                      | 875/52750 [00:52<47:32, 18.19it/s][A[A

  2%|▋                                      | 878/52750 [00:53<46:17, 18.67it/s][A[A

  2%|▋                                      | 880/52750 [00:53<51:24, 16.82it/s][A[A

  2%|▋                          

  2%|▊                                     | 1085/52750 [01:05<48:57, 17.59it/s][A[A

  2%|▊                                     | 1087/52750 [01:05<47:37, 18.08it/s][A[A

  2%|▊                                     | 1089/52750 [01:05<58:47, 14.65it/s][A[A

  2%|▊                                     | 1092/52750 [01:05<57:07, 15.07it/s][A[A

  2%|▊                                     | 1094/52750 [01:05<55:15, 15.58it/s][A[A

  2%|▊                                     | 1096/52750 [01:05<54:06, 15.91it/s][A[A

  2%|▊                                     | 1098/52750 [01:05<52:50, 16.29it/s][A[A

  2%|▊                                     | 1102/52750 [01:06<44:53, 19.17it/s][A[A

  2%|▊                                     | 1105/52750 [01:06<40:03, 21.49it/s][A[A

  2%|▊                                     | 1108/52750 [01:06<40:59, 21.00it/s][A[A

  2%|▊                                     | 1111/52750 [01:06<46:37, 18.46it/s][A[A

  2%|▊                          