In [1]:
import os
# Drop HADOOP_CONF_DIR from this process' environment if it is set.
# NOTE(review): presumably so Spark does not pick up a cluster Hadoop config — confirm.
os.environ.pop("HADOOP_CONF_DIR", None)

In [2]:
"HADOOP_CONF_DIR" in os.environ

False

In [3]:
from kafka import KafkaProducer
import json
from tqdm import tqdm
import time
import random
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField, TimestampType
from pyspark.sql.functions import *
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from jinja2 import Environment, FileSystemLoader


# Application-level constants.
APP_NAME = "producer"
# File-name-safe variant of the app name: '/' and ':' are both mapped to '_'.
NORMALIZED_APP_NAME = APP_NAME.translate(str.maketrans("/:", "__"))

# Working directories, all rooted at the current working directory.
APPS_TMP_DIR, APPS_CONF_DIR, APPS_LOGS_DIR = (
    os.path.join(os.getcwd(), subdir) for subdir in ("tmp", "conf", "logs")
)

# Per-application log4j properties file and the driver log file.
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, f"pyspark-log4j-{NORMALIZED_APP_NAME}.properties")
LOG_FILE = os.path.join(APPS_LOGS_DIR, f"pyspark-{NORMALIZED_APP_NAME}.log")

# JVM options handed to the Spark driver (see spark.driver.extraJavaOptions below).
# NOTE(review): "TLSv1.0" may not be a protocol name the JVM recognises — confirm.
EXTRA_JAVA_OPTIONS = " ".join([
    f"-Dlog4j.configuration=file://{LOG4J_PROP_FILE}",
    "-Dspark.hadoop.dfs.replication=1",
    "-Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3",
])

# IP address that this machine's host name resolves to.
LOCAL_IP = socket.gethostbyname(socket.gethostname())

# Create the working directories and render the log4j properties file from its template.
# exist_ok=True makes the creation idempotent and avoids the race between a separate
# existence check and the makedirs call.
for directory in (APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR):
    os.makedirs(directory, exist_ok=True)

# The template is looked up under /opt; LOG_FILE is passed to it as `logfile`
# and the rendered result is written to LOG4J_PROP_FILE.
env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template.stream(logfile=LOG_FILE).dump(LOG4J_PROP_FILE)

# Start (or reuse) the Spark session.
# All settings live in one table of (key, value) pairs and are applied to the
# builder in a loop, so adding or changing an option is a one-line edit.
_spark_settings = [
    # driver networking
    ("spark.driver.host", LOCAL_IP),
    ("spark.driver.bindAddress", "0.0.0.0"),
    # executor / driver sizing
    ("spark.executor.instances", "2"),
    ("spark.executor.cores", "3"),
    ("spark.memory.fraction", "0.8"),
    ("spark.memory.storageFraction", "0.6"),
    ("spark.executor.memory", "3g"),
    ("spark.driver.memory", "3g"),
    ("spark.driver.maxResultSize", "1g"),
    ("spark.kubernetes.memoryOverheadFactor", "0.3"),
    # driver JVM options (log4j configuration etc.)
    ("spark.driver.extraJavaOptions", EXTRA_JAVA_OPTIONS),
    # Kubernetes namespace, pod labels and executor image
    ("spark.kubernetes.namespace", "sburmistrova-266294"),
    ("spark.kubernetes.driver.label.appname", APP_NAME),
    ("spark.kubernetes.executor.label.appname", APP_NAME),
    ("spark.kubernetes.container.image", "node03.st:5000/spark-executor:sburmistrova-266294"),
    # local scratch directory and extra jar on both class paths
    ("spark.local.dir", "/tmp/spark"),
    ("spark.driver.extraClassPath", "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar"),
    ("spark.executor.extraClassPath", "/home/jovyan/shared-data/my-project-name-jar-with-dependencies.jar"),
    # executor volumes: an emptyDir mounted at the scratch path ...
    ("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark"),
    ("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false"),
    # ... and a hostPath volume (/nfs/shared) mounted at the shared-data path
    ("spark.kubernetes.executor.volumes.hostPath.depdir.mount.path", "/home/jovyan/shared-data"),
    ("spark.kubernetes.executor.volumes.hostPath.depdir.options.path", "/nfs/shared"),
    ("spark.kubernetes.executor.volumes.hostPath.depdir.options.type", "Directory"),
    ("spark.kubernetes.executor.volumes.hostPath.depdir.mount.readOnly", "false"),
]

_builder = SparkSession.builder.appName(APP_NAME).master("k8s://https://10.32.7.103:6443")
for _option, _setting in _spark_settings:
    _builder = _builder.config(_option, _setting)

spark = _builder.getOrCreate()

# Report the Spark web UI address, the generated log4j config and the driver log path.
print(f"Web UI: {spark.sparkContext.uiWebUrl}")
print(f"\nlog4j file: {LOG4J_PROP_FILE}")
print(f"\ndriver log file: {LOG_FILE}")

Web UI: http://10.128.5.219:4040

log4j file: /home/jovyan/nfs-home/conf/pyspark-log4j-producer.properties

driver log file: /home/jovyan/nfs-home/logs/pyspark-producer.log


In [4]:
# Uncomment and run to stop the Spark session when finished:
# spark.stop()

In [5]:
# Load the raw posts dump (JSON) into a DataFrame; the schema is inferred by Spark.
posts_df = spark.read.format("json").load(
    "/home/jovyan/shared-data/bigdata20/followers_posts_api_final.json"
)

In [6]:
# Print the schema of the loaded posts to see which fields are available.
posts_df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- lat: double (nullable = true)
 |    |    |    |    |-- long: double (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    

In [7]:
# Keep only the fields that are sent to Kafka, renamed to user_id / text / timestamp,
# and drop posts whose text is empty.
# NOTE: this rebinds posts_df, so the cell cannot be re-run on its own — the
# renamed frame no longer has `owner_id` / `date`; re-run the load cell first.
posts_df = (
    posts_df
    .select(
        col("owner_id").alias("user_id"),
        col("text"),
        col("date").alias("timestamp"),
    )
    .filter(col("text") != "")
)

In [8]:
# Preview the first rows of the selected and filtered posts.
posts_df.show()

+-------+--------------------+----------+
|user_id|                text| timestamp|
+-------+--------------------+----------+
|  57114|Я всё время себя ...|1552061847|
|  57114|Я наверно не тот ...|1552066470|
|  57114|Местами дождь, ме...|1552223531|
|  57114|Ну отчего все так...|1552250945|
|  57114|Друзья, давайте в...|1552309202|
|  57114|Я наверно для жиз...|1552803592|
|  57114|Решив однажды утр...|1553062508|
|  57114|Дождь это же снег...|1553318159|
|  57114|Мне надо куда-то ...|1553449684|
|  57114|Я давно не ищу ид...|1553710452|
|  57114|Es kommt mir vor ...|1554355560|
|  57114|Ищут пожарные, Ищ...|1556554938|
|  57114|Я поломан был нес...|1557346651|
|  57114|Я себя разрываю н...|1557519368|
|  57114|Петербург раствор...|1557565789|
|  57114|Меня пронзает сол...|1558332390|
|  57114|Деньги дают нам ч...|1558764295|
|  39499|        #ЧайнаяЛента|1554622080|
|  39499|Побывали в [club2...|1554742564|
|  68686|Полезный курс для...|1550587946|
+-------+--------------------+----

In [None]:
# Replay the posts to Kafka in timestamp order: one JSON message per row, with a
# short random pause between sends.
producer = KafkaProducer(bootstrap_servers="kafka-svc:9092", value_serializer=str.encode)
topic_name = "posts"

try:
    # orderBy + toLocalIterator streams the sorted rows to the driver; count() is a
    # separate pass that only supplies the progress-bar total.
    for row in tqdm(posts_df.orderBy("timestamp").rdd.toLocalIterator(), total=posts_df.count()):
        # Serialize once per row; ensure_ascii=False keeps non-ASCII text as-is
        # instead of \uXXXX escapes.
        value = json.dumps(row.asDict(), ensure_ascii=False)
        producer.send(topic_name, value)
        time.sleep(random.uniform(0.01, 0.2))
finally:
    # send() only queues the message, so flush pending messages and release the
    # producer's resources even if the loop fails or is interrupted.
    producer.flush()
    producer.close()

  3%|▎         | 5533/189848 [09:51<5:33:36,  9.21it/s] 