In [0]:
from pyspark.sql import SparkSession

# Obtain a Spark session registered under the application name "Generator".
spark = (
    SparkSession.builder
    .appName("Generator")
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import random

def data_generator(stop=1000000, seed=None):
    """Lazily yield synthetic ``(id, name, value)`` rows.

    Each row is ``(i, f"el_{i}", x)`` for ``i`` in ``range(1, stop)``, where
    ``x`` is a random float in ``[0.0, 1.0)``.

    Args:
        stop: Exclusive upper bound for the ids. The default of 1000000
            yields ids 1 through 999999.
        seed: Optional seed for a private random generator, which makes the
            generated values reproducible from run to run. With ``None``
            (the default) values are drawn from the module-level ``random``
            state.

    Yields:
        tuple: ``(int id, str name, float value)`` for each id.
    """
    # A dedicated Random instance keeps seeded runs independent of the global
    # random state; unseeded runs keep using the global generator.
    next_value = random.random if seed is None else random.Random(seed).random
    for i in range(1, stop):
        yield (i, f"el_{i}", next_value())

# Column layout of the first dataset; every column is declared nullable.
schema1 = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('id', IntegerType()),
        ('name_1', StringType()),
        ('value_1', DoubleType()),
    )
])

# Materialise the generated rows into a list, then build the DataFrame from it.
data = list(data_generator())
df1 = spark.createDataFrame(data, schema=schema1)

# Preview the first 15 rows.
display(df1.head(15))

id,name_1,value_1
1,el_1,0.9649748028076188
2,el_2,0.9262109888449775
3,el_3,0.6549530011755194
4,el_4,0.6873311516388818
5,el_5,0.6927372993845264
6,el_6,0.7220469744103468
7,el_7,0.9103590237894325
8,el_8,0.5516337263405837
9,el_9,0.8408718587345965
10,el_10,0.771309938018883


In [0]:
def data_generator(stop=1000000, n_names=5, seed=None):
    """Lazily yield synthetic ``(id, name, value)`` rows with repeating names.

    Each row is ``(i, f"el_{i % n_names}", x)`` for ``i`` in
    ``range(1, stop)``, where ``x`` is a random float in ``[0.0, 1.0)``. With
    the defaults the names cycle through ``el_1, el_2, el_3, el_4, el_0``.

    NOTE: this redefines the ``data_generator`` declared earlier in the
    notebook; cells executed after this one get this version.

    Args:
        stop: Exclusive upper bound for the ids. The default of 1000000
            yields ids 1 through 999999.
        n_names: Number of distinct names to cycle through. Must be a
            positive integer (0 raises ``ZeroDivisionError`` on iteration).
        seed: Optional seed for a private random generator, which makes the
            generated values reproducible from run to run. With ``None``
            (the default) values are drawn from the module-level ``random``
            state.

    Yields:
        tuple: ``(int id, str name, float value)`` for each id.
    """
    # A dedicated Random instance keeps seeded runs independent of the global
    # random state; unseeded runs keep using the global generator.
    next_value = random.random if seed is None else random.Random(seed).random
    for i in range(1, stop):
        yield (i, f"el_{i % n_names}", next_value())

# Column layout of the second dataset; every column is declared nullable.
schema2 = StructType([
    StructField('id', IntegerType(), True),
    StructField('name_2', StringType(), True),
    StructField('value_2', DoubleType(), True),
])

# Materialise the generated rows exactly once: building the list is the
# expensive step of this cell, so it must not be repeated.
data = list(data_generator())
df2 = spark.createDataFrame(data, schema=schema2)

# Preview the first 15 rows.
display(df2.head(15))

id,name_2,value_2
1,el_1,0.0238704949725119
2,el_2,0.9975325418127892
3,el_3,0.0879090868016193
4,el_4,0.7540062442621721
5,el_0,0.8939808800672981
6,el_1,0.2772099616628865
7,el_2,0.5854404580506946
8,el_3,0.7960072303842086
9,el_4,0.6609263465868807
10,el_0,0.2858379214824995


In [0]:
# INNER JOIN
# Keep only the row pairs where df1.name_1 equals df2.name_2; a df1 row is
# repeated once for every df2 row carrying the same name.
#
# No dropDuplicates() is applied: as generated in this notebook, `id` is
# unique within df1 and within df2, so every joined row is already distinct
# and de-duplicating would only add a shuffle without removing anything.
inner_join = df1.join(df2, on=(df1.name_1 == df2.name_2), how='inner')
display(inner_join.head(15))

id,name_1,value_1,id.1,name_2,value_2
3,el_3,0.6549530011755194,3,el_3,0.0879090868016193
3,el_3,0.6549530011755194,8,el_3,0.7960072303842086
3,el_3,0.6549530011755194,13,el_3,0.5536661244195987
3,el_3,0.6549530011755194,18,el_3,0.4891586780008395
3,el_3,0.6549530011755194,23,el_3,0.8490749432844704
3,el_3,0.6549530011755194,28,el_3,0.3766613340947916
3,el_3,0.6549530011755194,33,el_3,0.3371268195970124
3,el_3,0.6549530011755194,38,el_3,0.5607556289234175
3,el_3,0.6549530011755194,43,el_3,0.2945374093128288
3,el_3,0.6549530011755194,48,el_3,0.6646834952198214


In [0]:
# LEFT JOIN
# Every df1 row is kept; the df2 columns are null for rows whose name_1 has
# no matching name_2.
left_join = df1.join(
    df2,
    on=(df1.name_1 == df2.name_2),
    how='left',
)
display(left_join.head(15))

id,name_1,value_1,id.1,name_2,value_2
10,el_10,0.771309938018883,,,
124931,el_124931,0.4747831697230343,,,
124932,el_124932,0.3707347236973053,,,
124934,el_124934,0.288437952449088,,,
124942,el_124942,0.8911683291114412,,,
14,el_14,0.8375661851837626,,,
249864,el_249864,0.5121321057915621,,,
249870,el_249870,0.8439538714190276,,,
3,el_3,0.6549530011755194,3.0,el_3,0.0879090868016193
3,el_3,0.6549530011755194,8.0,el_3,0.7960072303842086
