<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Join-two-data-frames-which-do-not-have-id-columns" data-toc-modified-id="Join-two-data-frames-which-do-not-have-id-columns-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Join two data frames which do not have id columns</a></span></li><li><span><a href="#Merge-two-different-columns" data-toc-modified-id="Merge-two-different-columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Merge two different columns</a></span></li><li><span><a href="#Merge-on-two-different-columns-(using-sc.parallize)" data-toc-modified-id="Merge-on-two-different-columns-(using-sc.parallize)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Merge on two different columns (using sc.parallize)</a></span></li><li><span><a href="#Join-two-dataframes-and-update-values-from-second-dataframe" data-toc-modified-id="Join-two-dataframes-and-update-values-from-second-dataframe-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Join two dataframes and update values from second dataframe</a></span></li></ul></div>

In [2]:
import numpy as np
import pandas as pd
from pyspark.sql.types import *

In [3]:
import pyspark

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf # @udf("integer") def myfunc(x,y): return x - y
from pyspark.sql import functions as F # stddev format_number date_format, dayofyear, when
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Report the library versions this notebook was executed with.
print([(mod.__name__, mod.__version__) for mod in (np, pd, pyspark)])

# Create (or reuse) the Spark session and derive the legacy entry points from it.
spark = SparkSession.builder.appName('bhishan').getOrCreate()
sc = spark.sparkContext
# Usage example: spark_df = sqlContext.createDataFrame(pandas_df)
sqlContext = SQLContext(sc)
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


# Join two data frames which do not have id columns

In [55]:
# First frame for the id-less join: three rows, columns col1..col4, no key column.
sdf1 = spark.createDataFrame(
    data=[(1, 2, 3, 4), (10, 20, 30, 40), (100, 200, 300, 400)],
    schema=["col1", "col2", "col3", "col4"],
)

sdf1.show()

+----+----+----+----+
|col1|col2|col3|col4|
+----+----+----+----+
|   1|   2|   3|   4|
|  10|  20|  30|  40|
| 100| 200| 300| 400|
+----+----+----+----+



In [56]:
# Second frame for the id-less join: three rows, columns col5..col8, no key column.
sdf2 = spark.createDataFrame(
    data=[(5, 6, 7, 8), (50, 60, 70, 80), (500, 600, 700, 800)],
    schema=["col5", "col6", "col7", "col8"],
)
sdf2.show()

+----+----+----+----+
|col5|col6|col7|col8|
+----+----+----+----+
|   5|   6|   7|   8|
|  50|  60|  70|  80|
| 500| 600| 700| 800|
+----+----+----+----+



In [57]:
# Attach a generated row id so the frame has a column to join on.
# The ids are unique and increasing but not consecutive (see the output below).
sdf1 = sdf1.withColumn(colName="id", col=F.monotonically_increasing_id())
sdf1.show()

+----+----+----+----+-----------+
|col1|col2|col3|col4|         id|
+----+----+----+----+-----------+
|   1|   2|   3|   4| 8589934592|
|  10|  20|  30|  40|17179869184|
| 100| 200| 300| 400|25769803776|
+----+----+----+----+-----------+



In [58]:
# Attach a generated row id to the second frame as well, under the same column name.
sdf2 = sdf2.withColumn(colName="id", col=F.monotonically_increasing_id())
sdf2.show()

+----+----+----+----+-----------+
|col5|col6|col7|col8|         id|
+----+----+----+----+-----------+
|   5|   6|   7|   8| 8589934592|
|  50|  60|  70|  80|17179869184|
| 500| 600| 700| 800|25769803776|
+----+----+----+----+-----------+



In [59]:
# Pair up the rows of both frames through the generated id, then drop the helper column.
# NOTE(review): this relies on both frames having received identical id values;
# monotonically_increasing_id() encodes the partition index, so this presumably holds
# only while both frames share the same partition layout -- confirm before reusing,
# or derive a consecutive index (e.g. row_number / zipWithIndex) instead.
sdf = sdf1.join(sdf2, on="id", how="inner").drop("id")
sdf.show()

+----+----+----+----+----+----+----+----+
|col1|col2|col3|col4|col5|col6|col7|col8|
+----+----+----+----+----+----+----+----+
|   1|   2|   3|   4|   5|   6|   7|   8|
| 100| 200| 300| 400| 500| 600| 700| 800|
|  10|  20|  30|  40|  50|  60|  70|  80|
+----+----+----+----+----+----+----+----+



# Merge two different columns

In [60]:
# Left pandas frame; `left` is the key column used for the merge.
df1 = pd.DataFrame(dict(A=[1, 10, 100], left=[1, 5, 2]))
df1

Unnamed: 0,A,left
0,1,1
1,10,5
2,100,2


In [61]:
# Right pandas frame; `right` is the key column used for the merge.
df2 = pd.DataFrame(dict(right=[5, 2, 1], B=[555, 222, 111]))
df2

Unnamed: 0,right,B
0,5,555
1,2,222
2,1,111


In [62]:
# pandas reference result: match df1.left against df2.right (default inner merge).
pd.merge(df1, df2, left_on='left', right_on='right')

Unnamed: 0,A,left,right,B
0,1,1,1,111
1,10,5,5,555
2,100,2,2,222


In [63]:
# Inspect the pandas dtype of each df1 column.
df1.dtypes

A       int64
left    int64
dtype: object

In [64]:
# Explicit Spark schema for df1: both columns are nullable integers.
schema = StructType(
    [StructField(name, IntegerType(), True) for name in ('A', 'left')]
)

# Convert the pandas frame into a Spark DataFrame using that schema.
sdf1 = sqlContext.createDataFrame(df1, schema=schema)
sdf1.show()

+---+----+
|  A|left|
+---+----+
|  1|   1|
| 10|   5|
|100|   2|
+---+----+



In [65]:
# Inspect the pandas dtype of each df2 column.
df2.dtypes

right    int64
B        int64
dtype: object

In [66]:
# Explicit Spark schema for df2: both columns are nullable integers.
schema = StructType(
    [StructField(name, IntegerType(), True) for name in ('right', 'B')]
)

# Convert the pandas frame into a Spark DataFrame using that schema.
sdf2 = sqlContext.createDataFrame(df2, schema=schema)
sdf2.show()

+-----+---+
|right|  B|
+-----+---+
|    5|555|
|    2|222|
|    1|111|
+-----+---+



In [67]:
# Spark equivalent of the pandas merge: the key columns have different names,
# so the join condition is an explicit column equality.
sdf = sdf1.join(sdf2, on=(sdf1["left"] == sdf2["right"]))
sdf.show()

+---+----+-----+---+
|  A|left|right|  B|
+---+----+-----+---+
|  1|   1|    1|111|
| 10|   5|    5|555|
|100|   2|    2|222|
+---+----+-----+---+



# Merge on two different columns (using sc.parallize)

In [68]:
# Build the left frame from an RDD of [a_id, fruit] rows.
sdf1 = sc.parallelize([
    ['a', 'apple'],
    ['b', 'banana'],
    ['c', 'cheese'],
]).toDF(schema=['a_id', 'fruit'])
sdf1.show()

+----+------+
|a_id| fruit|
+----+------+
|   a| apple|
|   b|banana|
|   c|cheese|
+----+------+



In [69]:
# Build the right frame from an RDD of [price, b_id, extra] rows.
sdf2 = sc.parallelize([
    [10, 'a', 'extra1'],
    [30, 'c', 'extra2'],
    [20, 'b', 'extra3'],
]).toDF(schema=["price", "b_id", 'extra'])
sdf2.show()

+-----+----+------+
|price|b_id| extra|
+-----+----+------+
|   10|   a|extra1|
|   30|   c|extra2|
|   20|   b|extra3|
+-----+----+------+



In [70]:
# List the (column name, Spark type) pairs inferred for sdf2.
sdf2.dtypes

[('price', 'bigint'), ('b_id', 'string'), ('extra', 'string')]

In [71]:
# Join on differently named keys by comparing the two frames' columns directly.
sdf = sdf1.join(sdf2, on=(sdf1["a_id"] == sdf2["b_id"]))
sdf.show()

+----+------+-----+----+------+
|a_id| fruit|price|b_id| extra|
+----+------+-----+----+------+
|   c|cheese|   30|   c|extra2|
|   b|banana|   20|   b|extra3|
|   a| apple|   10|   a|extra1|
+----+------+-----+----+------+



In [72]:
# Same join, but referring to the key columns through DataFrame aliases 'A' and 'B'.
sdf = (
    sdf1.alias('A')
    .join(sdf2.alias('B'), on=(F.col('A.a_id') == F.col('B.b_id')))
)
sdf.show()

+----+------+-----+----+------+
|a_id| fruit|price|b_id| extra|
+----+------+-----+----+------+
|   c|cheese|   30|   c|extra2|
|   b|banana|   20|   b|extra3|
|   a| apple|   10|   a|extra1|
+----+------+-----+----+------+



In [73]:
from pyspark.sql.functions import col

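# Failing attempt, kept for reference:
# sdf = sdf1.alias('A').join(
#     sdf2.alias('B'),
#     col('A.a_id') == col('B.b_id')
#         ).select([col('A.'+xx) for xx in A.columns] +
#                  [col('B.b_id'),col('B.price')])


# sdf.show()
# NameError: name 'A' is not defined
# 'A' is only a DataFrame alias (a string passed to .alias()), not a Python
# variable, so `A.columns` cannot be evaluated; iterate over `sdf1.columns` instead.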

In [74]:
# Aliased join, then drop the column that is not needed in the result.
sdf = (
    sdf1.alias('A')
    .join(sdf2.alias('B'), on=(F.col('A.a_id') == F.col('B.b_id')))
    .drop('extra')
)

sdf.show()

+----+------+-----+----+
|a_id| fruit|price|b_id|
+----+------+-----+----+
|   c|cheese|   30|   c|
|   b|banana|   20|   b|
|   a| apple|   10|   a|
+----+------+-----+----+



In [None]:
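# Failing attempt, kept for reference:
# sdf = sdf1.alias('A').join(
#         sdf2.alias('B'),
#         'A.a_id' == 'B.b_id'   ===> we need col() function here.
#         ).select(sdf1.columns+['price'])

# AssertionError: on should be Column or list of Column
# Comparing two plain Python strings yields the bool False rather than a Column
# expression; wrap each name in col(), i.e. col('A.a_id') == col('B.b_id').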

In [77]:
# Preview the first two rows of sdf1.
sdf1.show(2)

+----+------+
|a_id| fruit|
+----+------+
|   a| apple|
|   b|banana|
+----+------+
only showing top 2 rows



In [78]:
# Preview the first two rows of sdf2.
sdf2.show(2)

+-----+----+------+
|price|b_id| extra|
+-----+----+------+
|   10|   a|extra1|
|   30|   c|extra2|
+-----+----+------+
only showing top 2 rows



In [81]:
# Rename one non-key column on each side before the aliased join, so the
# result carries the new names (myfruit, renamed_extra).
sdf = (
    sdf1.withColumnRenamed('fruit', 'myfruit').alias('A')
    .join(
        sdf2.withColumnRenamed('extra', 'renamed_extra').alias('B'),
        on=(F.col('A.a_id') == F.col('B.b_id')),
    )
)

sdf.show()

+----+-------+-----+----+-------------+
|a_id|myfruit|price|b_id|renamed_extra|
+----+-------+-----+----+-------------+
|   c| cheese|   30|   c|       extra2|
|   b| banana|   20|   b|       extra3|
|   a|  apple|   10|   a|       extra1|
+----+-------+-----+----+-------------+



# Join two dataframes and update values from second dataframe

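Goal: for every `id` that also appears in the second dataframe, replace the first dataframe's `value` with the second dataframe's `value`; rows without a match keep their original value.

Reference: https://stackoverflow.com/questions/49442572/update-a-dataframe-column-with-new-values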

In [4]:
# Base frame whose `value` column will be updated.
data1 = [(1, "a"), (2, "b"), (3, "c")]
df1 = sqlContext.createDataFrame(data1, schema=["id", "value"])
df1.show()

+---+-----+
| id|value|
+---+-----+
|  1|    a|
|  2|    b|
|  3|    c|
+---+-----+



In [5]:
# Frame holding the replacement values; it has no row for id 3.
data2 = [(1, "x"), (2, "y")]

df2 = sqlContext.createDataFrame(data2, schema=["id", "value"])
df2.show()

+---+-----+
| id|value|
+---+-----+
|  1|    x|
|  2|    y|
+---+-----+



In [6]:
# Left join on `id` and show both `value` columns side by side; ids missing
# from df2 get a null right_value.
(
    df1.alias('l')
    .join(df2.alias('r'), on='id', how='left')
    .select(
        'id',
        F.col('l.value').alias('left_value'),
        F.col('r.value').alias('right_value'),
    )
    .show()
)

+---+----------+-----------+
| id|left_value|right_value|
+---+----------+-----------+
|  1|         a|          x|
|  3|         c|       null|
|  2|         b|          y|
+---+----------+-----------+



In [7]:
# Update df1 from df2: keep every df1 row (left join on `id`) and take df2's
# value when one exists, otherwise fall back to df1's own value.
# coalesce() returns its first non-null argument, which is exactly the
# "when r.value is not null then r.value otherwise l.value" rule.
(
    df1.alias('l')
    .join(df2.alias('r'), on='id', how='left')
    .select(
        'id',
        F.coalesce(F.col('r.value'), F.col('l.value')).alias('value'),
    )
    .show()
)

+---+-----+
| id|value|
+---+-----+
|  1|    x|
|  3|    c|
|  2|    y|
+---+-----+



In [8]:
# Same update expressed in Spark SQL. Expose both frames as temporary views;
# createOrReplaceTempView replaces registerTempTable, which is deprecated
# since Spark 2.0, and can be re-run without failing if the view already exists.
df1.createOrReplaceTempView('df1')
df2.createOrReplaceTempView('df2')

# Left join on id; prefer the right-hand value whenever it is not null.
query = """
SELECT l.id, 
CASE WHEN r.value IS NOT NULL THEN r.value ELSE l.value END AS value 
FROM df1 l LEFT JOIN df2 r ON l.id = r.id
"""
sqlContext.sql(query).show()

+---+-----+
| id|value|
+---+-----+
|  1|    x|
|  3|    c|
|  2|    y|
+---+-----+

