In [4]:
import numpy as np
import pandas as pd

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession

In [11]:
spark = SparkSession\
    .builder\
    .enableHiveSupport()\
    .config("spark.dynamicAllocation.enabled", False)\
    .config("spark.executor.memory", "4g")\
    .getOrCreate()

In [6]:
arrays = [
  ([1, 2], [2, 4]),
  ([1, 1], [-1, -1]),
  ([1,2], [2, 1])  
]

schema = T.StructType([
    T.StructField('a', T.ArrayType(T.IntegerType()), nullable=False),
    T.StructField('b', T.ArrayType(T.IntegerType()), nullable=False)
])

df = spark.createDataFrame(
  pd.DataFrame(
    arrays
  ),
  schema
)

In [8]:
df.show()

+------+--------+
|     a|       b|
+------+--------+
|[1, 2]|  [2, 4]|
|[1, 1]|[-1, -1]|
|[1, 2]|  [2, 1]|
+------+--------+



In [9]:
def cosine_similarity(a, b):
  return float(np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b)))

cosine_similarity_udf = F.udf(cosine_similarity, T.FloatType())

def compute_similarity(df):
  rst = df\
  .withColumn("cos_sim", cosine_similarity_udf(F.col("a"), F.col("b")))\
  .select("cos_sim")
  return rst

df_rst = compute_similarity(df)
df_rst.show()

+-------+
|cos_sim|
+-------+
|    1.0|
|   -1.0|
|    0.8|
+-------+



In [37]:
from tools import compute_similarity
from BaseTest import PysparkTestCase

In [49]:
class ToolsTest(PysparkTestCase):

    def test_compute_similarity(self):
        arrays = [
            ([1, 2], [2, 4], 1.0),
            ([1, 1], [-1, -1], -1.0),
            ([1, 2], [2, 1], 0.8)
        ]

        schema = T.StructType([
            T.StructField('a', T.ArrayType(T.IntegerType()), nullable=False),
            T.StructField('b', T.ArrayType(T.IntegerType()), nullable=False),
            T.StructField('c', T.FloatType(), nullable=False)
        ])

        df = self.spark.createDataFrame(
            pd.DataFrame(
                arrays
            ),
            schema
        )
        df_expect = df.select(F.col('c'))

        df_test = compute_similarity(df)

        self.assertTrue(
            self.is_dataframe_equal(
                df_test,
                df_expect
            )
        )

In [50]:
import unittest

In [51]:
runner = unittest.TextTestRunner()
tool_test = ToolsTest()

In [52]:
runner.run(tool_test.test_compute_similarity())

AttributeError: 'ToolsTest' object has no attribute 'spark'