In [30]:
import pyspark
from pyspark.sql import SparkSession

# Obtain a Spark session (an already-running one is reused); the app name is left empty.
spark = (
    SparkSession.builder
    .appName('')
    .getOrCreate()
)

# Read the tab-separated file: the first row supplies the column names and
# Spark infers each column's type from the data.
sdf = spark.read.csv(
    '../data/imputation.txt',
    sep='\t',
    inferSchema=True,
    header=True,
)

In [31]:
# Peek at the first row to sanity-check the parsed columns.
sdf.first()

Row(id=0, time='0:00:00', value=0.0)

In [32]:
# Print the schema of the summary-statistics frame produced by describe().
summary_sdf = sdf.describe()
summary_sdf.printSchema()

root
 |-- summary: string (nullable = true)
 |-- id: string (nullable = true)
 |-- time: string (nullable = true)
 |-- value: string (nullable = true)



In [33]:
# Display the summary statistics (count, mean, stddev, min, max) for every column.
summary_sdf = sdf.describe()
summary_sdf.show()

+-------+------------------+-------+-----+
|summary|                id|   time|value|
+-------+------------------+-------+-----+
|  count|                10|     10|   10|
|   mean|               4.5|   null|  NaN|
| stddev|3.0276503540974917|   null|  NaN|
|    min|                 0|0:00:00|  0.0|
|    max|                 9|9:00:00|  NaN|
+-------+------------------+-------+-----+



In [34]:
# Interpolation Function
def interpol(x, x_prev, x_next, y_prev, y_next, y):
    """Linearly interpolate the value at ``x`` between two neighbouring points.

    Args:
        x: Position at which a value is wanted.
        x_prev: Position of the neighbouring point before ``x``.
        x_next: Position of the neighbouring point after ``x``.
        y_prev: Value at ``x_prev``.
        y_next: Value at ``x_next``.
        y: Value already present at ``x``; used as the fallback result.

    Returns:
        ``y`` unchanged when ``x_prev == x_next`` (a zero-width interval has
        no slope) or when any input needed for the interpolation is ``None``;
        otherwise ``y_prev + slope * (x - x_prev)`` where ``slope`` is the
        gradient of the line through the two neighbouring points.
    """
    if x_prev == x_next:
        return y

    # This function is wrapped as a Spark UDF, and Spark passes SQL NULLs in
    # as None. Without a neighbour on one side the arithmetic below would
    # raise TypeError, so fall back to the existing value instead.
    if any(v is None for v in (x, x_prev, x_next, y_prev, y_next)):
        return y

    slope = (y_next - y_prev) / (x_next - x_prev)
    return y_prev + slope * (x - x_prev)

In [37]:
# convert function to udf
import pyspark.sql.functions as func
from pyspark.sql.types import FloatType
# Wrap the plain Python function so it can be applied to DataFrame columns;
# the resulting column is declared as FloatType.
# NOTE(review): FloatType is 32-bit, while the CSV `value` column appears to
# be read as a double — confirm the reduced precision is acceptable.
interpol_udf = func.udf(f=interpol, returnType=FloatType())

In [None]:
# Add the interpolated column, drop the helper columns, rename `reads_all`
# to `readvalue`, and render `readtime` via from_unixtime.
# NOTE(review): `df_filled` is not defined in the cells shown here; it is
# presumably built by an earlier fill step — confirm before running.
df_filled = (
    df_filled
    .withColumn(
        'readvalue_interpol',
        interpol_udf(
            'readtime', 'readtime_ff', 'readtime_bf',
            'readvalue_ff', 'readvalue_bf', 'readvalue',
        ),
    )
    .drop('readtime_existent', 'readtime_ff', 'readtime_bf')
    .withColumnRenamed('reads_all', 'readvalue')
    # Use func.col: a bare `col` is not imported in the cells shown, so the
    # original expression raised NameError.
    .withColumn('readtime', func.from_unixtime(func.col('readtime')))
)