In [0]:
# Show the version attribute of the interpreter-provided context `sc`
# (presumably the SparkContext — confirm against the interpreter setup).
sc.version

In [1]:
%pyspark
from pyspark.sql.functions import col

# Build a DataFrame from a single inline JSON document, then print the schema
# of everything selected by the "properties.*" column expression.
df = spark.read.json(
    sc.parallelize(
        ["""{"properties": { "prop1": "foo", "prop2": "bar", "prop3": true, "prop4": 1}}"""]
    )
)

df.select(
    col("properties.*")
).printSchema()

In [2]:
%pyspark
from pyspark.sql.functions import expr


# Read the Habr CSV (header row, inferred types), replace `words_count` with
# its INTEGER cast, and cache the resulting DataFrame.
habrData = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/user/admin/habr_data.csv")
    .withColumn("words_count", expr("CAST(words_count as INTEGER)"))
    .cache()
)

habrData.printSchema()

In [3]:
%pyspark
# Display only the rows of habrData whose `words_count` is null.
z.show(habrData.where("words_count is null"))

In [4]:
%pyspark
# Display habrData through the notebook's `z.show` helper.
z.show(habrData)

In [5]:
%pyspark

from pyspark.sql.functions import regexp_replace, col

# Derive `company_id` from each article link by deleting the URL scaffolding:
# the company prefix, the "/blog/<digits>/" part, and entire post URLs
# (which therefore collapse to an empty string).
# The dots in "habr.com" are escaped so that they match a literal "." only;
# unescaped, a regex dot matches any character.
(
    habrData
    .select("link")
    .withColumn(
        "company_id",
        regexp_replace(
            col("link"),
            r"(https://habr\.com/ru/company/)|(/blog/[0-9]+/)|(https://habr\.com/ru/post/[0-9]+/)",
            "",
        ),
    )
    .show(50, False)
)

In [6]:
%pyspark

# Reload the CSV with header and schema inference (no extra casts) and cache it.
habrData = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/user/admin/habr_data.csv")
    .cache()
)

habrData.printSchema()

In [7]:
%pyspark

# Reload the CSV with only the header option (no inferSchema) and cache it.
habrData = (
    spark.read
    .option("header", True)
    .csv("/user/admin/habr_data.csv")
    .cache()
)
 
from pyspark.sql.functions import udf, col, round
from pyspark.sql.types import IntegerType, FloatType

def mult(i):
    """Return ``i * 2``.

    The result follows Python's ``*`` operator for the argument's type, so a
    number is doubled while a string or list is repeated twice.
    """
    doubled = i * 2
    return doubled
    
# Wrap `mult` as a UDF without passing a return type.
multUdf = udf(mult)

# Apply the UDF to the first 10 `rating` values twice: once on the column as
# read, once on its IntegerType cast; then apply the imported `round` to the
# second result.
(
    habrData
    .select("rating")
    .limit(10)
    .withColumn("udfString", multUdf(col("rating")))
    .withColumn("udfInt", multUdf(col("rating").cast(IntegerType())))
    .withColumn("round", round(col("udfInt")))
    .show()
)

In [8]:
%pyspark

# Three single-value rows with the schema string "n INT".
(
    spark
    .createDataFrame(([1], [2], [3]), schema="n INT")
    .show()
)

In [9]:
%pyspark

# Print the plan for a SQL expression evaluated on the three-row frame.
(
    spark
    .createDataFrame(([1], [2], [3]), schema="n INT")
    .selectExpr("n + 1")
    .explain()
)


In [10]:
%pyspark

from pyspark.sql.functions import udf, col

def plusOne(i):
    """Return ``i + 1``."""
    incremented = i + 1
    return incremented
    
# Wrap `plusOne` as a UDF (no return type passed).
plusOneUdf = udf(plusOne)

# Print the plan for the same "+ 1" computation, this time through the UDF.
(
    spark
    .createDataFrame(([1], [2], [3]), schema="n INT")
    .select(plusOneUdf(col("n")))
    .explain()
)


In [11]:
%pyspark

# Print the plan when a filter is applied on a column computed by a SQL expression.
(
    spark
    .createDataFrame(([1], [2], [3]), schema="n INT")
    .selectExpr("n + 1 as plusOne")
    .where("plusOne = 2")
    .explain()
)

In [12]:
%pyspark

# Print the plan when the filtered column is computed by the Python UDF instead.
(
    spark
    .createDataFrame(([1], [2], [3]), schema="n INT")
    .withColumn("plusOne", plusOneUdf(col("n")))
    .where("plusOne = 2")
    .explain()
)

In [13]:
%pyspark

# Reload the CSV with only the header option (no inferSchema) and cache it.
habrData = (
    spark.read
    .option("header", True)
    .csv("/user/admin/habr_data.csv")
    .cache()
)
 
from pyspark.sql.functions import udf, col, when, expr
from pyspark.sql.types import IntegerType

def mult(i):
    """Return ``i * 2``; no special handling for ``None``.

    ``None * 2`` raises ``TypeError`` in Python, so this variant fails on a
    ``None`` argument.
    """
    doubled = i * 2
    return doubled
    
def mult_nullsafe(i):
    """Return ``i * 2``, or ``0`` when ``i`` is ``None``."""
    return 0 if i is None else i * 2
    
# The UDF wraps the plain `mult`; the commented line below is the alternative
# that wraps the None-tolerant variant.
multUdf = udf(mult)
# multUdf = udf(mult_nullsafe)

# Optional step left disabled by the author; it would go right after
# `habrData` in the chain:
#     .na.drop("all")
(
    habrData
    .select("rating")
    .withColumn("rating", col("rating").cast(IntegerType()))
    .withColumn("udfInt", multUdf(col("rating")))
    .show()
)

In [14]:
%pyspark

from pyspark.sql.functions import udf, col


# Register the None-tolerant doubling function for use from SQL.
# The return type is passed explicitly: `register` defaults to a string
# return type for plain Python functions, which would turn the doubled
# ratings into strings.
spark.udf.register("mult_nullsafe", mult_nullsafe, IntegerType())

# Expose the non-null ratings, cast to integers, as the temp view "habr_data".
(
    habrData
    .where("rating is not null")
    .withColumn("rating", col("rating").cast(IntegerType()))
    .createOrReplaceTempView("habr_data")
)

# The result alias describes what the UDF does (it doubles the rating);
# the previous alias `plus_one` was misleading.
spark.sql("select rating, mult_nullsafe(rating) rating_doubled from habr_data").show()

In [15]:
%pyspark

# Integer type output
from pyspark.sql.types import IntegerType

def square(i):
    """Return ``i * i``, or ``0`` when ``i`` is ``None``."""
    return 0 if i is None else i * i
    
# UDF around `square` with an explicit IntegerType return type.
square_udf_int = udf(lambda value: square(value), IntegerType())

# Square the non-null ratings after casting them to integers.
(
    habrData
    .select("rating")
    .where("rating is not null")
    .withColumn("rating", col("rating").cast(IntegerType()))
    .withColumn("square", square_udf_int(col("rating")))
    .show()
)

In [16]:
%pyspark
from pyspark.sql.types import *

    
def square_list_float(x):
    """Square every element of ``x`` after converting it to ``float``.

    Args:
        x: An iterable of values accepted by ``float()``, or ``None``.
            Individual elements may also be ``None``.

    Returns:
        A list of floats with each element squared. ``None`` is returned for
        a ``None`` input, and ``None`` elements are passed through unchanged,
        so missing values propagate instead of raising ``TypeError``.
    """
    if x is None:
        return None
    return [None if val is None else float(val) ** 2 for val in x]


# UDF around `square_list_float`, declared to return an array of floats.
square_list_float_udf = udf(
    lambda values: square_list_float(values),
    ArrayType(FloatType()),
)

# One column, "int_array", holding arrays of integers.
cSchema = StructType(
    [StructField("int_array", ArrayType(IntegerType()))]
)

df = spark.createDataFrame(
    [
        [[1, 2]],
        [[3, 4, 5]],
        [[6, 7, 8, 9]],
    ],
    schema=cSchema,
)

df.printSchema()
df.show()

# Add a column with the squared values of each array.
(
    df
    .withColumn("square_list_float_udf", square_list_float_udf("int_array"))
    .show()
)


In [17]:
%pyspark

from pyspark.sql import SparkSession

# Rebind `spark` to the session returned by the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

# First employee frame: four rows, six columns.
simpleData = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Michael", "Sales", "NY", 86000, 56, 20000),
    ("Robert", "Sales", "CA", 81000, 30, 23000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
]

columns = ["employee_name", "department", "state", "salary", "age", "bonus"]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

# Second employee frame with the same column names; the James and Maria rows
# repeat rows of the first data set.
simpleData2 = [
    ("James", "Sales", "NY", 90000, 34, 10000),
    ("Maria", "Finance", "CA", 90000, 24, 23000),
    ("Jen", "Finance", "NY", 79000, 53, 15000),
    ("Jeff", "Marketing", "CA", 80000, 25, 18000),
    ("Kumar", "Marketing", "NY", 91000, 50, 21000),
]
columns2 = ["employee_name", "department", "state", "salary", "age", "bonus"]

df2 = spark.createDataFrame(data=simpleData2, schema=columns2)

df2.printSchema()
df2.show(truncate=False)

# Plain union of the two frames.
unionDF = df.union(df2)
unionDF.show(truncate=False)

# Union followed by distinct().
disDF = (
    df
    .union(df2)
    .distinct()
)
disDF.show(truncate=False)

# The same two frames combined through unionAll.
unionAllDF = df.unionAll(df2)
unionAllDF.show(truncate=False)

In [18]:
%pyspark
# Third frame: same column names, but "department" is listed before
# "employee_name", and the tuples follow that order.
simpleData3 = [
    ("Sales", "Jones", "NY", 90000, 34, 10000),
    ("Finance", "Maria", "CA", 90000, 24, 23000),
    ("Finance", "Jen", "NY", 79000, 53, 15000),
]
columns3 = ["department", "employee_name", "state", "salary", "age", "bonus"]

df3 = spark.createDataFrame(data=simpleData3, schema=columns3)

print("df:")
df.show()
print("df3:")
df3.show()

# Union the two frames as they are, without reordering df3's columns.
unionDF = df.union(df3)
unionDF.show(truncate=False)

In [19]:
%pyspark

# Reorder df3's columns into df's column order before the union.
(
    df
    .union(df3.select(df.columns))
    .show(truncate=False)
)

In [20]:
%pyspark

# Each row: a name and a list of lists of subject strings.
arrayArrayData = [
    ("James", [["Java", "Scala", "C++"], ["Spark", "Java"]]),
    ("Michael", [["Spark", "Java", "C++"], ["Spark", "Java"]]),
    ("Robert", [["CSharp", "VB"], ["Spark", "Python"]]),
]

df = spark.createDataFrame(
    data=arrayArrayData,
    schema=['name', 'subjects'],
)
df.printSchema()
df.show(truncate=False)

from pyspark.sql.functions import explode
# First explode: apply explode() to `subjects` and name the result "exploded".
df2 = df.select(df.name, explode(df.subjects).alias("exploded"))
df2.show()

# Second explode: apply explode() to the "exploded" column.
# Both columns are taken from df2 itself; the previous code selected
# `df.name`, a column object of the parent frame, while querying df2.
df2.select(df2.name, explode(df2.exploded)).show(truncate=False)

In [21]:
%pyspark
# Evaluate the JSON form of df's schema as the cell's last expression.
df.schema.json()

In [22]:
%pyspark

# Rows of (name, list of languages, dict of properties). The data
# deliberately includes None entries, empty strings, a row whose list and
# dict are both None, and a row with an empty dict.
arrayData = [
    ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
    ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
    ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
    ('Washington', None, None),
    ('Jefferson', ['1', '2'], {}),
]

df = spark.createDataFrame(
    data=arrayData,
    schema=['name', 'knownLanguages', 'properties'],
)
df.printSchema()
df.show(truncate=False)

from pyspark.sql.functions import explode

# Apply explode() to the dict-valued `properties` column.
df3 = df.select(df.name, explode(df.properties))
df3.printSchema()
df3.show()

In [23]:
%pyspark
from pyspark.sql.functions import explode_outer
# with array
(
    df
    .select(df.name, explode_outer(df.knownLanguages))
    .show()
)
# with map
(
    df
    .select(df.name, explode_outer(df.properties))
    .show()
)

# Window functions

In [25]:
%pyspark

# (employee_name, department, salary) rows; ("James", "Sales", 3000) appears twice.
simpleData = (
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
)

columns = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

In [26]:
%pyspark

from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, dense_rank, percent_rank, ntile, cume_dist
from pyspark.sql.types import FloatType

# Spark's `round` is reached through the `F` alias rather than imported by
# name: `from pyspark.sql.functions import round` would rebind Python's
# built-in `round` in the notebook namespace, and plain-Python code that
# calls `round(...)` (such as `usd2rub`) would then get the column function.

# Window over each department, ordered by salary.
windowSpec = Window.partitionBy("department").orderBy("salary")

# Add one column per ranking function, each evaluated over `windowSpec`;
# cume_dist is rounded to two decimals.
(
    df
    .withColumn("row_number", row_number().over(windowSpec))
    .withColumn("rank", rank().over(windowSpec))
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .withColumn("percent_rank", percent_rank().over(windowSpec))
    .withColumn("ntile", ntile(3).over(windowSpec))
    .withColumn("cume_dist", F.round(cume_dist().over(windowSpec), 2))
    .show(truncate=False)
)


In [27]:
%pyspark

from pyspark.sql.functions import lag, lead

# Add lag and lead of `salary` with an offset of 2, both over `windowSpec`.
(
    df
    .withColumn("lag", lag("salary", 2).over(windowSpec))
    .withColumn("lead", lead("salary", 2).over(windowSpec))
    .show()
)

In [28]:
%pyspark

# Window over each department with no ordering, used for the aggregates.
windowSpecAgg = Window.partitionBy("department")

# Spark's sum/min/max are reached through the `F` alias rather than imported
# by name: `from pyspark.sql.functions import sum, min, max` would rebind the
# Python built-ins of the same names for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import col, avg, row_number

# Compute the per-department aggregates as window columns, then keep the row
# numbered 1 within each department (numbering comes from `windowSpec`) so
# that each department is shown once.
(
    df
    .withColumn("row", row_number().over(windowSpec))
    .withColumn("avg", avg(col("salary")).over(windowSpecAgg))
    .withColumn("sum", F.sum(col("salary")).over(windowSpecAgg))
    .withColumn("min", F.min(col("salary")).over(windowSpecAgg))
    .withColumn("max", F.max(col("salary")).over(windowSpecAgg))
    .where(col("row") == 1)
    .select("department", "avg", "sum", "min", "max")
    .show()
)


In [29]:
%pyspark

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

def usd2rub(usd_price, rate=50):
    """Convert a USD price to roubles, rounded to two decimal places.

    Args:
        usd_price: Numeric price in USD.
        rate: Roubles per dollar; defaults to 50, the rate that was
            previously hard-coded.

    Returns:
        ``usd_price * rate`` rounded to 2 decimals with Python's built-in
        ``round``.
    """
    # Call the built-in explicitly: earlier cells run
    # `from pyspark.sql.functions import round`, which rebinds the global name
    # `round` to Spark's column function, and that function cannot be applied
    # to a plain Python number inside a UDF.
    import builtins

    return builtins.round(usd_price * rate, 2)
  
# Wrap the converter as a UDF (no return type is passed to `udf`).
usd2rub_udf = udf(usd2rub)

# NOTE(review): `retailData` is not defined in any cell visible in this
# notebook — it has to be loaded before this cell can run.
(
    retailData
    .where("UnitPrice is not null")
    .withColumn("price_rub", usd2rub_udf("UnitPrice"))
    .show()
)

* по данным habr_data получить таблицу с названиями топ-3 статей (по rating) для каждого автора
* по данным habr_data получить топ (по встречаемости) английских слов из заголовков. Возможное решение: 1) выделение слов с помощью регулярных выражений, 2) разделение на массивы слов, 3) explode массивов, 4) группировка с подсчетом встречаемости