# SPARK SCENARIOS 

In [1]:
from pyspark.sql import SparkSession,DataFrame
import pyspark.sql.functions as F

In [2]:
# Reuse the active session if there is one, otherwise start a new one
# registered under the application name "spark_scenarios".
spark = (
    SparkSession.builder
    .appName("spark_scenarios")
    .getOrCreate()
)

In [3]:
# %config Completer.use_jedi = False
# !pip install --upgrade jedi==0.17.2

1. Recursively read files from nested folders

a. Using the `recursiveFileLookup` option (Spark 3.0+)

In [4]:
# Count the rows of every tab-delimited file found anywhere below hr_db;
# recursiveFileLookup makes the reader descend into sub-folders.
(
    spark.read
    .options(recursiveFileLookup=True, inferSchema=True, delimiter="\t")
    .csv("/user/itv452844/test_data/recursive_read/hr_db")
    .count()
)

25

In [5]:
# ! hadoop fs -cat /user/itv452844/test_data/recursive_read/hr_db/part1/part-m-00000

2. Replace values in an ArrayType column

In [6]:
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,ArrayType,IntegerType,StringType

* Create a test DataFrame with an ArrayType column

In [7]:
# Two columns: an integer id and an array of strings.
schema = StructType([
    StructField('id', IntegerType()),
    StructField('val', ArrayType(StringType())),
])

# Note that 'null' here is the literal string, not a missing value.
row = dict(id=1, val=['null', '2', 'null', '2', '4'])

df = spark.createDataFrame([Row(**row)], schema=schema)

df.toPandas()  # input dataframe

Unnamed: 0,id,val
0,1,"[null, 2, null, 2, 4]"


In [8]:
df.createOrReplaceTempView("dfTable") # registering as a temp view so it can be queried with spark.sql

In [9]:
# query for replacing elements from a arrayType Column
qry="""
select 
id,
split(replace(concat_ws(',',val),'null',''),',') as val
from
dfTable
"""
spark.sql(qry)

id,val
1,"[, 2, , 2, 4]"


3. Remove columns whose values are all null

In [14]:
# Three nullable integer columns.
schema = StructType([
    StructField(name, IntegerType()) for name in ('col1', 'col2', 'col3')
])
# col1 is always set, col2 is always null, col3 is null in the third row only.
data = [Row(col1=1, col2=None, col3=col3) for col3 in (1, 1, None, 1)]

In [16]:
# Build the sample DataFrame from the `data` rows using the explicit `schema`
# (this rebinds `df`, replacing the array-column DataFrame from scenario 2).
df=spark.createDataFrame(data,schema=schema)

In [21]:
# Print the sample rows: col2 is null in every row, col3 only in one.
df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
|   1|null|   1|
|   1|null|   1|
|   1|null|null|
|   1|null|   1|
+----+----+----+



In [53]:
def getDropList(df):
    """Return the names of the columns of ``df`` in which every value is null.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to inspect. Only ``df.rdd`` and ``df.columns`` are used.

    Returns
    -------
    list of str
        Names of the all-null columns, in the order they appear in
        ``df.columns``. The list is empty (never ``None``) when no column
        qualifies or when ``df`` has no rows, so the result can always be
        unpacked straight into ``df.drop(*...)``.
    """
    # Emit one (column name, "is this cell null?") pair per cell and AND the
    # flags per column. Only booleans are shuffled, and cell values never need
    # to be hashable, so array/map columns are handled as well.
    allNullByColumn = (
        df.rdd
        .flatMap(lambda row: [(name, value is None)
                              for name, value in row.asDict().items()])
        .reduceByKey(lambda left, right: left and right)
        .collectAsMap()
    )

    # Walk df.columns so the result order is deterministic; a column with no
    # entry (DataFrame without rows) is not reported.
    return [name for name in df.columns if allNullByColumn.get(name, False)]
        

In [55]:
# Drop every all-null column. `or []` keeps the unpacking safe should
# getDropList return None or an empty result (nothing to drop), in which case
# the DataFrame is returned with all of its columns.
df.drop(*(getDropList(df) or []))

col1,col3
1,1.0
1,1.0
1,
1,1.0


2