In [0]:
from pyspark.sql.functions import regexp_replace

# Sample rows: an id plus a product name polluted with punctuation.
data = [
    ("id1", "Product A!@#"),
    ("id2", "Item B$%^"),
    ("id3", "Value C&*-"),
]
df = spark.createDataFrame(data, schema=["id", "product_name"])

df.show()


In [0]:

# Strip special characters from 'product_name'. Letters, digits and spaces are
# kept so that words stay separated ("Product A!@#" -> "Product A").
df_cleaned = df.withColumn(
    "product_name_cleaned",
    regexp_replace("product_name", "[^a-zA-Z0-9 ]", ""),
)

df_cleaned.show()

In [0]:
# Five employees with strictly increasing salaries.
data = [
    (1, 100),
    (2, 200),
    (3, 300),
    (4, 400),
    (5, 500),
]
df = spark.createDataFrame(data, schema=["id", "salary"])
df.show()

In [0]:
# Row with the highest salary: sort descending, then take one row at the given
# offset. `{df}` in the query text is bound to the DataFrame passed as a keyword.
spark.sql(
    "select * from {df} order by salary desc limit 1 offset 0",
    df=df,
).show()

In [0]:
# Employees as (id, name, mgr); `mgr` holds the id of the employee's manager.
data = [
    (1, "ravi", 2),
    (2, "kavi", 3),
    (3, "hari", 4),
    (4, "giri", 2),
    (5, "siri", 3),
    (6, "pari", 2),
]
df = spark.createDataFrame(data, schema=["id", "name", "mgr"])
df.show()

In [0]:
# Expose the employee DataFrame to SQL cells under the name "emp".
df.createOrReplaceTempView(name="emp")

In [0]:
%sql
-- Each employee alongside their manager and that manager's manager.
-- Left joins keep employees whose manager (or senior manager) has no row in emp.
SELECT
    staff.id,
    staff.name,
    staff.mgr,
    boss.id,
    boss.name,
    boss.mgr,
    senior.id,
    senior.name
FROM emp AS staff
LEFT JOIN emp AS boss
    ON staff.mgr = boss.id
LEFT JOIN emp AS senior
    ON boss.mgr = senior.id

In [0]:
%sql
-- For every employee row, emit the id and name of that employee's manager.
-- A manager appears once per direct report.
SELECT
    boss.id,
    boss.name
FROM emp AS staff
INNER JOIN emp AS boss
    ON staff.mgr = boss.id


In [0]:
%sql
-- Number of direct reports per manager, shown with the manager's name.
WITH cte AS (
    SELECT
        mgr,
        COUNT(1) AS cnt
    FROM emp
    GROUP BY mgr
)
SELECT
    e.name,
    cnt
FROM cte
INNER JOIN emp AS e
    ON cte.mgr = e.id



In [0]:
# Directed pairs (p1, p2); some pairs also occur in the reverse direction.
data = [
    (1, 2),
    (2, 3),
    (3, 4),
    (3, 2),
    (2, 1),
    (4, 1),
]
pdf = spark.createDataFrame(data, schema=["p1", "p2"])
pdf.show()

In [0]:
# Register the pairs DataFrame (columns p1, p2) as "person". The SQL cell that
# reads this view selects p1 and p2, which exist only on `pdf`, not on `df`.
pdf.createOrReplaceTempView("person")

In [0]:
%sql
-- Count pairs irrespective of direction: (a, b) and (b, a) are normalised to
-- (smaller, larger) before grouping on the two normalised columns.
SELECT
    LEAST(p1, p2) AS frm,
    GREATEST(p1, p2) AS gtr,
    COUNT(1)
FROM person
GROUP BY 1, 2

In [0]:
from collections import Counter

# Tally how many times each item occurs in the list.
data = ['apple', 'banana', 'apple', 'orange', 'banana', 'apple']
counts = Counter(data)
print(f"Counts from list: {counts}")

# Items seen more than once, in first-seen order.
x = [item for item, occurrences in counts.items() if occurrences > 1]
print(x)

In [0]:

# Same product samples as before, plus one name with no special characters.
data = [
    ("id1", "Product A!@#"),
    ("id2", "Item B$%^"),
    ("id3", "Value C&*-"),
    ("id3", "clean"),
]
df = spark.createDataFrame(data, schema=["id", "product_name"])

df.show()

In [0]:
# Import only the functions used here; a star import from pyspark.sql.functions
# would shadow Python builtins such as sum, max, min, round and filter.
from pyspark.sql.functions import col, length

# Punctuation treated as "special". Inside the brackets the '^' is not first
# and the '-' is last, so both are matched literally.
special_char_pattern = r"[!@#$%^&*-]"

# Keep rows whose product_name contains a special character or is longer than
# five characters.
has_special_char = col("product_name").rlike(special_char_pattern)
is_long_name = length(col("product_name")) > 5
df.filter(has_special_char | is_long_name).show()

In [0]:
import pandas as pd

# Opening balance per customer policy. Every ClaimId is None, so there is no
# value to infer that column's type from; the schema is declared explicitly to
# make ClaimId a nullable string regardless of how Spark converts the frame.
balance_df = spark.createDataFrame(
    pd.DataFrame({
        'CustId': ['C1', 'C2', 'C3', 'C4'],
        'PolicyId': ['P1', 'P2', 'P3', 'P4'],
        'ClaimId': [None, None, None, None],
        'BalancePolicyAmount': [5000000, 500000, 5000000, 5000000],
        'Date': ['10/10/2020', '10/10/2020', '10/10/2020', '10/10/2020']
    }),
    schema=(
        "CustId string, PolicyId string, ClaimId string, "
        "BalancePolicyAmount long, `Date` string"
    ),
)

In [0]:
# Policy master data: activation date, active flag and insured amount.
policy_df = spark.createDataFrame(
    pd.DataFrame(
        dict(
            PolicyId=['P1', 'P2', 'P3', 'P4', 'P5'],
            ActiveDate=['10/10/2020', '10/10/2020', '12/10/2020', '13/10/2020', '14/10/2020'],
            ActiveStatus=['Y', 'Y', 'N', 'Y', 'Y'],
            PolicyAmount=[5000000, 500000, 5000000, 5000000, 5000000],
        )
    )
)

In [0]:
# Claims raised against policies; ClaimAgainstPolicy holds the PolicyId claimed on.
claims_df = spark.createDataFrame(
    pd.DataFrame(
        dict(
            ClaimId=['Claim1', 'Claim1'],
            ClaimDate=['11/10/2021', '11/10/2021'],
            ClaimAmount=[100000, 200000],
            ClaimAgainstPolicy=['P1', 'P3'],
        )
    )
)

In [0]:
# Print each frame and expose it to SQL cells as a temp view, one frame at a time.
policy_df.show()
policy_df.createOrReplaceTempView("policy")

balance_df.show()
balance_df.createOrReplaceTempView("balance")

claims_df.show()
claims_df.createOrReplaceTempView("claims")

In [0]:
%sql
-- Deduct claimed amounts from each policy's balance and write the result back
-- to erm.balance.
--
-- claim_totals: claims are summed per policy first, so the merge source has at
--   most one row per PolicyId (MERGE rejects several source rows matching the
--   same target row).
-- new_balance: remaining balance for every policy that has at least one claim.
--   Named differently from the `balance` view it reads so that references to
--   `balance` stay unambiguous.
WITH claim_totals AS (
    SELECT
        ClaimAgainstPolicy AS PolicyId,
        SUM(ClaimAmount) AS total_claimed
    FROM claims
    GROUP BY ClaimAgainstPolicy
),
new_balance AS (
    SELECT
        b.PolicyId,
        (b.BalancePolicyAmount - c.total_claimed) AS current_balance
    FROM balance AS b
    INNER JOIN claim_totals AS c
        ON c.PolicyId = b.PolicyId
)
MERGE INTO erm.balance AS t
USING new_balance AS s
    ON s.PolicyId = t.PolicyId
-- The assignment targets the merge target (t); the source (s) only supplies the value.
WHEN MATCHED THEN UPDATE SET t.BalancePolicyAmount = s.current_balance