# English SDK for Apache Spark

## Initialization

In [1]:
from pyspark_ai import SparkAI

# Create the SparkAI entry point that every example below goes through.
spark_ai = SparkAI(verbose=True)
# NOTE(review): presumably this is what exposes the `.ai` accessor used on DataFrames below — confirm.
spark_ai.activate()  # activate partial functions for Spark DataFrame

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/25 14:21:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Example 1: Auto sales by brand in US 2022

In [2]:
# Search the web for the query and ingest the page that is found into a DataFrame.
# If you have set up google-api-python-client, you can just run the call below with a plain search phrase.
auto_df = spark_ai.create_df("2022 USA national auto sales by brand")
auto_df.show()

[92mINFO: [0mParsing URL: https://www.carpro.com/blog/full-year-2022-national-auto-sales-by-brand

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mspark_ai_temp_view_420000[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Toyota'[39;49;00m,[37m [39;49;00m[34m1849751[39;49;00m,[37m [39;49;00m-[34m9[39;49;00m),[37m[39;49;00m
([33m'Ford'[39;49;00m,[37m [39;49;00m[34m1767439[39;49;00m,[37m [39;49;00m-[34m2[39;49;00m),[37m[39;49;00m
([33m'Chevrolet'[39;49;00m,[37m [39;49;00m[34m1502389[39;49;00m,[37m [39;49;00m[34m6[39;49;00m),[37m[39;49;00m
([33m'Honda'[39;49;00m,[37m [39;49;00m[34m881201[39;49;00m,[37m [39;49;00m-[34m33[39;49;00m),[37m[

In [6]:
# Check an expectation stated in plain English; the check code produced by the LLM is logged below.
auto_df.ai.verify("expect sales change percentage to be between -100 to 100")

[92mINFO: [0mLLM Output:
def check_sales_change_percentage(df) -> bool:
    from pyspark.sql.functions import col

    # Check if the Sales_Change_Percentage column values are between -100 and 100
    df_filtered = df.filter((col("Sales_Change_Percentage") >= -100) & (col("Sales_Change_Percentage") <= 100))

    # If the number of rows in the filtered DataFrame is equal to the number of rows in the original DataFrame, return True
    if df_filtered.count() == df.count():
        return True
    else:
        return False

result = check_sales_change_percentage(df)
[92mINFO: [0mGenerated code:
[34mdef[39;49;00m [32mcheck_sales_change_percentage[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
[37m[39;49;00m
    [37m# Check if the Sales_Change_Percentage column values are between

In [7]:
# No description is given here; the plotly code that was generated for the chart is logged below.
auto_df.ai.plot()

[92mINFO: [0mHere is the Python code to visualize the result of `df` using plotly:


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m SparkSession[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Start Spark session[39;49;00m[37m[39;49;00m
spark = SparkSession.builder.appName([33m'[39;49;00m[33mexample[39;49;00m[33m'[39;49;00m).getOrCreate()[37m[39;49;00m
[37m[39;49;00m
[37m# Assuming df is a Spark DataFrame[39;49;00m[37m[39;49;00m
[37m# Convert Spark DataFrame to Pandas DataFrame[39;49;00m[37m[39;49;00m
pandas_df = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Create a bar chart for US_Sales_2022[39;49;00m[37m[39;49;

In [8]:
# Describe the desired chart in plain English to steer the generated plotly code.
auto_df.ai.plot("pie chart for US sales market shares, show the top 5 brands and the sum of others")

[92mINFO: [0mHere is the Python code to visualize the result of `df` using plotly:


```
[34mimport[39;49;00m [04m[36mplotly[39;49;00m[04m[36m.[39;49;00m[04m[36mgraph_objects[39;49;00m [34mas[39;49;00m [04m[36mgo[39;49;00m[37m[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m SparkSession[37m[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m[37m[39;49;00m
[37m[39;49;00m
[37m# Start Spark session[39;49;00m[37m[39;49;00m
spark = SparkSession.builder.getOrCreate()[37m[39;49;00m
[37m[39;49;00m
[37m# Assuming df is a Spark DataFrame[39;49;00m[37m[39;49;00m
df_pd = df.toPandas()[37m[39;49;00m
[37m[39;49;00m
[37m# Calculate the total sales[39;49;00m[37m[39;49;00m
total_sales = df_pd[[33m'[39;49;00m[33mUS_Sales_2022[39;49;00m[33m'[39;49;00m].sum()[37m[39;49;00m
[37m[39;49;00m
[37m# Calculate the sales perce

In [9]:
# Apply a transform described in plain English to a DataFrame; the SQL generated for it is logged below.
auto_top_growth_df=auto_df.ai.transform("brand with the highest growth")
auto_top_growth_df.show()

[92mINFO: [0mCreating temp view for the transform:
df.createOrReplaceTempView([33m"[39;49;00m[33mspark_ai_temp_view_88e94e[39;49;00m[33m"[39;49;00m)[37m[39;49;00m

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00mBrand[37m [39;49;00m
[34mFROM[39;49;00m[37m [39;49;00mspark_ai_temp_view_88e94e[37m [39;49;00m
[34mORDER[39;49;00m[37m [39;49;00m[34mBY[39;49;00m[37m [39;49;00mSales_Change_Percentage[37m [39;49;00m[34mDESC[39;49;00m[37m [39;49;00m
[34mLIMIT[39;49;00m[37m [39;49;00m[34m1[39;49;00m[37m[39;49;00m

+-------+
|  Brand|
+-------+
|Genesis|
+-------+



In [10]:
# Get a plain-English explanation of what a DataFrame is retrieving.
auto_top_growth_df.ai.explain()

'In summary, this dataframe is retrieving the brand with the highest sales change percentage. It presents the results sorted by sales change percentage in descending order and limits the result to the top brand.'

## Example 2: USA Presidents

In [7]:
# You can also specify the expected columns for the ingestion (here: "president" and "vice_president").
df=spark_ai.create_df("USA presidents", ["president", "vice_president"])
df.show()

[92mINFO: [0mParsing URL: https://www.loc.gov/rr/print/list/057_chron.html

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mtemp_view_0451bc[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'George Washington'[39;49;00m,[37m [39;49;00m[33m'John Adams'[39;49;00m),[37m[39;49;00m
([33m'John Adams'[39;49;00m,[37m [39;49;00m[33m'Thomas Jefferson'[39;49;00m),[37m[39;49;00m
([33m'Thomas Jefferson'[39;49;00m,[37m [39;49;00m[33m'Aaron Burr'[39;49;00m),[37m[39;49;00m
([33m'Thomas Jefferson'[39;49;00m,[37m [39;49;00m[33m'George Clinton'[39;49;00m),[37m[39;49;00m
([33m'James Madison'[39;49;00m,[37m [39;49;00m[33m'George Clinton'[39;49;00m),[37m[39;49;00m

In [8]:
# The generated SQL (logged below) selects distinct `president` values that also appear in the
# `vice_president` column of the same temp view.
presidents_who_were_vp = df.ai.transform("presidents who were also vice presidents")
presidents_who_were_vp.show()

[92mINFO: [0mCreating temp view for the transform:
df.createOrReplaceTempView([33m"[39;49;00m[33mtemp_view_a6e278[39;49;00m[33m"[39;49;00m)[37m[39;49;00m

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m[34mDISTINCT[39;49;00m[37m [39;49;00mpresident[37m [39;49;00m
[34mFROM[39;49;00m[37m [39;49;00mtemp_view_a6e278[37m [39;49;00m
[34mWHERE[39;49;00m[37m [39;49;00mpresident[37m [39;49;00m[34mIN[39;49;00m[37m [39;49;00m([34mSELECT[39;49;00m[37m [39;49;00mvice_president[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00mtemp_view_a6e278)[37m[39;49;00m

+------------------+
|         president|
+------------------+
|        John Adams|
|  Thomas Jefferson|
|  Martin Van Buren|
|  Millard Fillmore|
|        John Tyler|
|    Andrew Johnson|
| Chester A. Arthur|
|Theodore Roosevelt|
|   Calvin Coolidge|
|   Harry S. Truman|
|    Gerald R. Ford|
| Lyndon B. Johnson|
|  Richard M. Nixon|
|       George Bush|
|   Joseph R. 

In [9]:
# Summarize in plain English what this derived DataFrame is retrieving.
presidents_who_were_vp.ai.explain()

'In summary, this dataframe is retrieving the distinct presidents who have also served as vice presidents. The data is sourced from a temporary view named `temp_view_a6e278`.'

In [10]:
# Data-quality check: the generated function (logged below) counts NULLs column by column.
presidents_who_were_vp.ai.verify("expect no NULL values")

[92mINFO: [0mLLM Output:
def has_no_nulls(df) -> bool:
    from pyspark.sql.functions import col, sum as _sum

    # Check if any column has null values
    for column in df.columns:
        nulls = df.where(col(column).isNull()).count()
        if nulls > 0:
            return False
    return True

result = has_no_nulls(df)
[92mINFO: [0mGenerated code:
[34mdef[39;49;00m [32mhas_no_nulls[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col, [36msum[39;49;00m [34mas[39;49;00m _sum[37m[39;49;00m
[37m[39;49;00m
    [37m# Check if any column has null values[39;49;00m[37m[39;49;00m
    [34mfor[39;49;00m column [35min[39;49;00m df.columns:[37m[39;49;00m
        nulls = df.where(col(column).isNull()).count()[37m[39;49;00m
        [34mif[39;49;00m nulls > [34m0[39;49;00m:[37m[39;49

## Example 3: Top 10 tech companies

In [11]:
# Search and ingest web content into a DataFrame, naming the three expected columns.
company_df=spark_ai.create_df("Top 10 tech companies by market cap", ['company', 'cap', 'country'])
company_df.show()

[92mINFO: [0mParsing URL: https://www.statista.com/statistics/1350976/leading-tech-companies-worldwide-by-market-cap/

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mtemp_view_a58845[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Apple'[39;49;00m,[37m [39;49;00m[34m2242[39;49;00m,[37m [39;49;00m[33m'USA'[39;49;00m),[37m[39;49;00m
([33m'Microsoft'[39;49;00m,[37m [39;49;00m[34m1821[39;49;00m,[37m [39;49;00m[33m'USA'[39;49;00m),[37m[39;49;00m
([33m'Alphabet (Google)'[39;49;00m,[37m [39;49;00m[34m1229[39;49;00m,[37m [39;49;00m[33m'USA'[39;49;00m),[37m[39;49;00m
([33m'Amazon'[39;49;00m,[37m [39;49;00m[34m902[39;49;00m.[34m4[39;49;00m,[37

In [17]:
# Keep only the US rows; the generated SQL (logged below) filters on country = 'USA'.
us_company_df=company_df.ai.transform("companies in USA")
us_company_df.show()

[92mINFO: [0mCreating temp view for the transform:
df.createOrReplaceTempView([33m"[39;49;00m[33mtemp_view_c4176b[39;49;00m[33m"[39;49;00m)[37m[39;49;00m

[92mINFO: [0mSQL query for the transform:
[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00mtemp_view_c4176b[37m [39;49;00m[34mWHERE[39;49;00m[37m [39;49;00mcountry[37m [39;49;00m=[37m [39;49;00m[33m'USA'[39;49;00m[37m[39;49;00m

+--------------------+------+-------+
|             company|   cap|country|
+--------------------+------+-------+
|               Apple|2242.0|    USA|
|           Microsoft|1821.0|    USA|
|   Alphabet (Google)|1229.0|    USA|
|              Amazon| 902.4|    USA|
|               Tesla| 541.4|    USA|
|              NVIDIA| 401.7|    USA|
|Meta Platforms (F...| 302.1|    USA|
+--------------------+------+-------+



In [13]:
# Explain the filtered DataFrame in plain English.
# NOTE(review): the recorded output names `temp_view_826eb9`, while the transform cell logged
# `temp_view_c4176b` — the saved outputs appear to come from different runs; re-run before publishing.
us_company_df.ai.explain()

'In summary, this dataframe is retrieving the company, capitalization, and country information for companies located in the United States from a temporary view named `temp_view_826eb9`.'

In [14]:
# Uniqueness check: the generated function (logged below) compares the distinct company count
# with the total row count.
us_company_df.ai.verify("expect all company names to be unique")

[92mINFO: [0mLLM Output:
def has_unique_company_names(df) -> bool:
    from pyspark.sql import functions as F

    # Count the number of unique company names
    unique_company_names = df.select(F.countDistinct("company")).collect()[0][0]

    # Check if the number of unique company names is equal to the total number of rows
    if unique_company_names == df.count():
        return True
    else:
        return False

result = has_unique_company_names(df)
[92mINFO: [0mGenerated code:
[34mdef[39;49;00m [32mhas_unique_company_names[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m functions [34mas[39;49;00m F[37m[39;49;00m
[37m[39;49;00m
    [37m# Count the number of unique company names[39;49;00m[37m[39;49;00m
    unique_company_names = df.select(F.countDistinct([33m"[39;49;00m[33mcompany[39;49;00m[33m"[39;49;00m)).collect()[[34m0[39;4

## Example 4: Ingestion from a URL
Instead of searching for the web page, you can also ask the SparkAI to ingest from a URL.

In [15]:
# Pass a URL instead of a search phrase to ingest that page, again naming the expected columns.
best_albums_df = spark_ai.create_df('https://time.com/6235186/best-albums-2022/', ["album", "artist", "year"])
best_albums_df.show()

[92mINFO: [0mParsing URL: https://time.com/6235186/best-albums-2022/

[92mINFO: [0mSQL query for the ingestion:
[34mCREATE[39;49;00m[37m [39;49;00m[34mOR[39;49;00m[37m [39;49;00m[34mREPLACE[39;49;00m[37m [39;49;00mTEMP[37m [39;49;00m[34mVIEW[39;49;00m[37m [39;49;00mtemp_view_854ff1[37m [39;49;00m[34mAS[39;49;00m[37m [39;49;00m[34mSELECT[39;49;00m[37m [39;49;00m*[37m [39;49;00m[34mFROM[39;49;00m[37m [39;49;00m[34mVALUES[39;49;00m[37m[39;49;00m
([33m'Motomami'[39;49;00m,[37m [39;49;00m[33m'Rosalía'[39;49;00m,[37m [39;49;00m[34m2022[39;49;00m),[37m[39;49;00m
([33m'You Can’t Kill Me'[39;49;00m,[37m [39;49;00m[33m'070 Shake'[39;49;00m,[37m [39;49;00m[34m2022[39;49;00m),[37m[39;49;00m
([33m'Mr. Morale & The Big Steppers'[39;49;00m,[37m [39;49;00m[33m'Kendrick Lamar'[39;49;00m,[37m [39;49;00m[34m2022[39;49;00m),[37m[39;49;00m
([33m'Big Time'[39;49;00m,[37m [39;49;00m[33m'Angel Olsen'[39;49;00m,[37m [39;49;00

In [16]:
# NOTE(review): in the recorded run this call raised "Could not evaluate Python code" because the
# LLM response started with the prose line "Here is your output:", which is not valid Python.
best_albums_df.ai.verify("expect each year to be 2022")

[92mINFO: [0mLLM Output:
Here is your output:

def check_year(df) -> bool:
    from pyspark.sql.functions import col
    # Check if all years are 2022
    if df.filter(col('year') != 2022).count() == 0:
        return True
    else:
        return False

result = check_year(df)
[92mINFO: [0mGenerated code:
Here [35mis[39;49;00m your output:[37m[39;49;00m
[37m[39;49;00m
[34mdef[39;49;00m [32mcheck_year[39;49;00m(df) -> [36mbool[39;49;00m:[37m[39;49;00m
    [34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m col[37m[39;49;00m
    [37m# Check if all years are 2022[39;49;00m[37m[39;49;00m
    [34mif[39;49;00m df.filter(col([33m'[39;49;00m[33myear[39;49;00m[33m'[39;49;00m) != [34m2022[39;49;00m).count() == [34m0[39;49;00m:[37m[39;49;00m
        [34mreturn[39;49;00m [34mTrue[39;49;00m[37m[39;49;00m
    [34melse[39;49;00m:[37m[3

Exception: ('Could not evaluate Python code', SyntaxError('invalid syntax', ('verify_df-CodeGen', 1, 14, 'Here is your output:\n')))

## Example 5: UDF Generation

You can also ask SparkAI to generate the code for a Spark UDF by providing a function signature with type hints and a docstring that describes the desired behavior.

In [12]:
# The body is intentionally left as `...`: the decorator produces the implementation (logged below).
# NOTE(review): the docstring presumably serves as the description the UDF is generated from —
# confirm before rewording it.
@spark_ai.udf
def convert_grades(grade_percent: float) -> str:
    """Convert the grade percent to a letter grade using standard cutoffs"""
    ...

[92mINFO: [0mCreating following Python UDF:
[34mdef[39;49;00m [32mconvert_grades[39;49;00m(grade_percent) -> [36mstr[39;49;00m:[37m[39;49;00m
    [34mif[39;49;00m grade_percent [35mis[39;49;00m [35mnot[39;49;00m [34mNone[39;49;00m:[37m[39;49;00m
        [34mif[39;49;00m grade_percent >= [34m90.0[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m'[39;49;00m[33mA[39;49;00m[33m'[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m grade_percent >= [34m80.0[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m'[39;49;00m[33mB[39;49;00m[33m'[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m grade_percent >= [34m70.0[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m'[39;49;00m[33mC[39;49;00m[33m'[39;49;00m[37m[39;49;00m
        [34melif[39;49;00m grade_percent >= [34m60.0[39;49;00m:[37m[39;49;00m
            [34mreturn[39;49;00m [33m'[39;49;00m[33mD[39;49;00m[33m'[39;49;00m[37m

In [13]:
from pyspark.sql import SparkSession

# Get the active Spark session (or create one) and register the generated function
# under the name used in the SQL expression below.
spark = SparkSession.builder.getOrCreate()
spark.udf.register("convert_grades", convert_grades)

# Small sample of (student_id, grade_percent) rows to exercise the UDF.
percent_grades = [(1, 97.8), (2, 72.3), (3, 81.2)]
# Use a dedicated name so the `df` created in Example 2 is not overwritten.
grades_df = spark.createDataFrame(percent_grades, ["student_id", "grade_percent"])
grades_df.selectExpr("student_id", "convert_grades(grade_percent)").show()

[Stage 13:>                                                         (0 + 1) / 1]

+----------+-----------------------------+
|student_id|convert_grades(grade_percent)|
+----------+-----------------------------+
|         1|                            A|
|         2|                            C|
|         3|                            B|
+----------+-----------------------------+



                                                                                

## Cache
SparkAI supports a simple cache system with an in-memory layer and a persistent layer. It keeps an in-memory staging cache, which is updated with LLM and web search results. The staging cache can be persisted through the `commit()` method. Cache lookup is always performed on both the in-memory staging cache and the persistent cache.

In [14]:
# Persist the in-memory staging cache (which is updated with LLM and web search results).
spark_ai.commit()