# [PySpark AI](https://github.com/databrickslabs/pyspark-ai)

```shell
mamba install -c plotly plotly
```

or, using pip:

```shell
pip install pyspark-ai plotly-express
```

Strip carriage returns (`\r`) from the raw CSV before loading it:

```shell
tr -d '\r' < all_perth_310121.csv > all_perth_310121_new.csv
```

In [None]:
# import geofunctions as S
import pyspark.sql.functions as F
from langchain.chat_models import ChatOpenAI
from pyspark_ai import SparkAI
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if there is one, otherwise start a new one
# registered under the application name "SparkAI".
spark = (
    SparkSession.builder
    .appName("SparkAI")
    .getOrCreate()
)

In [None]:
# S.st_register_functions()

In [None]:
# Chat model used to turn the English prompts below into Spark code.
# temperature=0 asks the model for the least-random completions.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# verbose=True makes SparkAI log what it is doing.
spark_ai = SparkAI(
    llm,
    verbose=True,
)

# NOTE(review): per the pyspark-ai docs this enables the DataFrame helper
# methods; the cells below call spark_ai directly -- confirm it is needed.
spark_ai.activate()

In [None]:
# DDL-style schema string ("`NAME` type,`NAME` type,...") listing the CSV
# columns in file order. Names are back-quoted; types are Spark SQL type names.
# NOTE(review): NEAREST_STN_DIST is read as a string while NEAREST_SCH_DIST is
# a double -- confirm that this difference is intentional.
schema = ",".join(
    f"`{column}` {sql_type}"
    for column, sql_type in (
        ("ADDRESS", "string"),
        ("SUBURB", "string"),
        ("PRICE", "double"),
        ("BEDROOMS", "integer"),
        ("BATHROOMS", "integer"),
        ("GARAGE", "integer"),
        ("LAND_AREA", "double"),
        ("FLOOR_AREA", "double"),
        ("BUILD_YEAR", "int"),
        ("CBD_DIST", "double"),
        ("NEAREST_STN", "string"),
        ("NEAREST_STN_DIST", "string"),
        ("DATE_SOLD", "string"),
        ("POSTCODE", "string"),
        ("LATITUDE", "double"),
        ("LONGITUDE", "double"),
        ("NEAREST_SCH", "string"),
        ("NEAREST_SCH_DIST", "double"),
        ("NEAREST_SCH_RANK", "int"),
    )
)

In [None]:
# Load the cleaned CSV with the explicit schema (first line is the header) and
# mark the DataFrame for caching so later cells can reuse it.
df = (
    spark.read
    .option("header", True)
    .schema(schema)
    .csv("../data/all_perth_310121_new.csv")
    .cache()
)

In [None]:
# Count the rows; as the cell's last expression the result is shown as output.
# count() is a Spark action, so it also triggers the read of the cached DataFrame.
df.count()

In [None]:
# Ask SparkAI for a 32-bin histogram of NEAREST_SCH_DIST.
# NOTE(review): the prompt says "miles" -- confirm the unit of NEAREST_SCH_DIST.
hist_prompt = "show distribution of NEAREST_SCH_DIST less than 8 miles in 32 bins"
spark_ai.plot_df(df, hist_prompt)

In [None]:
# Let SparkAI derive a new DataFrame from df using a plain-English description.
filter_prompt = "with 3 bedrooms and 2 garages"
df2 = spark_ai.transform_df(df, filter_prompt)

In [None]:
# Ask SparkAI to plot the rows of df2 on a map from their coordinates.
map_prompt = (
    "show map of all locations using LONGITUDE and LATITUDE with only address column"
)
spark_ai.plot_df(df2, map_prompt)

In [None]:
# spark_ai.transform_df(
#     df,
#     "Create regression model to calculate the price of the house using BEDROOMS,BATHROOMS and LAND_AREA",
# )