# Challenges

1. Read in JSON data and write to Parquet format
2. Join two datasets then perform group_by aggregation
3. Select subset of columns, filter rows, and add new column
4. Build expression to find correlations between columns
5. Use Polars for analysis that exceeds Pandas capabilities


In [None]:
# Minimal Polars walkthrough: build a frame, then select -> filter -> aggregate

import polars as pl

# Two-row fruit inventory, one dict per row
data = [
    {"fruit": "apple", "count": 10, "price": 0.50},
    {"fruit": "banana", "count": 20, "price": 0.25},
]
df = pl.from_dicts(data)

# Keep only the columns needed downstream
sel = df.select("fruit", "count")

# Retain the rows whose fruit is "apple"
is_apple = pl.col("fruit") == "apple"
filt = sel.filter(is_apple)

# Total the counts within each fruit group
agg = filt.group_by("fruit").agg(pl.col("count").sum())

print(agg)

In [None]:
# Generate the sample weather data files consumed by the later cells.
# The helper lives in the Create_data_helper notebook and is imported through
# the ipynb package (NOTE(review): requires ipynb to be installed and the
# helper notebook to be reachable from the working directory — confirm).
import ipynb.fs.full.Create_data_helper as cdh

# Create a CSV and a JSON file for storing weather data.
# NOTE(review): later cells read "weather_data.json", so the argument is
# presumably the base file name without extension — verify against the helper.
cdh.create_random_weather_data("weather_data")

In [None]:
# 1. Read in JSON data and write to Parquet format

import polars as pl

# Source and destination files for the format conversion
json_path = "weather_data.json"
parquet_path = "weather_data.parquet"

# Load the JSON file into a DataFrame, then persist the same frame as Parquet
weather_data = pl.read_json(json_path)
weather_data.write_parquet(parquet_path)

# Last expression of the cell: the notebook renders the frame
weather_data

In [None]:
import polars as pl

# Read the weather JSON data into a Polars DataFrame
df = pl.read_json("weather_data.json")

# Dataset 1: rows whose temperature is above the mean temperature of the frame
max_temp = df.filter(pl.col("Temperature") > pl.col("Temperature").mean())
print(max_temp)

# Dataset 2: rows whose weather condition equals "sunny"
sunny = df.filter(pl.col("Weather Condition") == "sunny")
print(sunny)

# 2. Join two datasets then perform group_by aggregation
# The inner join keeps only temperature values present in both datasets.
# Non-key columns from `sunny` that clash with `max_temp` names receive the
# "_right" suffix, so "Weather Condition" below is the left-hand column.
jd = max_temp.join(sunny, on="Temperature", how="inner")
print(jd)

# Reduce every group to summary values. A bare pl.col("Temperature") inside
# agg() applies no reduction and merely gathers each group's values into a
# list, so explicit aggregations are used here.
agg = jd.group_by("Weather Condition").agg(
    pl.col("Temperature").mean().alias("Temperature_mean"),
    pl.col("Temperature").count().alias("Temperature_count"),
)
print(agg)

In [None]:
import polars as pl

# Read the weather JSON data into a Polars DataFrame
df = pl.read_json("weather_data.json")

# 3. Select subset of columns, filter rows, and add new column
# Keep two columns, then retain the rows whose temperature is above the mean
above_mean = pl.col("Temperature") > pl.col("Temperature").mean()
df1 = df.select(["Temperature", "Humidity"]).filter(above_mean)

# Make the unit explicit in the column name (the misspelled "Tempeature_*"
# names are corrected here).
# NOTE(review): the naming treats the source values as degrees Celsius —
# confirm against the data generator.
df1 = df1.rename({"Temperature": "Temperature_in_C"})

# Derive a Fahrenheit column: F = C * 1.8 + 32
df2 = df1.with_columns(
    Temperature_in_F=(pl.col("Temperature_in_C") * 1.8) + 32
)

print(df1)
print(df2)

In [None]:
# 4. Build expression to find correlations between columns

import polars as pl

# Load the weather JSON data
df = pl.read_json("weather_data.json")

# Build the correlation expression first, then evaluate it against the frame.
# "pearson" is the default method of pl.corr; it is spelled out for clarity.
temp_humidity_corr = pl.corr("Temperature", "Humidity", method="pearson")
corr = df.select(temp_humidity_corr)
print(corr)

In [None]:
import polars as pl

# Small in-memory frame with two numeric columns
data = {
    "col1": [1, 2, 3, 4, 5],
    "col2": [2, 4, 5, 4, 5],
}
df = pl.DataFrame(data)

# Pearson correlation between the two columns: define the expression,
# then evaluate it with select()
pearson_expr = pl.corr("col1", "col2", method="pearson")
correlation = df.select(pearson_expr)
print(correlation)