In [None]:
# pandas_operations.py
import pandas as pd

# Read the Superstore CSV into a DataFrame.
df = pd.read_csv('superstore.csv')

# Structural overview: first rows, (rows, columns) shape, per-column dtypes.
for overview in (df.head(), df.shape, df.dtypes):
    print(overview)

# Project the frame onto the three columns of interest.
columns_of_interest = ['customer', 'product', 'profit']
print(df[columns_of_interest])

# Keep rows whose profit exceeds 2000 and whose discount is exactly 0.
is_high_profit = df['profit'] > 2000
is_undiscounted = df['discount'] == 0
filtered = df[is_high_profit & is_undiscounted]
print(filtered)

# Order rows from highest to lowest profit.
sorted_df = df.sort_values('profit', ascending=False)
print(sorted_df)

# Per category: summed profit and mean discount (named aggregation keeps the
# output columns called 'profit' and 'discount').
grouped = df.groupby('category').agg(
    profit=('profit', 'sum'),
    discount=('discount', 'mean'),
)
print(grouped)

# Derive totalprice, remove subcategory and default the missing discounts in
# one chained expression. Each step returns a new frame, so nothing is mutated
# in place (no inplace=True) and the whole transformation reads top-down.
df = (
    df
    # totalprice = quantity * unitprice, computed per row
    .assign(totalprice=df['quantity'] * df['unitprice'])
    # drop subcategory column
    .drop(columns=['subcategory'])
    # missing discount values become 0.10; present values are left untouched
    .assign(discount=lambda frame: frame['discount'].fillna(0.10))
)

# apply classification based on profit
def classify(row):
    """Return the profit tier label for a single row.

    Args:
        row: any object supporting ``row['profit']`` (it is passed each row
            by ``DataFrame.apply(..., axis=1)``).

    Returns:
        'high' when profit is greater than 4000, 'medium' when it is greater
        than 0 (up to and including 4000), and 'low' otherwise. A value for
        which both comparisons are false, such as NaN, also yields 'low'.
    """
    profit = row['profit']
    if profit > 4000:
        return 'high'
    return 'medium' if profit > 0 else 'low'

# Run classify over the frame row by row (axis=1) and store the labels
# as a new profit_class column.
profit_labels = df.apply(classify, axis=1)
df['profit_class'] = profit_labels

# Show the frame after all transformations.
print(df)


In [None]:
# pyspark_operations.py 
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, year, month
from pyspark.sql.types import DateType

# Get the active SparkSession, or start one with the app name "superstore".
spark = (
    SparkSession.builder
    .appName("superstore")
    .getOrCreate()
)

# Read the CSV using its first line as the header and letting Spark infer
# the column types.
df_spark = spark.read.csv(
    "superstore.csv",
    header=True,
    inferSchema=True,
)

# Inspect the inferred schema and the first five rows.
df_spark.printSchema()
df_spark.show(5)

# Project three columns, exposing customer under the name client.
client_view = df_spark.select(col("customer").alias("client"), "product", "profit")
client_view.show()

# Rows in the Consumer segment whose profit is below 1000.
is_consumer = col("segment") == "Consumer"
profit_below_1000 = col("profit") < 1000
df_spark.where(is_consumer & profit_below_1000).show()

# Mean profit for each region.
avg_profit_by_region = df_spark.groupBy("region").agg(avg("profit").alias("avg_profit"))
avg_profit_by_region.show()

# Profit tier expression: above 2000 is "high", zero or below is "loss",
# everything else is "medium".
# NOTE(review): a null profit matches neither condition and therefore lands
# in "medium" - confirm that is intended.
profit_col = col("profit")
profit_class_expr = (
    when(profit_col > 2000, "high")
    .when(profit_col <= 0, "loss")
    .otherwise("medium")
)

# Apply every column-level transformation as one chain, in the same order
# as the individual steps they replace.
df_spark = (
    df_spark
    # totalprice = quantity * unitprice
    .withColumn("totalprice", col("quantity") * col("unitprice"))
    .withColumn("profit_class", profit_class_expr)
    # remove the subcategory column
    .drop("subcategory")
    # null discounts become 0.10
    .fillna({"discount": 0.10})
    # NOTE(review): presumably orderdate is in a format the date cast
    # understands - verify against the data.
    .withColumn("orderdate", col("orderdate").cast(DateType()))
    .withColumn("order_year", year("orderdate"))
    .withColumn("order_month", month("orderdate"))
)

# Display the transformed frame.
df_spark.show()


In [None]:
# dask_operations.py 
# install dask 
!pip install dask

import dask.dataframe as dd

# Open the CSV as a Dask DataFrame.
ddf = dd.read_csv('superstore.csv')

# Mean discount per category; .compute() materialises the result.
discount_by_category = ddf.groupby('category')['discount']
avg_discount = discount_by_category.mean().compute()
print(avg_discount)

# Rows with quantity above 1 and profit above 2000.
has_multiple_units = ddf['quantity'] > 1
is_high_profit = ddf['profit'] > 2000
filtered_ddf = ddf[has_multiple_units & is_high_profit]

# Materialise the filtered rows and write them to a single CSV file
# without the index column.
filtered_result = filtered_ddf.compute()
filtered_result.to_csv("filtered_superstore.csv", index=False)
print("filtered data saved to filtered_superstore.csv")


In [None]:
# json_reader.py
from pyspark.sql import SparkSession

# Get the active SparkSession, or start one with the app name "nested_json".
spark = (
    SparkSession.builder
    .appName("nested_json")
    .getOrCreate()
)

# Load orders.json in multi-line mode (multiLine=True).
df_json = spark.read.json("orders.json", multiLine=True)

# Print the inferred schema, then show the order id next to two nested fields
# addressed with dot paths.
df_json.printSchema()
selected_fields = ["orderid", "customer.name", "details.profit"]
df_json.select(*selected_fields).show()
