In [0]:
import pandas as pd

from io import StringIO

# CSV Data: a header line followed by ten product records, kept as one line
# per list entry and joined into a single CSV document (no trailing newline).
csv_rows = [
    "product_id,product_name,category,price,quantity",
    "101,Laptop,Electronics,55000,10",
    "102,Smartphone,Electronics,30000,25",
    "103,Chair,Furniture,2500,50",
    "104,Book,Stationery,400,200",
    "105,Headphones,Electronics,1500,100",
    "106,Table,Furniture,3200,40",
    "107,Pen,Stationery,20,500",
    "108,Monitor,Electronics,12000,15",
    "109,Notebook,Stationery,60,300",
    "110,Sofa,Furniture,45000,5",
]
csv_data = "\n".join(csv_rows)

# Expose the text through a file-like buffer so pandas can parse it
csv_file = StringIO(csv_data)
df_csv = pd.read_csv(csv_file)

# Show the dtype pandas inferred for every column
print(df_csv.dtypes)


product_id       int64
product_name    object
category        object
price            int64
quantity         int64
dtype: object


In [0]:
# JSON Data: the same ten products expressed as a list of records.
# Field names are declared once and paired with each row of values.
json_fields = ("product_id", "product_name", "category", "price", "quantity")
json_rows = [
    (101, "Laptop", "Electronics", 55000, 10),
    (102, "Smartphone", "Electronics", 30000, 25),
    (103, "Chair", "Furniture", 2500, 50),
    (104, "Book", "Stationery", 400, 200),
    (105, "Headphones", "Electronics", 1500, 100),
    (106, "Table", "Furniture", 3200, 40),
    (107, "Pen", "Stationery", 20, 500),
    (108, "Monitor", "Electronics", 12000, 15),
    (109, "Notebook", "Stationery", 60, 300),
    (110, "Sofa", "Furniture", 45000, 5),
]
json_data = [dict(zip(json_fields, row)) for row in json_rows]

# Build a DataFrame from the list of records
df_json = pd.DataFrame(json_data)

# Show the dtype pandas inferred for every column
print(df_json.dtypes)

# Compare schemas between CSV and JSON
csv_schema = df_csv.dtypes
json_schema = df_json.dtypes

print("CSV Schema:\n", csv_schema)
print("JSON Schema:\n", json_schema)

# Printing the two schemas leaves the actual comparison to the reader, so
# state the verdict explicitly: equal only if both frames have the same
# column labels (in the same order) with the same dtypes.
print("Schemas match:", csv_schema.equals(json_schema))


product_id       int64
product_name    object
category        object
price            int64
quantity         int64
dtype: object
CSV Schema:
 product_id       int64
product_name    object
category        object
price            int64
quantity         int64
dtype: object
JSON Schema:
 product_id       int64
product_name    object
category        object
price            int64
quantity         int64
dtype: object


In [0]:
# Convert CSV data to Parquet format
import pyarrow as pa
import pyarrow.parquet as pq

# Destination of the Parquet copy of the CSV-backed DataFrame
parquet_path = 'products.parquet'

# Build an Arrow table from the DataFrame, then persist it as Parquet
table = pa.Table.from_pandas(df_csv)
pq.write_table(table, parquet_path)


In [0]:
import os
import json
# Persist the CSV and JSON variants so their on-disk sizes can be compared
# with the Parquet file (products.parquet must already exist on disk).
df_csv.to_csv('products.csv', index=False)

with open('products.json', 'w') as json_file:
    json.dump(json_data, json_file)

# Size in bytes of each file, in CSV / JSON / Parquet order
csv_size, json_size, parquet_size = (
    os.path.getsize(path)
    for path in ('products.csv', 'products.json', 'products.parquet')
)

# Report each size in kilobytes (bytes / 1024)
print(f"CSV Size: {csv_size / 1024} KB")
print(f"JSON Size: {json_size / 1024} KB")
print(f"Parquet Size: {parquet_size / 1024} KB")


CSV Size: 0.3447265625 KB
JSON Size: 1.0205078125 KB
Parquet Size: 4.4130859375 KB


In [0]:
# Revenue per product = units (quantity) x unit price, stored in a new
# 'total_revenue' column on df_csv
df_csv['total_revenue'] = df_csv['quantity'] * df_csv['price']

# Preview the first rows with the new column
print(df_csv.head())


   product_id product_name     category  price  quantity  total_revenue
0         101       Laptop  Electronics  55000        10         550000
1         102   Smartphone  Electronics  30000        25         750000
2         103        Chair    Furniture   2500        50         125000
3         104         Book   Stationery    400       200          80000
4         105   Headphones  Electronics   1500       100         150000


In [0]:
# Rank products by total_revenue, highest first, and keep the three leaders
revenue_ranked = df_csv.sort_values(by='total_revenue', ascending=False)
top_3_products = revenue_ranked.head(3)
print(top_3_products)


   product_id product_name     category  price  quantity  total_revenue
1         102   Smartphone  Electronics  30000        25         750000
0         101       Laptop  Electronics  55000        10         550000
9         110         Sofa    Furniture  45000         5         225000


In [0]:
# Furniture products priced strictly above 3000: build each condition as a
# boolean mask, then select the rows where both hold
is_furniture = df_csv['category'] == 'Furniture'
price_over_3000 = df_csv['price'] > 3000
furniture_filtered = df_csv[is_furniture & price_over_3000]
print(furniture_filtered)


   product_id product_name   category  price  quantity  total_revenue
5         106        Table  Furniture   3200        40         128000
9         110         Sofa  Furniture  45000         5         225000


In [0]:
# Add a 'price_band' column based on price ranges
def assign_price_band(price):
    """Return the price band label for a single price.

    'High' when price > 10000, 'Medium' when price > 3000 (and not High),
    'Low' otherwise. Both thresholds are exclusive, so exactly 10000 is
    'Medium' and exactly 3000 is 'Low'.
    """
    # Thresholds are checked from highest to lowest; the first one exceeded wins.
    for threshold, band in ((10000, 'High'), (3000, 'Medium')):
        if price > threshold:
            return band
    return 'Low'

# Label every product's price, then attach the labels as a 'price_band' column
price_band_labels = df_csv['price'].apply(assign_price_band)
df_csv['price_band'] = price_band_labels

# Preview the first rows with the new column
print(df_csv.head())


   product_id product_name     category  ...  quantity  total_revenue  price_band
0         101       Laptop  Electronics  ...        10         550000        High
1         102   Smartphone  Electronics  ...        25         750000        High
2         103        Chair    Furniture  ...        50         125000         Low
3         104         Book   Stationery  ...       200          80000         Low
4         105   Headphones  Electronics  ...       100         150000         Low

[5 rows x 7 columns]


In [0]:
# Total quantity per category: sum within each group, then turn the
# category index back into a regular column
quantity_by_category = df_csv.groupby('category')['quantity'].sum()
category_quantity = quantity_by_category.reset_index()
print(category_quantity)


      category  quantity
0  Electronics       150
1    Furniture        95
2   Stationery      1000


In [0]:
# Mean price per category: average within each group, then turn the
# category index back into a regular column
mean_price_by_category = df_csv.groupby('category')['price'].mean()
category_avg_price = mean_price_by_category.reset_index()
print(category_avg_price)


      category    price
0  Electronics  24625.0
1    Furniture  16900.0
2   Stationery    160.0


In [0]:
# Number of products in each price band
price_band_column = df_csv['price_band']
price_band_count = price_band_column.value_counts()
print(price_band_count)


Low       5
High      4
Medium    1
Name: price_band, dtype: int64


In [0]:
# Electronics products priced strictly above 5000: build each condition as a
# boolean mask, then select the rows where both hold
is_electronics = df_csv['category'] == 'Electronics'
price_over_5000 = df_csv['price'] > 5000
electronics_filtered = df_csv[is_electronics & price_over_5000]

# Convert the subset to an Arrow table and write it out as Parquet
electronics_filtered_parquet = pa.Table.from_pandas(electronics_filtered)
pq.write_table(electronics_filtered_parquet, 'electronics_filtered.parquet')


In [0]:
# Keep only the rows whose category is Stationery
is_stationery = df_csv['category'] == 'Stationery'
stationery_filtered = df_csv[is_stationery]

# Write the subset as JSON records; lines=True emits one record per line
stationery_filtered.to_json(
    'stationery_products.json',
    orient='records',
    lines=True,
)


In [0]:
import pyarrow.parquet as pq

# Read Parquet file
df_parquet = pq.read_table('products.parquet').to_pandas()

# Check the column names
print(df_parquet.columns)

# In notebook order, products.parquet is written from df_csv before the
# 'total_revenue' column is added, so the file holds only the raw columns and
# has no 'revenue' column to aggregate (grouping on it raises KeyError).
# Derive revenue from price and quantity; assign() returns a new frame.
df_parquet = df_parquet.assign(
    total_revenue=df_parquet['price'] * df_parquet['quantity']
)

# Calculate total revenue by category (the aggregated column keeps the name
# 'total_revenue', which is the key the sort below expects)
category_revenue = df_parquet.groupby('category')['total_revenue'].sum().reset_index()

# Find category with highest total revenue
highest_revenue_category = category_revenue.sort_values(by='total_revenue', ascending=False).head(1)
print(highest_revenue_category)


In [0]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName('SparkSQL')
    .getOrCreate()
)

# Convert the pandas DataFrame to a Spark DataFrame and register it as a
# temporary view so it can be queried with SQL
spark_df = spark.createDataFrame(df_csv)
spark_df.createOrReplaceTempView("products_view")

# Products with quantity > 100 and price < 1000
products_query = "SELECT * FROM products_view WHERE quantity > 100 AND price < 1000"
result = spark.sql(products_query)

# Show the result
result.show()


+----------+------------+----------+-----+--------+-------------+----------+
|product_id|product_name|  category|price|quantity|total_revenue|price_band|
+----------+------------+----------+-----+--------+-------------+----------+
|       104|        Book|Stationery|  400|     200|        80000|       Low|
|       107|         Pen|Stationery|   20|     500|        10000|       Low|
|       109|    Notebook|Stationery|   60|     300|        18000|       Low|
+----------+------------+----------+-----+--------+-------------+----------+

