# 📘 Guided Internship Notebook: Reading & Subsetting Data in Python

## 🧾 1. Load and Preview CSV Data

In [None]:
import pandas as pd

# Load the Big Mart dataset
csv_path = "dataset/big_mart_sales.csv"
df_csv = pd.read_csv(csv_path)
# print() keeps the preview visible even when this is not the last
# expression of a notebook cell, or when the code runs as a plain script.
print(df_csv.head())

# ✅ Task 1:
# - View column names
# - Check for null values
print(df_csv.columns)
# Number of missing values in each column.
print(df_csv.isnull().sum())

# 💡 Use Case: Top 5 products with highest total sales
# Sum sales per item, sort descending, keep the first five.
top_products = (
    df_csv.groupby("Item_Identifier")["Item_Outlet_Sales"]
    .sum()
    .sort_values(ascending=False)
    .head()
)
print(top_products)

## 📑 2. Load JSON Data

In [None]:
import json

# Load the JSON data
# JSON files are normally UTF-8; decode explicitly instead of relying on the
# platform's default encoding, which differs between operating systems.
json_path = "dataset/simple.json"
with open(json_path, encoding="utf-8") as f:
    json_data = json.load(f)

# Convert to DataFrame (nested objects become dot-separated columns)
df_json = pd.json_normalize(json_data)
# print() so the preview shows even if more code follows in the same cell.
print(df_json.head())

# ✅ Task 2:
# - Flatten nested JSON
# - Filter values based on criteria

## 📊 3. Subsetting Techniques

# Each result is stored under a name and printed, so all three examples are
# visible; a bare expression is only displayed when it ends a notebook cell.

# 📌 Position-Based
# First 5 rows & specific columns (column positions 0, 1 and 3)
first_rows_subset = df_csv.iloc[:5, [0, 1, 3]]
print(first_rows_subset)

# 📌 Label-Based
# Filter rows from a specific outlet (.loc with a boolean row mask)
outlet_rows = df_csv.loc[df_csv['Outlet_Identifier'] == 'OUT049']
print(outlet_rows)

# 📌 Value-Based
# Filter where sales > 5000 and item type is 'Snack Foods'
is_high_sale = df_csv['Item_Outlet_Sales'] > 5000
is_snack_food = df_csv['Item_Type'] == 'Snack Foods'
high_sale_snacks = df_csv[is_high_sale & is_snack_food]
print(high_sale_snacks)

## 🛠️ 4. Modifying Data

# Add a new column: Profit = Sales - MRP
# The column is only created when 'Item_MRP' exists; otherwise this step is
# skipped without any message.
if 'Item_MRP' in df_csv.columns:
    sales = df_csv['Item_Outlet_Sales']
    mrp = df_csv['Item_MRP']
    df_csv['Estimated_Profit'] = sales - mrp

# Rename columns
column_renames = {'Item_Weight': 'Weight_kg'}
df_csv.rename(columns=column_renames, inplace=True)

# Drop missing values
# Removes, in place, every row that has a missing value in any column.
df_csv.dropna(axis=0, how='any', inplace=True)

## 📈 5. Additional Use Cases

# 🔍 Find items sold in maximum number of outlets
# Count distinct outlets per item, sort descending, keep the first five.
outlets_per_item = df_csv.groupby('Item_Identifier')['Outlet_Identifier'].nunique()
widest_distribution = outlets_per_item.sort_values(ascending=False).head()
# print() because more code follows, so a bare expression would not be shown.
print(widest_distribution)

# 📉 Outlier Detection for Sales
import matplotlib.pyplot as plt
import seaborn as sns
# Pass the series by keyword. Older seaborn releases read a lone positional
# argument as `x` (with a deprecation warning); 0.12+ reads it as `data`,
# which may draw the box in a different orientation. `x=` gives a horizontal
# box plot regardless of the installed version.
sns.boxplot(x=df_csv['Item_Outlet_Sales'])
plt.title("Outlier Detection: Sales")
plt.show()

## 📌 Final Task: Summary Report

# 1. Load all files (CSV, JSON)
# 2. Perform at least 3 subsetting operations
# 3. Modify dataset (add column, drop NA, rename)
# 4. Extract insights (top categories, summary stats)
# 5. Visualize at least 1 trend

# ✅ Submit: `.ipynb` file + 1-page summary in `.pdf`