# employee data

# Mini Project — Employee Salary Analysis (Pandas)

---

## 📌 Project Overview
This project analyzes employee salary data to understand:
- Salary distribution
- Impact of experience on salary
- Salary differences by gender and position
- Highest and lowest paid roles

---

## 📂 Dataset Columns
- ID
- Gender
- Experience (Years)
- Position
- Salary


In [None]:
# 1️⃣ Import Required Libraries
import pandas as pd


: 

In [None]:
# 2️⃣ Load Dataset
# Read the employee salary CSV into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="datasets/employee_salary.csv")

# Show the first five rows as the cell output.
df.head(n=5)


## 3️⃣ Initial Data Exploration


In [None]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
print("Shape of dataset:", df.shape)

# Heading and column index on separate lines, exactly as two consecutive
# print() calls would emit them.
print("\nColumn Names:", df.columns, sep="\n")

print("\nDataset Info:")
df.info()  # info() writes its own report, so it is not wrapped in print()

# Per-column count of missing entries; kept as the last expression so the
# notebook renders it as the cell output.
print("\nMissing Values:")
df.isna().sum()


In [None]:
# Statistical summary
# describe() returns summary statistics for the DataFrame; as the last
# expression in the cell, the result is shown as the cell output.
df.describe()


## 4️⃣ Data Cleaning


In [None]:
# Give the experience column an identifier-friendly name (no spaces or
# parentheses). The rename is applied to `df` in place.
df.rename(
    mapper={'Experience (Years)': 'Experience_Years'},
    axis='columns',
    inplace=True,
)

# Preview the first five rows to confirm the new column name.
df.head(n=5)


In [None]:
# Count rows that are exact repeats of an earlier row.
n_duplicates = df.duplicated().sum()
print("Duplicate rows:", n_duplicates)

# Keep the first occurrence of each row and discard the repeats (if any).
df = df.drop_duplicates(keep='first')


## 5️⃣ Overall Salary Analysis


In [None]:
# Pull the Salary column once and report its headline statistics.
salary = df['Salary']

print("Total Salary:", salary.sum())
print("Average Salary:", salary.mean())
print("Highest Salary:", salary.max())
print("Lowest Salary:", salary.min())


## 6️⃣ Salary Analysis by Gender


In [None]:
# Salary values grouped by the Gender column.
salary_by_gender = df.groupby('Gender')['Salary']

# One row per gender; output column name -> aggregation applied to Salary.
gender_salary = salary_by_gender.agg(
    **{
        'Count': 'count',
        'Average': 'mean',
        'Minimum': 'min',
        'Maximum': 'max',
    }
)

# Display the summary table as the cell output.
gender_salary


## 7️⃣ Salary Analysis by Position


In [None]:
# Salary values grouped by the Position column.
salary_by_position = df.groupby('Position')['Salary']

# One row per position; output column name -> aggregation applied to Salary.
position_stats = salary_by_position.agg(
    **{
        'Count': 'count',
        'Average': 'mean',
        'Minimum': 'min',
        'Maximum': 'max',
    }
)

# Order positions from highest to lowest average salary.
position_salary = position_stats.sort_values(by='Average', ascending=False)

# Display the ranked table as the cell output.
position_salary


## 8️⃣ Experience vs Salary Analysis


In [None]:
# Mean salary for each distinct value of Experience_Years.
experience_salary = df['Salary'].groupby(df['Experience_Years']).mean()

# Show the first five experience levels as the cell output.
experience_salary.head(n=5)


In [None]:
# Rows where experience is strictly greater than 10 years.
senior_employees = df.loc[df['Experience_Years'].gt(10)]

# Display only the columns relevant to the experience/salary comparison.
senior_employees.loc[:, ['Position', 'Experience_Years', 'Salary']]


## 9️⃣ Highest & Lowest Earners


In [None]:
# Rank every employee by salary, highest first, then keep the first five rows.
salary_descending = df.sort_values(by='Salary', ascending=False)
top_earners = salary_descending.iloc[:5]

# Display the five highest-paid employees as the cell output.
top_earners


In [None]:
# Rank every employee by salary, lowest first, then keep the first five rows.
salary_ascending = df.sort_values(by='Salary', ascending=True)
low_earners = salary_ascending.iloc[:5]

# Display the five lowest-paid employees as the cell output.
low_earners


## 🔟 Gender Distribution by Position


In [None]:
# Number of rows for every (Position, Gender) combination present in the data.
headcount = df.groupby(['Position', 'Gender']).size()

# Pivot Gender into columns; combinations that never occur are filled with 0.
gender_position = headcount.unstack(level='Gender', fill_value=0)

# Display the Position x Gender count table as the cell output.
gender_position


## 1️⃣1️⃣ Key Insights


- Salary increases with experience, but role matters significantly
- Database Administrators and Web Developers earn the highest on average
- IT Managers have consistently high salaries
- Gender distribution varies by role
- Entry-level roles show lower salary ranges


## 1️⃣2️⃣ Save Cleaned Dataset


In [None]:
# Write the cleaned DataFrame to a new CSV file; index=False leaves the row
# index out of the written file.
df.to_csv("datasets/employee_salary_cleaned.csv", index=False)


# ✅ Project Completed Successfully

### Skills Applied:
- Pandas Basics
- Data Cleaning
- Data Selection & Filtering
- GroupBy & Aggregation
- Real-world HR Salary Analysis


# sales data 

In [None]:
# Step 1: Load the CSV data with pandas, then convert the numeric columns to NumPy.
# (Come back to this step once pandas has been covered.)
import pandas as pd
import numpy as np

# Load the dataset
file_path = "path_to_your_sales_data.csv"  # Replace with your actual file path

# pandas decodes CSV files as UTF-8 by default. Files that came from Excel on
# Windows are often latin1 (ISO-8859-1) or cp1252 instead, which makes the
# default read fail with UnicodeDecodeError. Retry as latin1 in that case:
# latin1 maps every byte to a character, so the retry cannot fail on decoding.
try:
    df = pd.read_csv(file_path)
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='latin1')

# Display basic info
print("Data columns:", df.columns.tolist())
print("First 5 rows:")
print(df.head())

# Extract the numeric columns for the NumPy analysis as a 2-D array.
# Column order in the array: 0 = QUANTITYORDERED, 1 = PRICEEACH, 2 = SALES
data_numeric = df[['QUANTITYORDERED', 'PRICEEACH', 'SALES']].to_numpy()

print("\nSample numeric data:")
print(data_numeric[:5])

# Step 2: Sales Data Summary using NumPy
# data_numeric columns: 0 = QUANTITYORDERED, 1 = PRICEEACH, 2 = SALES

# Total sales amount
total_sales = np.sum(data_numeric[:, 2])
print(f"Total Sales Amount: ${total_sales:,.2f}")

# Average sales per order
average_sales = np.mean(data_numeric[:, 2])
print(f"Average Sales per Order: ${average_sales:,.2f}")

# Total quantity ordered
# The quantity column shares one array with the price and sales columns, so it
# is presumably stored as float there (confirm against the dataset). Format
# with zero decimals and a thousands separator so the count prints as a whole
# number rather than e.g. "99067.0".
total_quantity = np.sum(data_numeric[:, 0])
print(f"Total Quantity Ordered: {total_quantity:,.0f}")

# Average price each
average_price_each = np.mean(data_numeric[:, 1])
print(f"Average Price Each: ${average_price_each:.2f}")

# Step 3: Monthly Sales Analysis

# Parse ORDERDATE into datetimes, then derive a monthly period for each order
# and store it in a new YearMonth column.
order_dates = pd.to_datetime(df['ORDERDATE'])
df['ORDERDATE'] = order_dates
df['YearMonth'] = df['ORDERDATE'].dt.to_period('M')

# Total SALES for each YearMonth period.
monthly_sales = df['SALES'].groupby(df['YearMonth']).sum()

# Heading and table on separate lines, as two consecutive prints would emit.
print("\nMonthly Sales Summary:", monthly_sales, sep="\n")

# Plain NumPy array of the monthly totals for further processing.
monthly_sales_np = monthly_sales.to_numpy()

# Step 4: Product Line Sales Analysis

# Total SALES for each PRODUCTLINE value.
product_line_sales = df['SALES'].groupby(df['PRODUCTLINE']).sum()

# Heading and table on separate lines, as two consecutive prints would emit.
print("\nSales by Product Line:", product_line_sales, sep="\n")

# The same totals as a plain NumPy array (optional).
product_line_sales_np = product_line_sales.to_numpy()

# Step 5: Top 3 Customers by Sales

# Total SALES per customer, ranked from largest to smallest.
sales_per_customer = df.groupby('CUSTOMERNAME')['SALES'].sum()
ranked_customers = sales_per_customer.sort_values(ascending=False)

# Keep the three customers at the top of the ranking.
top_customers = ranked_customers.iloc[:3]

# Heading and table on separate lines, as two consecutive prints would emit.
print("\nTop 3 Customers by Sales:", top_customers, sep="\n")

# Step 6: Additional Insights

# Mean SALES value within each DEALSIZE category.
avg_order_by_dealsize = df['SALES'].groupby(df['DEALSIZE']).mean()
print("\nAverage Sales by Deal Size:", avg_order_by_dealsize, sep="\n")

# How many rows carry each STATUS value.
status_column = df['STATUS']
order_status_counts = status_column.value_counts()
print("\nOrder Status Counts:", order_status_counts, sep="\n")