In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("dataset/data.csv")

print("📏 Shape:", df.shape)
display(df.head())

📏 Shape: (29, 7)


Unnamed: 0,Index,Age,Salary,Rating,Location,Established,Easy Apply
0,0,44.0,$44k-$99k,5.4,"India,In",1999,TRUE
1,1,66.0,$55k-$66k,3.5,"New York,Ny",2002,TRUE
2,2,,$77k-$89k,-1.0,"New York,Ny",-1,-1
3,3,64.0,$44k-$99k,4.4,India In,1988,-1
4,4,25.0,$44k-$99k,6.4,Australia Aus,2002,-1


In [4]:
# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Clean Age column
df['age'] = df['age'].fillna(df['age'].mean()).round(1)

# Clean Established column
df['established'] = df['established'].replace(-1, np.nan)
df['established'] = df['established'].fillna("Unknown")

# Clean Rating column
df['rating'] = df['rating'].replace(-1, np.nan)
df['rating'] = df['rating'].fillna(df['rating'].mean()).round(1)

# Clean Location column
df['location'] = df['location'].str.replace("Australia Aus", "Australia, Aus", regex=False)
df['location'] = df['location'].str.replace("India In", "India, In", regex=False)

# Split into city and code
df['location_city'] = df['location'].str.split(",", expand=True)[0].str.strip()
df['city_code'] = df['location'].str.split(",", expand=True)[1].str.strip()
df.drop(columns=['location'], inplace=True)

# Clean Easy Apply column
df['easy_apply'] = df['easy_apply'].replace("-1", "TRUE").astype(str).str.upper()

# Clean Salary column
# Convert "$44k-$99k" → min_salary, max_salary, avg_salary
salary_clean = df['salary'].str.replace("$", "", regex=False).str.replace("k", "000", regex=False)
salary_split = salary_clean.str.split("-", expand=True)

df['min_salary'] = salary_split[0].astype(float)
df['max_salary'] = salary_split[1].astype(float)
df['avg_salary'] = df[['min_salary', 'max_salary']].mean(axis=1)

df.drop(columns=['salary'], inplace=True)

In [6]:
# Final inspection
print("\n Cleaned Dataset Preview:")
display(df.head())


 Cleaned Dataset Preview:


Unnamed: 0,index,age,rating,established,easy_apply,location_city,city_code,min_salary,max_salary,avg_salary
0,0,44.0,5.4,1999.0,True,India,In,44000.0,99000.0,71500.0
1,1,66.0,3.5,2002.0,True,New York,Ny,55000.0,66000.0,60500.0
2,2,39.0,4.3,Unknown,True,New York,Ny,77000.0,89000.0,83000.0
3,3,64.0,4.4,1988.0,True,India,In,44000.0,99000.0,71500.0
4,4,25.0,6.4,2002.0,True,Australia,Aus,44000.0,99000.0,71500.0


In [7]:
print("\n📊 Summary Stats:")
display(df.describe(include='all'))


📊 Summary Stats:


Unnamed: 0,index,age,rating,established,easy_apply,location_city,city_code,min_salary,max_salary,avg_salary
count,29.0,29.0,29.0,29.0,29,29,29,29.0,29.0,29.0
unique,,,,19.0,1,3,3,,,
top,,,,1999.0,TRUE,New York,Ny,,,
freq,,,,5.0,29,12,12,,,
mean,14.0,39.034483,4.286207,,,,,45551.724138,89103.448276,67327.586207
std,8.514693,13.973145,2.068423,,,,,16222.969182,18158.714609,14647.815453
min,0.0,13.0,0.0,,,,,10000.0,40000.0,29500.0
25%,7.0,32.0,3.4,,,,,44000.0,89000.0,71500.0
50%,14.0,39.0,4.3,,,,,44000.0,99000.0,71500.0
75%,21.0,44.0,5.4,,,,,44000.0,99000.0,71500.0


In [8]:
# Save cleaned dataset
df.to_csv("dataset/CleanedData.csv", index=False)
print("\n💾 Cleaned dataset saved to dataset/CleanedData.csv")


💾 Cleaned dataset saved to dataset/CleanedData.csv
