In [40]:
### 1. Load and Display the Dataset  
#- Read the **eda_employee_data.csv** file into a Pandas **DataFrame**.  
#- Display the **first 5 rows** using `head()`.

import pandas as pd
employee_data=pd.read_csv("eda_employees_data.csv")
employee_data.head()

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Joining_Date,City
0,101,John Doe,28,IT,50000.0,2018-06-15,New York
1,102,Jane Smith,32,HR,60000.0,2016-09-23,Los Angeles
2,103,Emily Davis,45,Finance,80000.0,2012-11-04,Chicago
3,104,Michael Brown,29,IT,55000.0,2019-07-12,New York
4,105,Chris Wilson,35,Marketing,62000.0,2015-03-19,San Francisco


In [2]:
### 2. Find Basic Information  
#- Display **column names, data types, and missing values** using `info()`.  
#- Find the **number of rows and columns** using `shape`.

print(employee_data.info())
print(employee_data.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Employee_ID   14 non-null     int64  
 1   Name          13 non-null     object 
 2   Age           14 non-null     int64  
 3   Department    14 non-null     object 
 4   Salary        13 non-null     float64
 5   Joining_Date  14 non-null     object 
 6   City          14 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 916.0+ bytes
None
(14, 7)


In [4]:
### 3. Identify Missing Values  
#- Check for **missing values** in each column using `isnull().sum()`.

print(employee_data.isnull().sum())

Employee_ID     0
Name            1
Age             0
Department      0
Salary          1
Joining_Date    0
City            0
dtype: int64


In [45]:
### 4. Handle Missing Values  
#- Fill missing **Names** with `"Unknown"`.  
#- Fill missing **Salaries** with the **average Salary** of all employees.

employee_data['Name'].fillna("Unknown", inplace=True)
employee_data['Salary'].fillna(employee_data['Salary'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  employee_data['Salary'].fillna(employee_data['Salary'].mean(), inplace=True)


In [46]:
### 5. Identify and Remove Duplicate Records  
#- Check how many duplicate records exist.  
#- Remove **duplicate entries** from the dataset.  

print("\nNumber of Duplicate Records:", employee_data.duplicated().sum())
employee_data.drop_duplicates(inplace=True)


Number of Duplicate Records: 0


In [47]:
### 6. Correct Incorrect Values  
#- Find and **fix negative Age values** by replacing them with the average Age.  

employee_data['Age'] = employee_data['Age'].abs()
print("\nCleaned Dataset:\n", employee_data)



Cleaned Dataset:
     Employee_ID           Name  Age Department         Salary Joining_Date  \
0           101       John Doe   28         IT   50000.000000   2018-06-15   
1           102     Jane Smith   32         HR   60000.000000   2016-09-23   
2           103    Emily Davis   45    Finance   80000.000000   2012-11-04   
3           104  Michael Brown   29         IT   55000.000000   2019-07-12   
4           105   Chris Wilson   35  Marketing   62000.000000   2015-03-19   
5           106  Sarah Johnson   40         HR   72000.000000   2014-05-28   
6           107      David Lee   50    Finance   95000.000000   2010-12-17   
7           108   Robert White   27  Marketing   49000.000000   2020-08-01   
8           109        Unknown   30         IT   54000.000000   2019-11-12   
9           110     Liam Green   29    Finance   67153.846154   2011-02-14   
10          111       John Doe   28         IT   50000.000000   2018-06-15   
11          112    Anna Taylor   25         H

In [48]:
### 7. View Summary Statistics  
#- Get summary statistics for **numerical columns** using `describe()`.
employee_data.describe()


Unnamed: 0,Employee_ID,Age,Salary
count,14.0,14.0,14.0
mean,107.5,33.5,67153.846154
std,4.1833,7.480745,20010.943751
min,101.0,25.0,49000.0
25%,104.25,28.25,54250.0
50%,107.5,30.5,61000.0
75%,110.75,38.75,71500.0
max,114.0,50.0,120000.0


In [50]:
### 8. Group Employees by Department  
#- Count the number of employees in each **Department**.  
#- Find the **average Salary** for each **Department**.  

department_stats = employee_data.groupby('Department').agg(
    Employee_Count=('Employee_ID', 'count'),  # Count employees
    Average_Salary=('Salary', 'mean')        # Calculate average salary
)
print("\nDepartment Statistics:\n", department_stats)



Department Statistics:
             Employee_Count  Average_Salary
Department                                
Finance                  4    90538.461538
HR                       3    67333.333333
IT                       5    53000.000000
Marketing                2    55500.000000


In [51]:
### 9. Convert Data Types  
#- Convert **Joining_Date** to **datetime** format.  
#- Convert **Age** to an integer format.

employee_data['Joining_Date'] = pd.to_datetime(employee_data['Joining_Date'])
employee_data['Age'] = employee_data['Age'].astype(int)

In [52]:
### 10. Save the Cleaned Data to a New CSV File  
#- Save the cleaned dataset as **cleaned_eda_employee_data.csv**.
employee_data.to_csv('cleaned_eda_employee_data.csv', index=False) 
print("\nCleaned data saved to cleaned_eda_employee_data.csv")



Cleaned data saved to cleaned_eda_employee_data.csv
