In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a small, deliberately messy table to practise cleaning on.
# It contains a repeated row (ID 2 / Bob), a missing Name, a missing Age,
# a negative Age and a missing Salary.
data = dict(
    ID=[1, 2, 2, 3, 4, 5, 6, 7, 8, 9],
    Name=['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', None, 'Hannah', 'Ivan'],
    Age=[25, 30, 30, np.nan, 45, 29, -5, 40, 35, 50],
    Salary=[50000, 54000, 54000, 60000, None, 58000, 62000, 59000, 60000, 70000],
)

df = pd.DataFrame(data)
# One print call; sep="\n" puts the heading and the frame on separate lines.
print("Original Data:", df, sep="\n")

Original Data:
   ID     Name   Age   Salary
0   1    Alice  25.0  50000.0
1   2      Bob  30.0  54000.0
2   2      Bob  30.0  54000.0
3   3  Charlie   NaN  60000.0
4   4    David  45.0      NaN
5   5      Eve  29.0  58000.0
6   6    Frank  -5.0  62000.0
7   7     None  40.0  59000.0
8   8   Hannah  35.0  60000.0
9   9     Ivan  50.0  70000.0


In [3]:
# Dimensions of the raw table as (rows, columns).
df.shape

(10, 4)

In [4]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25.0,50000.0
1,2,Bob,30.0,54000.0
2,2,Bob,30.0,54000.0
3,3,Charlie,,60000.0
4,4,David,45.0,


In [5]:
# Preview the last two rows of the raw table.
df.tail(n=2)

Unnamed: 0,ID,Name,Age,Salary
8,8,Hannah,35.0,60000.0
9,9,Ivan,50.0,70000.0


In [6]:
# Summary statistics for the numeric columns. Missing values are excluded,
# so Age and Salary report a count of 9 rather than 10.
df.describe()

Unnamed: 0,ID,Age,Salary
count,10.0,9.0,9.0
mean,4.7,31.0,58555.555556
std,2.750757,15.779734,5725.188012
min,1.0,-5.0,50000.0
25%,2.25,29.0,54000.0
50%,4.5,30.0,59000.0
75%,6.75,40.0,60000.0
max,9.0,50.0,70000.0


In [7]:
# Dtype of each column before any cleaning.
df.dtypes

ID          int64
Name       object
Age       float64
Salary    float64
dtype: object

In [9]:
# Per-column non-null counts, dtypes and memory usage in one report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      10 non-null     int64  
 1   Name    9 non-null      object 
 2   Age     9 non-null      float64
 3   Salary  9 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 452.0+ bytes


In [10]:
# Peek at one randomly chosen row. random_state pins the draw so that
# Restart & Run All shows the same row on every run; re-run the cell to
# refresh the saved output.
df.sample(random_state=42)

Unnamed: 0,ID,Name,Age,Salary
1,2,Bob,30.0,54000.0


In [11]:
# Handling Missing Values
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

ID        0
Name      1
Age       1
Salary    1
dtype: int64

In [None]:
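# Alternative (not used): df.dropna(inplace=True) would drop every row that has a missing value; this notebook fills the gaps instead.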

In [12]:
# Full frame before imputation: the missing Name, Age and Salary are still present.
df

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25.0,50000.0
1,2,Bob,30.0,54000.0
2,2,Bob,30.0,54000.0
3,3,Charlie,,60000.0
4,4,David,45.0,
5,5,Eve,29.0,58000.0
6,6,Frank,-5.0,62000.0
7,7,,40.0,59000.0
8,8,Hannah,35.0,60000.0
9,9,Ivan,50.0,70000.0


In [13]:
# Mean of the non-missing ages; this is the value used to fill the Age gap.
# At this point it still includes the negative age and the duplicated row.
df.loc[:, 'Age'].mean()

31.0

In [14]:
# Fill missing values
# Assign each filled Series back to its column instead of calling
# fillna(inplace=True) on df[col]. That chained in-place form raises a
# FutureWarning on pandas 2.x and leaves df unchanged under Copy-on-Write.
# NOTE(review): the Age mean is taken while the negative age and the
# duplicated row are still in the frame - confirm that is intended.
df['Name'] = df['Name'].fillna("Unknown")
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

In [15]:
# Frame after filling the missing Name, Age and Salary values.
df

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25.0,50000.0
1,2,Bob,30.0,54000.0
2,2,Bob,30.0,54000.0
3,3,Charlie,31.0,60000.0
4,4,David,45.0,59000.0
5,5,Eve,29.0,58000.0
6,6,Frank,-5.0,62000.0
7,7,Unknown,40.0,59000.0
8,8,Hannah,35.0,60000.0
9,9,Ivan,50.0,70000.0


In [17]:
# Checking duplicates
# duplicated() flags rows that repeat an earlier row in every column;
# select just those rows for inspection.
df.loc[df.duplicated()]

Unnamed: 0,ID,Name,Age,Salary
2,2,Bob,30.0,54000.0


In [18]:
# Removing duplicates
# Keep the first occurrence of each fully repeated row and drop the rest.
# The frame is modified in place, so the index keeps a gap where the
# dropped row was.
df.drop_duplicates(keep="first", inplace=True)

In [21]:
# Frame after dropping the duplicated row (index label 2 is now absent).
df

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25.0,50000.0
1,2,Bob,30.0,54000.0
3,3,Charlie,31.0,60000.0
4,4,David,45.0,59000.0
5,5,Eve,29.0,58000.0
6,6,Frank,-5.0,62000.0
7,7,Unknown,40.0,59000.0
8,8,Hannah,35.0,60000.0
9,9,Ivan,50.0,70000.0


In [22]:
# Re-run the duplicate check; an empty result confirms none remain.
df.loc[df.duplicated()]

Unnamed: 0,ID,Name,Age,Salary


In [23]:
# Inspect the Age column before handling the invalid (negative) entry.
df.loc[:, 'Age']

0    25.0
1    30.0
3    31.0
4    45.0
5    29.0
6    -5.0
7    40.0
8    35.0
9    50.0
Name: Age, dtype: float64

In [24]:
# Handling Noise / Outliers
# Replace every negative age with the column median and leave all other
# entries untouched.
# NOTE(review): the median is computed over the whole column, so the invalid
# negative value itself takes part in it - confirm that is intended.
df['Age'] = df['Age'].mask(df['Age'] < 0, df['Age'].median())

In [25]:
# Frame after replacing the negative age with the column median.
df

Unnamed: 0,ID,Name,Age,Salary
0,1,Alice,25.0,50000.0
1,2,Bob,30.0,54000.0
3,3,Charlie,31.0,60000.0
4,4,David,45.0,59000.0
5,5,Eve,29.0,58000.0
6,6,Frank,31.0,62000.0
7,7,Unknown,40.0,59000.0
8,8,Hannah,35.0,60000.0
9,9,Ivan,50.0,70000.0


In [28]:
df.dtypes

ID          int64
Name       object
Age         int32
Salary    float64
dtype: object

In [27]:
# Data Conversion
df['Age'] = df['Age'].astype(int)

In [29]:
# Inspect the Salary column before scaling it.
df['Salary']

0    50000.0
1    54000.0
3    60000.0
4    59000.0
5    58000.0
6    62000.0
7    59000.0
8    60000.0
9    70000.0
Name: Salary, dtype: float64

In [30]:
# Normalization (Optional)
# Min-max scale Salary: the smallest salary maps to 0 and the largest to 1.
df['Salary_Normalized'] = df['Salary'].pipe(
    lambda s: (s - s.min()) / (s.max() - s.min())
)

In [31]:
# Frame with the added Salary_Normalized column.
df

Unnamed: 0,ID,Name,Age,Salary,Salary_Normalized
0,1,Alice,25,50000.0,0.0
1,2,Bob,30,54000.0,0.2
3,3,Charlie,31,60000.0,0.5
4,4,David,45,59000.0,0.45
5,5,Eve,29,58000.0,0.4
6,6,Frank,31,62000.0,0.6
7,7,Unknown,40,59000.0,0.45
8,8,Hannah,35,60000.0,0.5
9,9,Ivan,50,70000.0,1.0


In [32]:
# Exporting Cleaned Data
# index=True writes the row index as an unnamed first column of the CSV.
df.to_csv(path_or_buf="cleaned_data.csv", index=True)