In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw sample records, one list per column (eight rows each).
# Several columns contain None placeholders that later cells detect and fill.
data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8],
    "name": [
        "Alice", "Bob", "Charlie", "Bob",
        None, "Eve", "Frank", "Alice",
    ],
    "age": [25, 34, None, 28, 40, 22, None, 30],
    "income": [
        55000.0, 72000.5, 30000.0, None,
        120000.0, 48000.0, 60000.0, None,
    ],
    # Dates are kept as ISO-formatted strings here; they are parsed in a later cell.
    "signup_date": [
        "2024-01-15",
        "2023-11-03",
        "2024-06-10",
        "2022-12-20",
        None,
        "2024-07-01",
        "2023-08-08",
        "2024-01-15",
    ],
    "is_subscribed": [True, False, True, False, True, None, False, True],
    "rating": [1000, 2000, 1500, 3000, 1200, 2500, 2000, 1000],
}

In [3]:
# Materialise the column-oriented dict as a DataFrame (one column per key).
df = pd.DataFrame(data=data)

In [4]:
# Show the freshly built frame as the cell's output (all eight rows).
df

Unnamed: 0,id,name,age,income,signup_date,is_subscribed,rating
0,1,Alice,25.0,55000.0,2024-01-15,True,1000
1,2,Bob,34.0,72000.5,2023-11-03,False,2000
2,3,Charlie,,30000.0,2024-06-10,True,1500
3,4,Bob,28.0,,2022-12-20,False,3000
4,5,,40.0,120000.0,,True,1200
5,6,Eve,22.0,48000.0,2024-07-01,,2500
6,7,Frank,,60000.0,2023-08-08,False,2000
7,8,Alice,30.0,,2024-01-15,True,1000


In [5]:
# Print the structural summary: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             8 non-null      int64  
 1   name           7 non-null      object 
 2   age            6 non-null      float64
 3   income         6 non-null      float64
 4   signup_date    7 non-null      object 
 5   is_subscribed  7 non-null      object 
 6   rating         8 non-null      int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 580.0+ bytes


In [6]:
# Remove the `id` column from the working frame.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because `id` has already been dropped from `df`.
df = df.drop(columns="id", errors="ignore")

In [7]:
# Banner, then display the working frame.
# NOTE: by this point the `id` column has already been dropped, so the frame
# shown here is the original data minus `id`.
print("===== ORIGINAL DATAFRAME =====")
df

===== ORIGINAL DATAFRAME =====


Unnamed: 0,name,age,income,signup_date,is_subscribed,rating
0,Alice,25.0,55000.0,2024-01-15,True,1000
1,Bob,34.0,72000.5,2023-11-03,False,2000
2,Charlie,,30000.0,2024-06-10,True,1500
3,Bob,28.0,,2022-12-20,False,3000
4,,40.0,120000.0,,True,1200
5,Eve,22.0,48000.0,2024-07-01,,2500
6,Frank,,60000.0,2023-08-08,False,2000
7,Alice,30.0,,2024-01-15,True,1000


In [8]:
# Count distinct values per column.
# dropna=False makes a missing value count as one distinct value of its own.
print("\n===== UNIQUE VALUE COUNTS =====")
df.nunique(dropna=False)


===== UNIQUE VALUE COUNTS =====


name             6
age              7
income           7
signup_date      7
is_subscribed    3
rating           6
dtype: int64

In [9]:
# Record the dtype of every column before any conversion is applied.
print("\n===== COLUMN DATA TYPES (BEFORE) =====")
df.dtypes


===== COLUMN DATA TYPES (BEFORE) =====


name              object
age              float64
income           float64
signup_date       object
is_subscribed     object
rating             int64
dtype: object

In [10]:
# Normalise column dtypes in one chained expression:
#   - signup_date: parsed to datetime; errors='coerce' turns unparseable or
#     missing entries into NaT instead of raising
#   - rating: stored as 16-bit integers
#   - is_subscribed: nullable boolean dtype, so the missing flag stays missing
df = (
    df
    .assign(signup_date=lambda frame: pd.to_datetime(frame['signup_date'], errors='coerce'))
    .astype({'rating': 'int16', 'is_subscribed': 'boolean'})
)

In [11]:
# Show the dtypes again to confirm the conversions applied in the previous cell.
print("\n===== COLUMN DATA TYPES (AFTER CONVERSION) =====")
df.dtypes


===== COLUMN DATA TYPES (AFTER CONVERSION) =====


name                     object
age                     float64
income                  float64
signup_date      datetime64[ns]
is_subscribed           boolean
rating                    int16
dtype: object

In [12]:
# Tally the missing entries in each column before any imputation.
print("\n===== MISSING VALUES (BEFORE FILLING) =====")
print(df.isna().sum())


===== MISSING VALUES (BEFORE FILLING) =====
name             1
age              2
income           2
signup_date      1
is_subscribed    1
rating           0
dtype: int64


In [13]:
# Display the frame after dtype conversion (the missing signup date now shows as NaT).
df

Unnamed: 0,name,age,income,signup_date,is_subscribed,rating
0,Alice,25.0,55000.0,2024-01-15,True,1000
1,Bob,34.0,72000.5,2023-11-03,False,2000
2,Charlie,,30000.0,2024-06-10,True,1500
3,Bob,28.0,,2022-12-20,False,3000
4,,40.0,120000.0,NaT,True,1200
5,Eve,22.0,48000.0,2024-07-01,,2500
6,Frank,,60000.0,2023-08-08,False,2000
7,Alice,30.0,,2024-01-15,True,1000


In [14]:
# Impute every column that has gaps, each with its own strategy:
#   name          -> first entry of the column's mode
#   age, income   -> column median
#   signup_date   -> earliest date present in the column
#   is_subscribed -> False
df = df.assign(
    name=df['name'].fillna(df['name'].mode()[0]),
    age=df['age'].fillna(df['age'].median()),
    income=df['income'].fillna(df['income'].median()),
    signup_date=df['signup_date'].fillna(df['signup_date'].min()),
    is_subscribed=df['is_subscribed'].fillna(False),
)

In [15]:
# Display the frame after imputation so the filled values can be inspected.
df

Unnamed: 0,name,age,income,signup_date,is_subscribed,rating
0,Alice,25.0,55000.0,2024-01-15,True,1000
1,Bob,34.0,72000.5,2023-11-03,False,2000
2,Charlie,29.0,30000.0,2024-06-10,True,1500
3,Bob,28.0,57500.0,2022-12-20,False,3000
4,Alice,40.0,120000.0,2022-12-20,True,1200
5,Eve,22.0,48000.0,2024-07-01,False,2500
6,Frank,29.0,60000.0,2023-08-08,False,2000
7,Alice,30.0,57500.0,2024-01-15,True,1000


In [16]:
# Re-count missing entries per column to verify the imputation left no gaps.
print("\n===== MISSING VALUES (After FILLING) =====")
print(df.isna().sum())


===== MISSING VALUES (After FILLING) =====
name             0
age              0
income           0
signup_date      0
is_subscribed    0
rating           0
dtype: int64


In [17]:
# Derived feature: element-wise ratio of income to age.
df["income_per_age"] = df["income"].div(df["age"])