In [2]:
import pandas as pd
import numpy as np

In [4]:
# Sample DataFrame
df = pd.DataFrame({
    'Name': ['Alice', ' Bob ', None, 'Dave'],
    'Age': [25, np.nan, 22, 29],
    'Score': [85, 90, np.nan, 88]
})

In [5]:
# 1. Check for missing values
print(df.isna())

    Name    Age  Score
0  False  False  False
1  False   True  False
2   True  False   True
3  False  False  False


In [6]:
# 2. Drop rows with any missing values
df_dropped = df.dropna()

In [8]:
# Check dropped
print(df_dropped.isna())

    Name    Age  Score
0  False  False  False
3  False  False  False


In [9]:
# 3. Fill missing values
df_filled = df.fillna({'Age': df['Age'].mean(), 'Score': 0, 'Name': 'Unknown'})

In [11]:
# check filled
print(df_filled)

      Name        Age  Score
0    Alice  25.000000   85.0
1     Bob   25.333333   90.0
2  Unknown  22.000000    0.0
3     Dave  29.000000   88.0


In [25]:
# 4. Replace values
df_replaced = df.replace({'Bob': '           Bahadur   '})
# print
print(df_replaced)

                    Name   Age  Score
0                  Alice  25.0   85.0
1             Bahadur      NaN   90.0
2                   None  22.0    NaN
3                   Dave  29.0   88.0


In [26]:
# 5. Strip whitespace from 'Name'
df_replaced['Name'] = df_replaced['Name'].str.strip()
print(df_replaced)

      Name   Age  Score
0    Alice  25.0   85.0
1  Bahadur   NaN   90.0
2     None  22.0    NaN
3     Dave  29.0   88.0


In [29]:
# 6. Rename columns
df_replaced.rename(columns={'Score': 'Exam Score'}, inplace=True)
print(df_replaced)

      Name   Age  Exam Score
0    Alice  25.0        85.0
1  Bahadur   NaN        90.0
2     None  22.0         NaN
3     Dave  29.0        88.0


In [32]:
# 7. Change data type of Age to int (after filling NaNs)
df_filled['Age'] = df_filled['Age'].fillna(0).astype(int)
print(df_filled)

      Name  Age  Score
0    Alice   25   85.0
1     Bob    25   90.0
2  Unknown   22    0.0
3     Dave   29   88.0


In [33]:
df_filled.sort_values(by='Score', ascending=False)
df_filled.sort_index()

Unnamed: 0,Name,Age,Score
0,Alice,25,85.0
1,Bob,25,90.0
2,Unknown,22,0.0
3,Dave,29,88.0


In [34]:
df_filled[df_filled['Age'] > 25]
df_filled[(df_filled['Age'] > 25) & (df_filled['Score'] > 80)]

Unnamed: 0,Name,Age,Score
3,Dave,29,88.0


In [36]:
# Using map on a Series
df_filled['Age Group'] = df_filled['Age'].map(lambda x: 'Adult' if x >= 18 else 'Minor')
print(df_filled)

      Name  Age  Score Age Group
0    Alice   25   85.0     Adult
1     Bob    25   90.0     Adult
2  Unknown   22    0.0     Adult
3     Dave   29   88.0     Adult


In [38]:
# Using apply on a row
df_filled['Name Length'] = df_filled['Name'].apply(len)
print(df_filled)

      Name  Age  Score Age Group  Name Length
0    Alice   25   85.0     Adult            5
1     Bob    25   90.0     Adult            5
2  Unknown   22    0.0     Adult            7
3     Dave   29   88.0     Adult            4


In [40]:
# Replacing multiple values
df_filled['Name'] = df_filled['Name'].replace(['Alice', 'Bob'], ['A', 'B'])
print(df_filled)

      Name  Age  Score Age Group  Name Length
0        A   25   85.0     Adult            5
1     Bob    25   90.0     Adult            5
2  Unknown   22    0.0     Adult            7
3     Dave   29   88.0     Adult            4


In [45]:
df_filled.describe()

Unnamed: 0,Age,Score,Name Length
count,4.0,4.0,4.0
mean,25.25,65.75,5.25
std,2.872281,43.881469,1.258306
min,22.0,0.0,4.0
25%,24.25,63.75,4.75
50%,25.0,86.5,5.0
75%,26.0,88.5,5.5
max,29.0,90.0,7.0


In [46]:
df_filled['Score'].mean()

np.float64(65.75)

In [47]:
df_filled['Age'].median()

np.float64(25.0)

In [48]:
df_filled['Score'].std()

np.float64(43.88146913371672)

In [52]:
df_filled.groupby('Age Group')['Score'].agg(['mean', 'max', 'min'])

Unnamed: 0_level_0,mean,max,min
Age Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adult,65.75,90.0,0.0


In [53]:
df['Name'].nunique()

3

In [54]:
df['Name'].unique()

array(['Alice', 'Bob', None, 'Dave'], dtype=object)

In [55]:
df['Age'].value_counts()


Age
25.0    1
22.0    1
29.0    1
Name: count, dtype: int64

In [56]:
df['Name'].str.upper()

0    ALICE
1      BOB
2     None
3     DAVE
Name: Name, dtype: object

In [57]:
df['Name'].str.contains("a")

0    False
1    False
2     None
3     True
Name: Name, dtype: object

In [60]:
df.set_index('Name', inplace=True)
print(df)

        Age  Score
Name              
Alice  25.0   85.0
Bob     NaN   90.0
None   22.0    NaN
Dave   29.0   88.0


In [61]:
df.reset_index(inplace=True)
print(df)

    Name   Age  Score
0  Alice  25.0   85.0
1    Bob   NaN   90.0
2   None  22.0    NaN
3   Dave  29.0   88.0


In [64]:
df.query('Age > 25 and Score > 80')

Unnamed: 0,Name,Age,Score
3,Dave,29.0,88.0


In [67]:
df_filled.pivot_table(values='Score', index='Age Group', aggfunc='mean')

Unnamed: 0_level_0,Score
Age Group,Unnamed: 1_level_1
Adult,65.75


In [68]:
df_filled.to_csv("cleaned_data.csv", index=False)

In [69]:
pd.read_csv("cleaned_data.csv")

Unnamed: 0,Name,Age,Score,Age Group,Name Length
0,A,25,85.0,Adult,5
1,Bob,25,90.0,Adult,5
2,Unknown,22,0.0,Adult,7
3,Dave,29,88.0,Adult,4


In [73]:
df_filled['Category'] = pd.Categorical(['Low', 'Medium', 'High', 'Very High'])
print(df_filled)

      Name  Age  Score Age Group  Name Length   Category
0        A   25   85.0     Adult            5        Low
1     Bob    25   90.0     Adult            5     Medium
2  Unknown   22    0.0     Adult            7       High
3     Dave   29   88.0     Adult            4  Very High


In [75]:
df1 = pd.DataFrame({
    'StudentID': [1, 2],
    'Name': ['Alice', 'Bob']
})

df2 = pd.DataFrame({
    'StudentID': [3, 4],
    'Name': ['Charlie', 'David']
})

In [78]:
df_concat = pd.concat([df1, df2], ignore_index=True)
print(df_concat)

   StudentID     Name
0          1    Alice
1          2      Bob
2          3  Charlie
3          4    David


In [81]:
# Merge based on StudentID (INNER JOIN by default)

df1 = pd.DataFrame({
    'StudentID': [1, 2],
    'Name': ['Alice', 'Bob']
})

df2 = pd.DataFrame({
    'StudentID': [1, 2],
    'Name': ['Charlie', 'David']
})


df_merged = pd.merge(df1, df2, on='StudentID', how='inner')
print(df_merged)

   StudentID Name_x   Name_y
0          1  Alice  Charlie
1          2    Bob    David
