In [2]:
# -----------------------------------------------------------
# 🧒 PRACTICAL: Getting Started with pandas Library
# -----------------------------------------------------------
import pandas as pd
import numpy as np


In [4]:
# -----------------------------------------------------------
# 🔹 1. BASIC PANDAS DATA STRUCTURES
# -----------------------------------------------------------

# Series: a one-dimensional array whose values are addressed by label.
# Building it from a label -> value mapping keeps the labels in insertion order.
s = pd.Series({'a': 10, 'b': 20, 'c': 30, 'd': 40})
print("Series : ", s, sep="\n")

# DataFrame: a two-dimensional table; every key of the mapping becomes a column
# and rows receive the default 0..n-1 index.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 35, 40],
    Salary=[50000, 60000, 70000, 80000],
)
df = pd.DataFrame(data)
print("\nData Frame:", df, sep="\n")

Series : 
a    10
b    20
c    30
d    40
dtype: int64

Data Frame:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    David   40   80000


In [5]:
# -----------------------------------------------------------
# 🔹 2. CONVERT ARRAY TO DATAFRAME
# -----------------------------------------------------------

# Wrap a 3x2 NumPy array in a DataFrame; the array itself carries no
# column labels, so they are supplied explicitly.
array = np.array([
    [1, 2],
    [3, 4],
    [5, 6],
])
column_names = ['Column1', 'Column2']
df_from_array = pd.DataFrame(array, columns=column_names)
print("\nDataFrame from Numpy array:", df_from_array, sep="\n")


DataFrame from Numpy array:
   Column1  Column2
0        1        2
1        3        4
2        5        6


In [6]:
# -----------------------------------------------------------
# 🔹 3. INDEXING & SLICING IN DATAFRAME
# -----------------------------------------------------------

# .loc is label-based: 1 is looked up as an index label, and the
# matching row is returned as a Series keyed by column name.
print("\nRow with label 1 using loc:", df.loc[1], sep="\n")


Row with label 1 using loc:
Name        Bob
Age          30
Salary    60000
Name: 1, dtype: object


In [7]:
# .iloc is position-based; the slice end is exclusive, so [:2] yields
# the rows at positions 0 and 1.
print("\nFirst two rows using iloc : ", df.iloc[:2], sep="\n")


First two rows using iloc : 
    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000


In [9]:
# A single label inside [] returns that column as a Series.
print("\nSelect 'Name' column : ", df['Name'], sep="\n")

# A list of labels inside [] returns a DataFrame limited to those columns.
wanted_columns = ['Name', 'Age']
print("\nSelect 'Name' and 'Age' columns:", df[wanted_columns], sep="\n")

# A boolean mask inside [] keeps only the rows where the mask is True.
high_salary_mask = df['Salary'] > 60000
print("\nRows where salary>60000:", df[high_salary_mask], sep="\n")


Select 'Name' column : 
0      Alice
1        Bob
2    Charlie
3      David
Name: Name, dtype: object

Select 'Name' and 'Age' columns:
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
3    David   40

Rows where salary>60000:
      Name  Age  Salary
2  Charlie   35   70000
3    David   40   80000


In [10]:
# -----------------------------------------------------------
# 🔹 4. DATA CLEANING
# -----------------------------------------------------------

# Sample table with one gap per column; None marks the missing entry.
# In the numeric columns pandas stores the gap as NaN, which is why
# Age and Salary are displayed as floats.
data_dirty = dict(
    Name=['Alice', 'Bob', None, 'David'],
    Age=[25, None, 35, 40],
    Salary=[50000, 60000, None, 80000],
)
df_dirty = pd.DataFrame(data_dirty)
print("\nOriginal Dirty DataFrame : ", df_dirty, sep="\n")



Original Dirty DataFrame : 
    Name   Age   Salary
0  Alice  25.0  50000.0
1    Bob   NaN  60000.0
2   None  35.0      NaN
3  David  40.0  80000.0


In [11]:
# dropna() with its default arguments discards every row that has at
# least one missing value and returns the result; df_dirty is untouched.
print("\nDrop rows with any NaN values : ", df_dirty.dropna(), sep="\n")

# fillna() accepts a column -> replacement mapping, so each column gets
# its own substitute: a placeholder name, the mean age, and a zero salary.
replacements = {
    'Name': 'Unknown',
    'Age': df_dirty['Age'].mean(),
    'Salary': 0,
}
print("\nFill missing values with a specific value : ", df_dirty.fillna(replacements), sep="\n")


Drop rows with any NaN values : 
    Name   Age   Salary
0  Alice  25.0  50000.0
3  David  40.0  80000.0

Fill missing values with a specific value : 
      Name        Age   Salary
0    Alice  25.000000  50000.0
1      Bob  33.333333  60000.0
2  Unknown  35.000000      0.0
3    David  40.000000  80000.0


In [12]:
# isna() (isnull() is an alias for it) returns a same-shaped frame of
# booleans that is True wherever a value is missing.
print("\nBoolean DataFrame for missing values : ", df_dirty.isna(), sep="\n")


Boolean DataFrame for missing values : 
    Name    Age  Salary
0  False  False   False
1  False   True   False
2   True  False    True
3  False  False   False


In [13]:
# -----------------------------------------------------------
# 🔹 5. DATA MANIPULATION
# -----------------------------------------------------------

# Derive Bonus as 10% of Salary. Note: this writes the new column into
# df itself, so every later use of df sees it.
df['Bonus'] = df['Salary'].mul(0.1)
print("\nDataFrame wwith Bonus column:", df, sep="\n")

# rename() returns a relabelled copy; df keeps its 'Salary' column.
df_renamed = df.rename(columns={'Salary': 'BaseSalary'})
print("\nRename columns : ", df_renamed, sep="\n")

# sort_values() returns a sorted copy; ascending=False puts the highest Age first.
rows_by_age_desc = df.sort_values('Age', ascending=False)
print("\nSort by Age descending : ", rows_by_age_desc, sep="\n")

# Flag rows whose Age is above 30 (stored on df as 'AgeGroup'), then
# average Salary separately for the False and True groups.
df['AgeGroup'] = df['Age'].gt(30)
mean_salary_by_group = df.groupby('AgeGroup')['Salary'].mean()
print("\nGroup by operation (average salary by age > 30 or not):", mean_salary_by_group, sep="\n")



DataFrame wwith Bonus column:
      Name  Age  Salary   Bonus
0    Alice   25   50000  5000.0
1      Bob   30   60000  6000.0
2  Charlie   35   70000  7000.0
3    David   40   80000  8000.0

Rename columns : 
      Name  Age  BaseSalary   Bonus
0    Alice   25       50000  5000.0
1      Bob   30       60000  6000.0
2  Charlie   35       70000  7000.0
3    David   40       80000  8000.0

Sort by Age descending : 
      Name  Age  Salary   Bonus
3    David   40   80000  8000.0
2  Charlie   35   70000  7000.0
1      Bob   30   60000  6000.0
0    Alice   25   50000  5000.0

Group by operation (average salary by age > 30 or not):
AgeGroup
False    55000.0
True     75000.0
Name: Salary, dtype: float64


In [14]:
# Relabel 'Salary' as 'BaseSalary'. rename() hands back a new frame,
# which is bound to df_renamed; df itself is left as it was.
column_map = {'Salary': 'BaseSalary'}
df_renamed = df.rename(columns=column_map)
print("\nRename columns : ", df_renamed, sep="\n")


Rename columns : 
      Name  Age  BaseSalary   Bonus  AgeGroup
0    Alice   25       50000  5000.0     False
1      Bob   30       60000  6000.0     False
2  Charlie   35       70000  7000.0      True
3    David   40       80000  8000.0      True


In [15]:
# sort_values() returns a sorted copy; ascending=False puts the highest Age first.
rows_by_age_desc = df.sort_values('Age', ascending=False)
print("\nSort by age descending :", rows_by_age_desc, sep="\n")

# (Re)compute the Age-above-30 flag on df, then average Salary
# separately for the False and True groups.
df['AgeGroup'] = df['Age'].gt(30)
mean_salary_by_group = df.groupby('AgeGroup')['Salary'].mean()
print("\nGroup by operation (average salry by age>30 or not):", mean_salary_by_group, sep="\n")


Sort by age descending :
      Name  Age  Salary   Bonus  AgeGroup
3    David   40   80000  8000.0      True
2  Charlie   35   70000  7000.0      True
1      Bob   30   60000  6000.0     False
0    Alice   25   50000  5000.0     False

Group by operation (average salry by age>30 or not):
AgeGroup
False    55000.0
True     75000.0
Name: Salary, dtype: float64


In [16]:
# -----------------------------------------------------------
# 🔹 6. SYNTHETIC DATA CREATION FOR PRACTICE
# -----------------------------------------------------------

# Seed NumPy's global generator so the random columns come out the
# same on every run.
np.random.seed(1)
num_rows = 10

# Both random columns consume the same seeded global stream, so they
# must be drawn in this order (Score first, then Passed) to reproduce
# the same table.
ids = np.arange(1, num_rows + 1)                          # 1 .. num_rows
scores = np.random.randint(50, 100, size=num_rows)        # integers in [50, 100)
passed = np.random.choice([True, False], size=num_rows)   # random booleans

df_synth = pd.DataFrame({'ID': ids, 'Score': scores, 'Passed': passed})
print("\nSynthetic DataFrame:", df_synth, sep="\n")




Synthetic DataFrame:
   ID  Score  Passed
0   1     87   False
1   2     93    True
2   3     62   False
3   4     58   False
4   5     59    True
5   6     61    True
6   7     55   False
7   8     65    True
8   9     50    True
9  10     66    True
