📦 Installation
pip install pandas
pip install --upgrade pandas


✅ Pandas Series
import pandas as pd

s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s)
Output:

Copy code
a    10
b    20
c    30
dtype: int64


✅ Pandas DataFrame

data = {'Name': ['Alice', 'Bob'], 'Age': [25, 30]}
# Let pandas infer a dtype per column. Forcing dtype='int64' on the whole
# frame fails because the 'Name' column holds strings that cannot be cast.
df = pd.DataFrame(data, index=['r1', 'r2'])
print(df)
Output:

markdown
Copy code
     Name  Age
r1  Alice   25
r2    Bob   30
📏 Shape
python
Copy code
print(df.shape)
Output:

scss
Copy code
(2, 2)


👀 Head & Tail
python
Copy code
print(df.head(1))
print(df.tail(1))
Output:

pgsql
Copy code
     Name  Age
r1  Alice   25
     Name  Age
r2    Bob   30


🔁 Convert to NumPy array
python
Copy code
print(df.values)
Output:

lua
Copy code
[['Alice' 25]
 ['Bob' 30]]
📄 Columns
python
Copy code
print(df.columns)
Output:

pgsql
Copy code
Index(['Name', 'Age'], dtype='object')


💾 To CSV / Read CSV
python
Copy code
df.to_csv("data.csv", index=False)
df2 = pd.read_csv("data.csv")
print(df2)
Output:

markdown
Copy code
    Name  Age
0  Alice   25
1    Bob   30
🧹 Drop Columns
python
Copy code
print(df.drop(columns=['Age']))
Output:

markdown
Copy code
     Name
r1  Alice
r2    Bob


📊 Describe & Info
python
Copy code
print(df.describe())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit an extra "None" line.
df.info()
Output:

pgsql
Copy code
             Age
count   2.000000
mean   27.500000
std     3.535534
min    25.000000
25%    26.250000
50%    27.500000
75%    28.750000
max    30.000000
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, r1 to r2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Name    2 non-null      object
 1   Age     2 non-null      int64
dtypes: int64(1), object(1)
memory usage: 48.0+ bytes
🔍 iloc and loc


python
Copy code
print(df.iloc[0])
print(df.loc['r2'])
Output:

pgsql
Copy code
Name    Alice
Age        25
Name: r1, dtype: object

Name    Bob
Age      30
Name: r2, dtype: object
🎯 Access Column / Indexing / Slicing
python
Copy code
print(df['Name'])       # Column
print(df[0:1])          # Slice row
print(df[['Name', 'Age']])  # Multi-column


🔢 columns.get_loc
python
Copy code
print(df.columns.get_loc('Age'))
Output:

Copy code
1
🎯 Fancy Indexing
python
Copy code
print(df[df['Age'] > 25])
Output:

markdown
Copy code
    Name  Age
r2   Bob   30


🔁 Unique
python
Copy code
print(df['Age'].unique())
Output:

csharp
Copy code
[25 30]


📊 Sorting
python
Copy code
print(df.sort_values(by='Age', ascending=False))
Output:

markdown
Copy code
     Name  Age
r2    Bob   30
r1  Alice   25


🔢 get_dummies
python
Copy code
df3 = pd.DataFrame({'Gender': ['M', 'F', 'F']})
# pandas >= 2.0 returns boolean (True/False) dummy columns by default;
# dtype=int produces 0/1 indicator columns instead.
encoded = pd.get_dummies(df3, columns=['Gender'], dtype=int)
print(encoded)
Output:

nginx
Copy code
   Gender_F  Gender_M
0         0         1
1         1         0
2         1         0


🔁 groupby
python
Copy code
df = pd.DataFrame({
    'Team': ['A', 'A', 'B', 'B'],
    'Score': [10, 20, 15, 30]
})

print(df.groupby('Team')['Score'].mean())

# Iteration
for name, group in df.groupby('Team'):
    print(name)
    print(group)
Output:

less
Copy code
Team
A    15.0
B    22.5
Name: Score, dtype: float64
A
  Team  Score
0    A     10
1    A     20
B
  Team  Score
2    B     15
3    B     30


🔄 Pivot & Grouper
python
Copy code
df = pd.DataFrame({
    'Date': pd.to_datetime(['2023-01-01', '2023-01-01', '2023-01-02']),
    'Item': ['Apple', 'Banana', 'Apple'],
    'Sales': [100, 150, 200]
})

# Reshape long -> wide: one row per Date, one column per Item, cells hold Sales.
# Date/Item pairs that do not occur in the data become NaN.
pivoted = df.pivot(index='Date', columns='Item', values='Sales')
print(pivoted)

# Grouper by day
# Only 'Sales' is aggregated: summing the string column 'Item' would
# concatenate the item names rather than produce a meaningful total.
daily_sales = df.groupby(pd.Grouper(key='Date', freq='D'))[['Sales']].sum()
print(daily_sales)
Output (pivot):

yaml
Copy code
Item        Apple  Banana
Date                     
2023-01-01  100.0   150.0
2023-01-02  200.0     NaN
🔗 Concat
python
Copy code
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
print(pd.concat([df1, df2], ignore_index=True))
Output:

css
Copy code
   A
0  1
1  2
2  3
3  4
🔗 Merge
python
Copy code
df1 = pd.DataFrame({'id': [1, 2], 'name': ['A', 'B']})
df2 = pd.DataFrame({'id': [2, 3], 'marks': [90, 80]})
print(pd.merge(df1, df2, on='id', how='inner'))
Output:

css
Copy code
   id name  marks
0   2    B     90
🧹 dropna, fillna, interpolate
python
Copy code
df = pd.DataFrame({'a': [1, None, 3]})
print(df.dropna())
print(df.fillna(0))
print(df.interpolate())
Output:

css
Copy code
     a
0  1.0
2  3.0

     a
0  1.0
1  0.0
2  3.0

     a
0  1.0
1  2.0
2  3.0


📈 Plotting (after installing matplotlib)

pip install matplotlib

import matplotlib.pyplot as plt
df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
df.plot(x='x', y='y')
plt.show()

In [77]:
import pandas as pd
import numpy as np

In [78]:
print(pd.__version__)
pd.DF=pd.DataFrame

# This line attaches a new attribute DF to the imported pandas module that
# refers to pd.DataFrame, so pd.DF(...) can be used instead of pd.DataFrame(...)
# for the rest of this session. DF is not part of the pandas API: it exists
# only after this assignment has run.

2.3.0


In [79]:
l1 = [10,20,30,40,50]
pds = pd.Series(l1,index = ['A','B','C','D','E'],dtype='int')
pds

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [80]:
type(pds)

pandas.core.series.Series

In [81]:
pds['B']

np.int64(20)

In [82]:
marks = {
    'Phy': [92,71,60,95,99],
    'Chem': [75,93,99,58,80],
    'Maths': [95,99,93,86,89]
}

In [83]:
marks

{'Phy': [92, 71, 60, 95, 99],
 'Chem': [75, 93, 99, 58, 80],
 'Maths': [95, 99, 93, 86, 89]}

In [84]:
df=pd.DF(marks,index=['A','B','C','D','E'])

In [85]:
df

Unnamed: 0,Phy,Chem,Maths
A,92,75,95
B,71,93,99
C,60,99,93
D,95,58,86
E,99,80,89


In [86]:
df=pd.DataFrame(marks)

In [87]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [88]:
na=np.random.randint(1,100,(10,3))
na

array([[44, 14, 65],
       [67, 79, 23],
       [66, 24, 41],
       [11, 34, 81],
       [74,  8,  3],
       [10, 58, 48],
       [89, 18, 67],
       [71, 17, 86],
       [ 4, 29, 81],
       [99, 79, 27]], dtype=int32)

In [89]:
df2=pd.DataFrame(na)
df2

Unnamed: 0,0,1,2
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [90]:
df2=pd.DataFrame(na,columns=['phy', 'Chem','Maths'])

In [91]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [92]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [93]:
df.shape

(5, 3)

In [94]:
df2.head(2)

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23


In [95]:
df2.head()

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3


In [96]:
df2.tail(3)

Unnamed: 0,phy,Chem,Maths
7,71,17,86
8,4,29,81
9,99,79,27


In [97]:
df2.tail()

Unnamed: 0,phy,Chem,Maths
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [98]:
df.values
# The .values attribute returns the underlying NumPy array
# of a DataFrame — essentially stripping away the index and column labels.

array([[92, 75, 95],
       [71, 93, 99],
       [60, 99, 93],
       [95, 58, 86],
       [99, 80, 89]])

In [99]:
df.columns

Index(['Phy', 'Chem', 'Maths'], dtype='object')

In [100]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [101]:
df.to_csv("Marks.csv")
# This line saves the DataFrame df to a CSV file named
#  Marks.csv in the current working directory.

In [102]:
ndf1=pd.read_csv('Marks.csv')
ndf1

Unnamed: 0.1,Unnamed: 0,Phy,Chem,Maths
0,0,92,75,95
1,1,71,93,99
2,2,60,99,93
3,3,95,58,86
4,4,99,80,89


In [103]:
ndf = pd.read_csv("Marks.csv",index_col='Unnamed: 0')
ndf

# ndf = pd.read_csv("Marks.csv", index_col='Unnamed: 0')
# This line reads the CSV file Marks.csv into a new DataFrame ndf, and uses the 'Unnamed: 0' column as the index.

# 🔍 Why 'Unnamed: 0'?
# When you save a DataFrame with index=True (which is the default) using:

# df.to_csv("Marks.csv")
# It adds the index as the first column, but without a column name. So, when you read it back, pandas names that column 'Unnamed: 0'.

# 

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [104]:
df.to_csv("Marks.csv",sep='|',index = False)

# This line saves the DataFrame df to a CSV file named Marks.csv, using:

# | (pipe) as the separator instead of the default comma ,

# index=False to exclude the index column from the file



In [105]:
ndf = pd.read_csv("Marks.csv",sep='|')
ndf

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [106]:
ndf.index = ['A','B','C','D','E']
ndf

Unnamed: 0,Phy,Chem,Maths
A,92,75,95
B,71,93,99
C,60,99,93
D,95,58,86
E,99,80,89


In [107]:
df.drop(columns=['Chem'])

# This line drops the column named 'Chem' from the DataFrame df.

# 🔍 Important Notes:
# Non-destructive by default: It returns a new DataFrame 
# without 'Chem', but df itself remains unchanged unless 
# you assign it or use inplace=True.

# Raises error if 'Chem' doesn't exist.

Unnamed: 0,Phy,Maths
0,92,95
1,71,99
2,60,93
3,95,86
4,99,89


In [108]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [109]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [110]:
df2.describe()

# 🔍 What it does:
# Generates summary statistics for numeric columns in DataFrame df2.

# Useful for quick inspection of data distribution, spread, and outliers.



Unnamed: 0,phy,Chem,Maths
count,10.0,10.0,10.0
mean,53.5,36.0,52.2
std,34.374571,26.524622,28.35411
min,4.0,8.0,3.0
25%,19.25,17.25,30.5
50%,66.5,26.5,56.5
75%,73.25,52.0,77.5
max,99.0,79.0,86.0


In [111]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   phy     10 non-null     int32
 1   Chem    10 non-null     int32
 2   Maths   10 non-null     int32
dtypes: int32(3)
memory usage: 252.0 bytes


In [112]:
df2.drop(columns=['phy'])

Unnamed: 0,Chem,Maths
0,14,65
1,79,23
2,24,41
3,34,81
4,8,3
5,58,48
6,18,67
7,17,86
8,29,81
9,79,27


In [113]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [114]:
df2[['Chem','phy']]

Unnamed: 0,Chem,phy
0,14,44
1,79,67
2,24,66
3,34,11
4,8,74
5,58,10
6,18,89
7,17,71
8,29,4
9,79,99


In [115]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [116]:
df.iloc[[2,3,4]]

Unnamed: 0,Phy,Chem,Maths
2,60,99,93
3,95,58,86
4,99,80,89


In [117]:
df.iloc[3,2]

# df.iloc[3, 2] is a Pandas command used to access a specific cell value in a DataFrame using integer-based indexing.

# 🔍 Explanation:
# df: Your DataFrame.

# .iloc: Used for integer-location based indexing.

# [3, 2]:

# 3 → refers to the 4th row (index starts from 0).

# 2 → refers to the 3rd column.



np.int64(86)

In [118]:
df.iloc[::2,::2]

Unnamed: 0,Phy,Maths
0,92,95
2,60,93
4,99,89


In [119]:
ndf = df[df['Chem'] >= 70]         # Step 1: Filter rows where Chemistry marks are >= 70
ndf = ndf[ndf['Phy'] >= 75]        # Step 2: Further filter those where Physics marks are >= 75
ndf                                # Step 3: Show the resulting filtered DataFrame


Unnamed: 0,Phy,Chem,Maths
0,92,75,95
4,99,80,89


In [120]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [121]:
df2['Chem'].unique()

# ✅ What it does:
# Accesses the column Chem: df2['Chem']

# Applies .unique() to get all distinct values in that column (as a NumPy array).



array([14, 79, 24, 34,  8, 58, 18, 17, 29], dtype=int32)

In [122]:
df2['phy'].value_counts()

# This line returns the count of each unique value in the
#  'phy' column of the DataFrame df2, sorted in descending
#  order of count.



phy
44    1
67    1
66    1
11    1
74    1
10    1
89    1
71    1
4     1
99    1
Name: count, dtype: int64

In [123]:
a = np.array([1,3,1,2,4,1,3,2,4,1,2,3,1,4])
np.unique(a,return_counts=True)

# 🔍 Explanation:
# np.unique(a) → Returns sorted unique values from the array.

# return_counts=True → Also returns how many times each unique value appears.



(array([1, 2, 3, 4]), array([5, 3, 3, 3]))

In [124]:
df

Unnamed: 0,Phy,Chem,Maths
0,92,75,95
1,71,93,99
2,60,99,93
3,95,58,86
4,99,80,89


In [125]:
df.sort_values(by ='Chem',ascending=False)

Unnamed: 0,Phy,Chem,Maths
2,60,99,93
1,71,93,99
4,99,80,89
0,92,75,95
3,95,58,86


In [126]:
# NOTE: passing an existing DataFrame together with index=... does not relabel
# the rows; it selects rows by those labels. df is indexed 0..4 here, so none
# of 'A'..'E' match and every cell comes back as NaN.
# To relabel the rows while keeping the data, assign df.index = [...] instead.
df=pd.DataFrame(df,index=['A','B','C','D','E'])
df

Unnamed: 0,Phy,Chem,Maths
A,,,
B,,,
C,,,
D,,,
E,,,


In [127]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [128]:
array=np.random.randint(50,100,(5,3))
array

array([[67, 79, 71],
       [94, 85, 78],
       [59, 98, 50],
       [86, 81, 96],
       [72, 51, 74]], dtype=int32)

In [129]:
df1=pd.DataFrame(array)
df1

Unnamed: 0,0,1,2
0,67,79,71
1,94,85,78
2,59,98,50
3,86,81,96
4,72,51,74


In [130]:
# NOTE: with a DataFrame as input, columns=... selects existing columns by
# label. df1's columns are 0, 1, 2, so 'DBMS', 'OS' and 'CN' do not exist and
# come back as all-NaN columns. To name the columns, build the frame from the
# raw array instead: pd.DataFrame(array, columns=[...]).
dummy=pd.DataFrame(df1, columns=['DBMS','OS','CN'])

In [131]:
dummy

Unnamed: 0,DBMS,OS,CN
0,,,
1,,,
2,,,
3,,,
4,,,


In [135]:
import pandas as pd
import numpy as np

array = np.array([
    [85, 78, 92],
    [88, 80, 79],
    [75, 85, 89],
    [90, 82, 84]
])

dummy = pd.DataFrame(array, columns=['DBMS', 'OS', 'CN'])

print(dummy)


   DBMS  OS  CN
0    85  78  92
1    88  80  79
2    75  85  89
3    90  82  84


In [133]:
df2

Unnamed: 0,phy,Chem,Maths
0,44,14,65
1,67,79,23
2,66,24,41
3,11,34,81
4,74,8,3
5,10,58,48
6,89,18,67
7,71,17,86
8,4,29,81
9,99,79,27


In [136]:
pds = pd.Series(['A','B','C','B','A','B'])

In [137]:
pds

0    A
1    B
2    C
3    B
4    A
5    B
dtype: object

In [None]:
pdd = pd.get_dummies(pds,dtype='i1')
pdd

# ✅ What it does:
# pd.get_dummies(...): Converts categorical columns in a DataFrame or Series into one-hot encoded (dummy) variables.

# pds: Your original DataFrame or Series.

# dtype='i1': Sets the data type of the output dummy columns to int8 (i1 is NumPy shorthand).

Unnamed: 0,A,B,C
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,1,0,0
5,0,1,0


In [139]:
import pandas as pd

pds = pd.DataFrame({
    'Branch': ['CSE', 'ECE', 'ME', 'CSE', 'ME']
})

pdd = pd.get_dummies(pds, dtype='i1')
print(pdd)


   Branch_CSE  Branch_ECE  Branch_ME
0           1           0          0
1           0           1          0
2           0           0          1
3           1           0          0
4           0           0          1


In [144]:
data = pd.read_csv("weather_data_cities.csv")
data

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,33,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,5,Rain
8,1/1/2017,paris,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy
10,1/3/2017,paris,54,8,Cloudy
11,1/4/2017,paris,42,10,Cloudy


In [143]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("zaraavagyan/weathercsv")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/zaraavagyan/weathercsv?dataset_version_number=1...


100%|██████████| 10.4k/10.4k [00:00<00:00, 7.74MB/s]

Extracting files...
Path to dataset files: C:\Users\asus\.cache\kagglehub\datasets\zaraavagyan\weathercsv\versions\1





In [None]:
data[data['city'] == 'new york']['temperature'].mean()

# 🔍 Step-by-Step Breakdown:
# data['city'] == 'new york'
# → Creates a boolean mask where city is "new york".

# data[data['city'] == 'new york']
# → Filters the DataFrame to include only those rows.

# ['temperature']
# → Selects the 'temperature' column from the filtered data.

# .mean()
# → Computes the average temperature for New York.


np.float64(32.25)

In [146]:
data[data['city'] == 'mumbai']['temperature'].mean()

np.float64(88.5)

In [None]:
gdf = data.groupby(by = 'city')
for city,group in gdf:
    print(city)
    print(group)
    print()


# 🔍 Explanation:
# data.groupby(by='city'):

# Groups the DataFrame data based on unique values in the 'city' column.

# gdf becomes a GroupBy object — an iterable of (group_key, group_df) pairs.

# for city, group in gdf::

# Iterates over each group.

# city: the name of the city (the group key).

# group: a DataFrame containing all rows from data where 'city' == city.

# print(city):

# Prints the name of the current group (city).

# print(group):

# Prints the DataFrame of rows for that city.



mumbai
        day    city  temperature  windspeed  event
4  1/1/2017  mumbai           90          5  Sunny
5  1/2/2017  mumbai           85         12    Fog
6  1/3/2017  mumbai           87         15    Fog
7  1/4/2017  mumbai           92          5   Rain

new york
        day      city  temperature  windspeed  event
0  1/1/2017  new york           32          6   Rain
1  1/2/2017  new york           36          7  Sunny
2  1/3/2017  new york           28         12   Snow
3  1/4/2017  new york           33          7  Sunny

paris
         day   city  temperature  windspeed   event
8   1/1/2017  paris           45         20   Sunny
9   1/2/2017  paris           50         13  Cloudy
10  1/3/2017  paris           54          8  Cloudy
11  1/4/2017  paris           42         10  Cloudy



In [148]:
gdf['temperature'].mean()

city
mumbai      88.50
new york    32.25
paris       47.75
Name: temperature, dtype: float64

In [149]:
gdf = data.groupby(by = 'event')
gdf['windspeed'].mean()


event
Cloudy    10.333333
Fog       13.500000
Rain       5.500000
Snow      12.000000
Sunny      9.750000
Name: windspeed, dtype: float64

In [150]:
data

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,33,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,5,Rain
8,1/1/2017,paris,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy
10,1/3/2017,paris,54,8,Cloudy
11,1/4/2017,paris,42,10,Cloudy


In [None]:
data.pivot(columns='day',index='city')

# DataFrame.pivot(index, columns, values)
# index='city' → rows will be cities

# columns='day' → each unique day becomes a column

# values= (optional) → if not given, all columns except index and columns will become values



Unnamed: 0_level_0,temperature,temperature,temperature,temperature,windspeed,windspeed,windspeed,windspeed,event,event,event,event
day,1/1/2017,1/2/2017,1/3/2017,1/4/2017,1/1/2017,1/2/2017,1/3/2017,1/4/2017,1/1/2017,1/2/2017,1/3/2017,1/4/2017
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
mumbai,90,85,87,92,5,12,15,5,Sunny,Fog,Fog,Rain
new york,32,36,28,33,6,7,12,7,Rain,Sunny,Snow,Sunny
paris,45,50,54,42,20,13,8,10,Sunny,Cloudy,Cloudy,Cloudy


In [152]:
d1 = {
    'A' : [10,20,30,40],
    'B' : [3,4,5,6]
}
d2 = {
    'A' : [30,40,50,60],
    'C' : [3,4,5,6]
}

In [153]:
df1 = pd.DataFrame(d1)
df2 = pd.DataFrame(d2)

In [154]:
df1

Unnamed: 0,A,B
0,10,3
1,20,4
2,30,5
3,40,6


In [155]:
df2

Unnamed: 0,A,C
0,30,3
1,40,4
2,50,5
3,60,6


In [None]:
df3 = pd.concat([df1,df2],join = 'outer')
df3

# ✅ What this does:
# pd.concat([...]): Combines df1 and df2 vertically (row-wise by default).

# join='outer': Includes all columns from both DataFrames;
#  if a column is missing in one DataFrame, its values will be filled with NaN.

Unnamed: 0,A,B,C
0,10,3.0,
1,20,4.0,
2,30,5.0,
3,40,6.0,
0,30,,3.0
1,40,,4.0
2,50,,5.0
3,60,,6.0


In [None]:
mdf3 = pd.merge(df1,df2,how='outer')
mdf3

# What it does:
# pd.merge(...): Combines two DataFrames based on common columns or specified keys.

# how='outer': Performs a full outer join, keeping all rows from both DataFrames.

# Rows that don’t match on common keys will have NaN in unmatched columns.



Unnamed: 0,A,B,C
0,10,3.0,
1,20,4.0,
2,30,5.0,3.0
3,40,6.0,4.0
4,50,,5.0
5,60,,6.0


In [158]:
mdf3.dropna()

Unnamed: 0,A,B,C
2,30,5.0,3.0
3,40,6.0,4.0


In [159]:
mdf3.fillna(mdf3.mean())

Unnamed: 0,A,B,C
0,10,3.0,4.5
1,20,4.0,4.5
2,30,5.0,3.0
3,40,6.0,4.0
4,50,4.5,5.0
5,60,4.5,6.0


In [160]:
mdf3.mean()

A    35.0
B     4.5
C     4.5
dtype: float64

In [None]:
mdf3.interpolate('linear',limit_direction='both')

# ✅ Purpose:
# This line fills in missing (NaN) values in mdf3 using linear interpolation.

# 🔍 Explanation of Parameters:
# .interpolate('linear'): Performs linear interpolation, i.e., estimates missing values based on surrounding numeric values in the column.

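# limit_direction='both':

# Allows NaNs to be filled in both the forward and the backward direction.

# This ensures that even NaNs at the start or end of a column are filled. Linear interpolation does not extrapolate, so those edge NaNs take the nearest valid value (here column C starts 3.0, 3.0, 3.0 and column B ends 6.0, 6.0, 6.0).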




Unnamed: 0,A,B,C
0,10,3.0,3.0
1,20,4.0,3.0
2,30,5.0,3.0
3,40,6.0,4.0
4,50,6.0,5.0
5,60,6.0,6.0
