# Pandas

![pandasimage](images/pandas.png)

### Core Data Structures:

Series: One-dimensional labeled array (like a column in Excel).

DataFrame: Two-dimensional labeled data structure (like a table/spreadsheet).

### => Handling Missing Data 
-- isna(), notna(), fillna(), dropna() 
### => Powerful Data Indexing & Slicing
-- .loc[] for label-based indexing | .iloc[] for integer-based indexing
### => Read/Write from Many Sources
-- read_csv(), to_csv() | read_excel(), to_excel() | read_sql() | read_json()
### => Data Aggregation & Grouping
-- df.groupby('column').sum() | df.pivot_table()
### => Merging and Joining
-- merge() | concat() | join()
### => Data Cleaning & Transformation
-- df.rename() | df.astype() | df.apply(), df.map() | df.replace()
### => Built-in Time Series Support
-- pd.to_datetime() | df.resample('M')
### => Highly Optimized & Fast
### => Built on top of NumPy, so operations are vectorized and fast.
### => Works great with matplotlib and seaborn
### => Name "Pandas" is derived from "Panel Data"




In [3]:
import pandas as pd
import numpy as np

# Series in pandas

- its a one dimension labelled array

In [23]:
s1 = pd.Series([10, 20, 30, 40, 50, 60])
print(s1)
print(type(s1))

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64
<class 'pandas.core.series.Series'>


In [7]:
s1 = pd.Series([10, 20, 30, 40, 50, 60])
print(s1)


0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64


In [11]:
s2 = pd.Series([10, 20, 30, 40, 50, 60], index=['a', 'b', 'c', 'd', 'e', 'f'], name="scores")
print(s2)

# When converting to DataFrame
# df = s2.to_frame()
# When merging Series or using DataFrames together — name becomes the column name.
# s2.name = "marks" # in case we want to change the name of the series

a    10
b    20
c    30
d    40
e    50
f    60
Name: scores, dtype: int64


In [15]:
# create series from dict

data = {'a' : 10, 'b' : 20, 'c' : 30, 'd' : 40}
s3 = pd.Series(data, name = "marks")
print(s3)

a    10
b    20
c    30
d    40
Name: marks, dtype: int64


In [17]:
# create series from dict

data = {'a' : 10, 'b' : "20", 'c' : 1.11, 'd' : True}
s4 = pd.Series(data)
print(s4)

a      10
b      20
c    1.11
d    True
dtype: object


In [19]:
# create series from dict

data = {'a' : 10, 'b' : "20", 1 : 1.11, 2 : True}
s5 = pd.Series(data)
print(s5)

a      10
b      20
1    1.11
2    True
dtype: object


In [20]:
# create series from np.array

np_array = np.array([10, 20, 30, 40, 50, 60])
s6 = pd.Series(np_array)
print(s6)

0    10
1    20
2    30
3    40
4    50
5    60
dtype: int64


In [34]:
# create series from dict

data = {'a' : 10, 'b' : 20, 'c' : 30, 'd' : 40}
s7 = pd.Series(data, name='scores')
print(s7)

a    10
b    20
c    30
d    40
Name: scores, dtype: int64


In [36]:
print(s7.index) # it is of type object - pandas series indexes
print(s7.values) # it is of type numpy array
print(type(s7.values))
print(s7.dtypes) # gives the type of content in the series
print(s7.name)
print(s7.shape)

Index(['a', 'b', 'c', 'd'], dtype='object')
[10 20 30 40]
<class 'numpy.ndarray'>
int64
scores
(4,)


# Series indexing and slicing

In [39]:
print(s7, end = "\n\n")
print(s7.values[0]) # positional index

a    10
b    20
c    30
d    40
Name: scores, dtype: int64

10


In [53]:
s7 = pd.Series([10, 20, 30])
print(s7.loc[0])
print(s7.loc[[0,1]])
print("=" * 50)

s7 = pd.Series([10, 20, 30], index=['a','b','c'])
# print(s7.loc[0]) # this will throw an error as indexes are changed
print(s7.loc['a']) # indexing
print(s7.loc[['a', 'b']])
print("=" * 50)

s7 = pd.Series([10, 20, 30], index=['a','b','c'])
print(s7.iloc[0]) # Positional indexing
print(s7.iloc[[0, 1]])
print(s7.iloc[0:2]) # Positional indexing (exclusive)
print("=" * 50)


10
0    10
1    20
dtype: int64
10
a    10
b    20
dtype: int64
10
a    10
b    20
dtype: int64
a    10
b    20
dtype: int64


In [54]:

data = {'a' : 10, 'b' : 20, 'c' : 30, 'd' : 40, 0 : 100, 1 : 101, 2 : 400, 6:700}
s9 = pd.Series(data, name='scores')
print(s9)

a     10
b     20
c     30
d     40
0    100
1    101
2    400
6    700
Name: scores, dtype: int64


In [55]:
s9.loc[0:2] # inclusive

0    100
1    101
2    400
Name: scores, dtype: int64

In [56]:
# Loc is Label-based indexing (Index labels), Slices are inclusive
# iloc is Integer-based indexing (Positions), Slices are exclusive of end

# Slicing
# - loc [start, end] : end is inclusive
# - iloc [start, end]  :end is exclusive

# DataFrame

In [57]:
data = {
    'Name' : ['Ananth', 'Pushpak', 'Nikhil'],
    'Age' : [24, 26, 28],
    'City' : ['Bengaluru', 'Hyderabad', 'Delhi']
}
# a dict with string as keys and list as values

df = pd.DataFrame(data)
print(df)

      Name  Age       City
0   Ananth   24  Bengaluru
1  Pushpak   26  Hyderabad
2   Nikhil   28      Delhi


In [64]:
data_1 = [
    {'Name' : 'Ananth', 'Age':'24', 'City':'Bengaluru'},
    {'Name' : 'Pushpak', 'Age':'26', 'City':'Hyderabad'},
    {'Name' : 'Nikhil', 'Age':'28', 'City':'Delhi'}
]

# a list contaning dicts

df_1 = pd.DataFrame(data_1)
print(df_1)

      Name Age       City
0   Ananth  24  Bengaluru
1  Pushpak  26  Hyderabad
2   Nikhil  28      Delhi


In [59]:
arr_data = np.array([[1,2],[3,4], [5,6]])
# a 2D numpy array
df_2 = pd.DataFrame(arr_data)
print(df_2)

   0  1
0  1  2
1  3  4
2  5  6


In [60]:
df_2 = pd.DataFrame(arr_data, columns=['A', 'B']) # changing the column name
print(df_2)

   A  B
0  1  2
1  3  4
2  5  6


# Accessing the values

In [67]:
data = {
    'Name' : ['Ananth', 'Pushpak', 'Nikhil'],
    'Age' : [24, 26, 28],
    'City' : ['Bengaluru', 'Hyderabad', 'Delhi']
}

df_4 = pd.DataFrame(data)
print(df)
print(type(df_4))

      Name  Age       City
0   Ananth   24  Bengaluru
1  Pushpak   26  Hyderabad
2   Nikhil   28      Delhi
<class 'pandas.core.frame.DataFrame'>


In [72]:
print(df_4['Name'])
print("="*40)
print(df_4['Name'][[0,1]])
print("="*40)
print(df_4[['Name', 'Age']])
print("="*40)
print(df.loc[0])
print("="*40)
print(df.loc[0:1]) # Inclusive
print("="*40)
print(df.iloc[0])
print("="*40)
print(df.iloc[0:1]) # Exclusive


0     Ananth
1    Pushpak
2     Nikhil
Name: Name, dtype: object
0     Ananth
1    Pushpak
Name: Name, dtype: object
      Name  Age
0   Ananth   24
1  Pushpak   26
2   Nikhil   28
Name       Ananth
Age            24
City    Bengaluru
Name: 0, dtype: object
      Name  Age       City
0   Ananth   24  Bengaluru
1  Pushpak   26  Hyderabad
Name       Ananth
Age            24
City    Bengaluru
Name: 0, dtype: object
     Name  Age       City
0  Ananth   24  Bengaluru


# Working with real dataset

In [73]:

# https://github.com/datasciencedojo/datasets
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# https://github.com/mwaskom/seaborn-data/tree/master

iris_dataset_github_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/iris.csv"

# kaggle
kaggle_link = "https://www.kaggle.com/datasets/yasserh/titanic-dataset/data"

In [151]:
df = pd.read_csv(titanic_dataset_github_url) # raw data

### iteration in pandas 
- also done in mid. (we can pass iterrows and itertuples for now)

In [162]:

for index, row in df.iterrows():
    print(f"{index} name : {row.Name}")

0 name : Braund, Mr. Owen Harris
1 name : Cumings, Mrs. John Bradley (Florence Briggs Thayer)
2 name : Heikkinen, Miss. Laina
3 name : Futrelle, Mrs. Jacques Heath (Lily May Peel)
4 name : Allen, Mr. William Henry
5 name : Moran, Mr. James
6 name : McCarthy, Mr. Timothy J
7 name : Palsson, Master. Gosta Leonard
8 name : Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9 name : Nasser, Mrs. Nicholas (Adele Achem)
10 name : Sandstrom, Miss. Marguerite Rut
11 name : Bonnell, Miss. Elizabeth
12 name : Saundercock, Mr. William Henry
13 name : Andersson, Mr. Anders Johan
14 name : Vestrom, Miss. Hulda Amanda Adolfina
15 name : Hewlett, Mrs. (Mary D Kingcome) 
16 name : Rice, Master. Eugene
17 name : Williams, Mr. Charles Eugene
18 name : Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)
19 name : Masselmani, Mrs. Fatima
20 name : Fynney, Mr. Joseph J
21 name : Beesley, Mr. Lawrence
22 name : McGowan, Miss. Anna "Annie"
23 name : Sloper, Mr. William Thompson
24 name : Palsson, Miss. To

In [None]:
for row in df.itertuples(): # we can keep index = False, to discard the index
    print(f"{row}")

Pandas(Index=0, PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=nan, Embarked='S')
Pandas(Index=1, PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C')
Pandas(Index=2, PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=nan, Embarked='S')
Pandas(Index=3, PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S')
Pandas(Index=4, PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=nan, Embarked='S')
Pandas(Index=5, PassengerId=6, 

In [159]:
for row in df.itertuples():
    print(f"name : {row.Name}  || he surived ? : {row.Survived}")

name : Braund, Mr. Owen Harris  || he surived ? : 0
name : Cumings, Mrs. John Bradley (Florence Briggs Thayer)  || he surived ? : 1
name : Heikkinen, Miss. Laina  || he surived ? : 1
name : Futrelle, Mrs. Jacques Heath (Lily May Peel)  || he surived ? : 1
name : Allen, Mr. William Henry  || he surived ? : 0
name : Moran, Mr. James  || he surived ? : 0
name : McCarthy, Mr. Timothy J  || he surived ? : 0
name : Palsson, Master. Gosta Leonard  || he surived ? : 0
name : Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  || he surived ? : 1
name : Nasser, Mrs. Nicholas (Adele Achem)  || he surived ? : 1
name : Sandstrom, Miss. Marguerite Rut  || he surived ? : 1
name : Bonnell, Miss. Elizabeth  || he surived ? : 1
name : Saundercock, Mr. William Henry  || he surived ? : 0
name : Andersson, Mr. Anders Johan  || he surived ? : 0
name : Vestrom, Miss. Hulda Amanda Adolfina  || he surived ? : 0
name : Hewlett, Mrs. (Mary D Kingcome)   || he surived ? : 1
name : Rice, Master. Eugene  || he suri

In [75]:
# preview of the dataframe
df.head(7) # by default if we dont mention - its 5

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S


In [76]:
df.tail(7)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [77]:
print(df.tail())

     PassengerId  Survived  Pclass                                      Name  \
886          887         0       2                     Montvila, Rev. Juozas   
887          888         1       1              Graham, Miss. Margaret Edith   
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   
889          890         1       1                     Behr, Mr. Karl Howell   
890          891         0       3                       Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  
886    male  27.0      0      0      211536  13.00   NaN        S  
887  female  19.0      0      0      112053  30.00   B42        S  
888  female   NaN      1      2  W./C. 6607  23.45   NaN        S  
889    male  26.0      0      0      111369  30.00  C148        C  
890    male  32.0      0      0      370376   7.75   NaN        Q  


In [80]:
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [81]:
df[['Name','Age']]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0
...,...,...
886,"Montvila, Rev. Juozas",27.0
887,"Graham, Miss. Margaret Edith",19.0
888,"Johnston, Miss. Catherine Helen ""Carrie""",
889,"Behr, Mr. Karl Howell",26.0


In [82]:
df.loc[0] # first row

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [None]:
df.loc[0, 'Name'] # name of the passenger at index 0
# df['Name'][0] # can also be done like this

'Braund, Mr. Owen Harris'

In [88]:
df.loc[0:4] # shows rows data from 0 to 4(inclusive)
# df.loc[0:4,"Name"] # shows names from 0 to 4(inclusive)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [90]:
df.iloc[0] # 1st row

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [92]:
df.iloc[0:4] # exclusive of the end index

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [93]:
df.iloc[0, 3] # position index

'Braund, Mr. Owen Harris'

# Conditional selection

In [97]:
# df[condition]
older_passengers = df[df['Age'] > 70] # get all the rows where person age is greater than 70
older_passengers.head() 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.775,,S


In [102]:
# get all the females who surivived
df[(df['Sex'] == 'female') & df['Survived'] == 1]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
874,875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28.0,1,0,P/PP 3381,24.0000,,C
875,876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15.0,0,0,2667,7.2250,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S


In [103]:
# get all the females who surivived
female_surviors_subset = df[(df['Sex'] == 'female') & df['Survived'] == 1][['Name', 'Age']]
female_surviors_subset.head()

Unnamed: 0,Name,Age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


In [104]:
# total number of survivors

survived = len(df[df['Survived'] == 1])
notsurvived = len(df[df['Survived'] == 0])
print(f"people who have not survived {notsurvived}")
print(f"people who have survived {survived}")
print(notsurvived + survived)

people who have not survived 549
people who have survived 342
891


In [106]:
female_survivors_subset_loc = df.loc[(df['Sex'] == 'female') & (df['Survived'] == 1), ['Name', 'Age']]
female_survivors_subset_loc.head()

Unnamed: 0,Name,Age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


In [110]:
female_survivors_subset_loc = df[(df['Sex'] == 'female') & (df['Survived'] == 1)][['Name', 'Age']]
female_survivors_subset_loc.head()

Unnamed: 0,Name,Age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


# Slicing Rows and Columns

In [113]:
df.loc[0:4,  ['Name', 'Age']]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0
2,"Heikkinen, Miss. Laina",26.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0
4,"Allen, Mr. William Henry",35.0


In [None]:
# I want first 3 rows and first 2 columns

df.iloc[0:3, 0:2]

Unnamed: 0,PassengerId,Survived
0,1,0
1,2,1
2,3,1


In [117]:
# First 3 rows and selected columns by index
# --> column 1,3

df.iloc[0:3, [1, 3]]

Unnamed: 0,Survived,Name
0,0,"Braund, Mr. Owen Harris"
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,1,"Heikkinen, Miss. Laina"


In [118]:
df.loc[0:2, ['Name', 'Survived']]

Unnamed: 0,Name,Survived
0,"Braund, Mr. Owen Harris",0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1
2,"Heikkinen, Miss. Laina",1


In [119]:
subset = df[df['Sex'] == 'male'] # might create a view  # filtered rows where 'Sex' is male
subset['Age'] = subset['Age'] + 1  # trying to increment Age by 1

print(subset.head(2))
print(df.head(2))

# Warning You Might See: SettingWithCopyWarning:
# Pandas doesn't know for sure whether you're modifying: the original df, or copy subset
# may return either: A view (shared memory),or a copy (separate memory)

# Correct Way (safe and recommended): Use .copy() to explicitly make a copy

   PassengerId  Survived  Pclass                      Name   Sex   Age  SibSp  \
0            1         0       3   Braund, Mr. Owen Harris  male  23.0      1   
4            5         0       3  Allen, Mr. William Henry  male  36.0      0   

   Parch     Ticket  Fare Cabin Embarked  
0      0  A/5 21171  7.25   NaN        S  
4      0     373450  8.05   NaN        S  
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   

   Parch     Ticket     Fare Cabin Embarked  
0      0  A/5 21171   7.2500   NaN        S  
1      0   PC 17599  71.2833   C85        C  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['Age'] = subset['Age'] + 1  # trying to increment Age by 1


In [120]:
subset = df[df['Sex'] == 'male'].copy()
subset['Age'] += 1
print(subset)

     PassengerId  Survived  Pclass                            Name   Sex  \
0              1         0       3         Braund, Mr. Owen Harris  male   
4              5         0       3        Allen, Mr. William Henry  male   
5              6         0       3                Moran, Mr. James  male   
6              7         0       1         McCarthy, Mr. Timothy J  male   
7              8         0       3  Palsson, Master. Gosta Leonard  male   
..           ...       ...     ...                             ...   ...   
883          884         0       2   Banfield, Mr. Frederick James  male   
884          885         0       3          Sutehall, Mr. Henry Jr  male   
886          887         0       2           Montvila, Rev. Juozas  male   
889          890         1       1           Behr, Mr. Karl Howell  male   
890          891         0       3             Dooley, Mr. Patrick  male   

      Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    23.0      1   

In [121]:
df_by_name = df.set_index('Name') # moves the 'Name' column into the index of the DataFrame.
df_by_name.head()

# By default, set_index() returns a new DataFrame (doesn't change df itself).

# If you want to modify in-place, use: df.set_index('Name', inplace=True)



Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S


In [122]:
print(df_by_name.loc['Allen, Mr. William Henry'])

PassengerId         5
Survived            0
Pclass              3
Sex              male
Age              35.0
SibSp               0
Parch               0
Ticket         373450
Fare             8.05
Cabin             NaN
Embarked            S
Name: Allen, Mr. William Henry, dtype: object


In [123]:
# moves the index back into a regular column, and 
# restores the default integer index (0, 1, 2, ...).
# The DataFrame now has a default RangeIndex again
df_by_name = df_by_name.reset_index()
df_by_name.head()

Unnamed: 0,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S


In [124]:
df_by_name.reset_index(inplace=True)
df_by_name.head()

Unnamed: 0,index,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,4,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S


In [125]:
df_by_name = df.set_index('Name')
df_by_name = df_by_name.reset_index()
df_by_name.head()

Unnamed: 0,Name,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.05,,S


# Reindex

In [126]:
sales_jan = pd.DataFrame({
    'Sales': [250, 400, 300]
}, index=['Store_A', 'Store_B', 'Store_C'])

sales_jan

Unnamed: 0,Sales
Store_A,250
Store_B,400
Store_C,300


In [127]:
sales_feb = pd.DataFrame({
    'Sales': [260, 310]
}, index=['Store_A', 'Store_C'])

sales_feb

Unnamed: 0,Sales
Store_A,260
Store_C,310


In [128]:
common_index = sales_jan.index
sales_feb_aligned = sales_feb.reindex(common_index)
sales_feb_aligned

Unnamed: 0,Sales
Store_A,260.0
Store_B,
Store_C,310.0


In [129]:
sales_diff = sales_feb_aligned['Sales'] - sales_jan['Sales']
sales_diff

Store_A    10.0
Store_B     NaN
Store_C    10.0
Name: Sales, dtype: float64

In [130]:
# fill_value with reindex
sales_feb_aligned = sales_feb.reindex(common_index, fill_value=0)
sales_feb_aligned

Unnamed: 0,Sales
Store_A,260
Store_B,0
Store_C,310


In [131]:
sales_diff = sales_feb_aligned['Sales'] - sales_jan['Sales']
sales_diff

Store_A     10
Store_B   -400
Store_C     10
Name: Sales, dtype: int64

In [132]:
sales_feb_new = pd.DataFrame({
    'Sales': [260, 310, 100]
}, index=['Store_A', 'Store_C', 'Store_D'])

sales_feb_new

Unnamed: 0,Sales
Store_A,260
Store_C,310
Store_D,100


In [133]:
common_index = list(set(list(sales_jan.index) + list(sales_feb_new.index)))
common_index

['Store_D', 'Store_B', 'Store_A', 'Store_C']

In [134]:
sales_jan_aligned = sales_jan.reindex(common_index, fill_value=0)
sales_jan_aligned

Unnamed: 0,Sales
Store_D,0
Store_B,400
Store_A,250
Store_C,300


In [135]:
sales_feb_new_aligned = sales_feb_new.reindex(common_index, fill_value=0)
sales_feb_new_aligned

Unnamed: 0,Sales
Store_D,100
Store_B,0
Store_A,260
Store_C,310


In [136]:
to_check = 'Store_C'
print(sales_feb_new_aligned.loc[to_check])
print(sales_jan_aligned.loc[to_check])

Sales    310
Name: Store_C, dtype: int64
Sales    300
Name: Store_C, dtype: int64


# For column

In [137]:
source_a = pd.DataFrame({
    'Product': ['Laptop', 'Tablet', 'Phone'],
    'Price': [1000, 500, 800],
    'Stock': [30, 50, 100]
})
source_a

Unnamed: 0,Product,Price,Stock
0,Laptop,1000,30
1,Tablet,500,50
2,Phone,800,100


In [138]:
source_b = pd.DataFrame({
    'Price': [980, 520, 810],
    'Product': ['Laptop', 'Tablet', 'Phone']
})
source_b

Unnamed: 0,Price,Product
0,980,Laptop
1,520,Tablet
2,810,Phone


In [143]:
aligned_b = source_b.reindex(columns=source_a.columns, fill_value = 0)
aligned_b

Unnamed: 0,Product,Price,Stock
0,Laptop,980,0
1,Tablet,520,0
2,Phone,810,0


# iteration in pandas

In [164]:
df = pd.DataFrame({
    'Name' : ['heartine', 'gagan', 'gavin'],
    'Age' : [25, 20, 23],
    'City' : ['Delhi', 'Mumbai', 'Chennai'],
})
df

Unnamed: 0,Name,Age,City
0,heartine,25,Delhi
1,gagan,20,Mumbai
2,gavin,23,Chennai


In [165]:
# df.iterrows()
# slow for large DataFrames
# For performance-critical loops, use itertuples() (faster) or vectorized operations.


for index, row in df.iterrows():
    print(f"{index}: {row['Name']} lives in {row['City']} and is {row['Age']} years old.")

0: heartine lives in Delhi and is 25 years old.
1: gagan lives in Mumbai and is 20 years old.
2: gavin lives in Chennai and is 23 years old.


In [166]:
# df.itertuples()
for row in df.itertuples():
    print(f"{row.Name} lives in {row.City}, and is {row.Age} years old.", end='\n\n')

heartine lives in Delhi, and is 25 years old.

gagan lives in Mumbai, and is 20 years old.

gavin lives in Chennai, and is 23 years old.



In [167]:
for col in df.columns:
    print(f"Column: {col}")
    print(df[col].values, end='\n\n')

Column: Name
['heartine' 'gagan' 'gavin']

Column: Age
[25 20 23]

Column: City
['Delhi' 'Mumbai' 'Chennai']



# Condition based column creation using custom logic

In [168]:
df.head()

Unnamed: 0,Name,Age,City
0,heartine,25,Delhi
1,gagan,20,Mumbai
2,gavin,23,Chennai


In [171]:
# if age >=30, senior else young

val = []
for row in df.itertuples():
    if row.Age >= 30:
        val.append("Senior")
    else:
        val.append("Junior")


df["Seniority"] = val
df

Unnamed: 0,Name,Age,City,Seniority
0,heartine,25,Delhi,Junior
1,gagan,20,Mumbai,Junior
2,gavin,23,Chennai,Junior


# vectorized using apply

In [172]:
df = pd.DataFrame({
    'Name' : ['heartine', 'gagan', 'gavin'],
    'Age' : [25, 20, 23],
    'City' : ['Delhi', 'Mumbai', 'Chennai'],
})
df

Unnamed: 0,Name,Age,City
0,heartine,25,Delhi
1,gagan,20,Mumbai
2,gavin,23,Chennai


In [174]:
df['Categories'] = df['Age'].apply(lambda age : 'young' if age < 30 else 'senior')
df
# apply() — Fastest (and most Pythonic)
# not as fast as true NumPy vectorization (but very close)
# iterrows < itertuples < apply

Unnamed: 0,Name,Age,City,Categories
0,heartine,25,Delhi,young
1,gagan,20,Mumbai,young
2,gavin,23,Chennai,young


# Sort

In [175]:
# 1. sort_values --> data values
# 2. sort_index --> rows / column

In [176]:
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

df = pd.read_csv(titanic_dataset_github_url)

In [177]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [178]:
sorted_df = df.sort_values(by='Age')
sorted_df[['Name', 'Age']].head(100)

Unnamed: 0,Name,Age
803,"Thomas, Master. Assad Alexander",0.42
755,"Hamalainen, Master. Viljo",0.67
644,"Baclini, Miss. Eugenie",0.75
469,"Baclini, Miss. Helene Barbara",0.75
78,"Caldwell, Master. Alden Gates",0.83
...,...,...
220,"Sunderland, Mr. Victor Francis",16.00
791,"Gaskell, Mr. Alfred",16.00
86,"Ford, Mr. William Neal",16.00
71,"Goodwin, Miss. Lillian Amy",16.00


In [179]:
sorted_df.loc[0:100, ['Name', 'Age']]

Unnamed: 0,Name,Age
0,"Braund, Mr. Owen Harris",22.0
289,"Connolly, Miss. Kate",22.0
323,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harb...",22.0
243,"Maenpaa, Mr. Matti Alexanteri",22.0
521,"Vovk, Mr. Janko",22.0
...,...,...
874,"Abelson, Mrs. Samuel (Hannah Wizosky)",28.0
399,"Trout, Mrs. William H (Jessie L)",28.0
392,"Gustafsson, Mr. Johan Birger",28.0
635,"Davis, Miss. Mary",28.0


In [180]:
print(df['Age'].dtype)

float64


In [None]:
# df['Age'] = df['Age'].astype('int64')

# convert a column to an integer type (int64) but there are NaN values in it, and integers can't represent NaN.
# Would give an error - IntCastingNaNError

In [None]:
# astype changes the data type of the coulumn.
df['Age'] = df['Age'].fillna(0).astype('int64')
print(df['Age'].dtype)

int64


In [186]:
# sort by multiple columns
df_sorted = df.sort_values(by=['Pclass', 'Fare']) # ascending order
# Sorts the DataFrame first by Pclass, then (within each Pclass) by Fare
df_sorted.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
263,264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0.0,B94,S
633,634,0,1,"Parr, Mr. William Henry Marsh",male,0,0,0,112052,0.0,,S
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0.0,A36,S
815,816,0,1,"Fry, Mr. Richard",male,0,0,0,112058,0.0,B102,S
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0.0,,S


In [187]:
df_sorted = df.sort_values(by=['Pclass', 'Fare'], ascending=[True, False])
# Pclass will be sorted in asc and fare in decending
df_sorted.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
258,259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C
679,680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C
737,738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263.0,C23 C25 C27,S
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263.0,C23 C25 C27,S


In [188]:
# df_sorted.to_csv("a.csv")

# sort_index

In [194]:
df_sort_2 = df.set_index('PassengerId')

In [195]:
df_sorted_by_index = df_sort_2.sort_index(axis=0)
# axis=0 means you’re sorting rows, based on the row index labels.
df_sorted_by_index.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [196]:
df_sorted_by_index = df_sort_2.sort_index(axis=0, ascending=False)
df_sorted_by_index.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q
890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30.0,C148,C
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,0,1,2,W./C. 6607,23.45,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30.0,B42,S
887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13.0,,S


In [None]:
# df_sort_2.sort_index(axis=0, ascending=False,inplace=True)
# axis=0 → Sort rows (default)
# ascending=False → Sort in descending order
# inplace=True → Make the changes directly in df_sort_2 without needing to assign to a new variable

# sorting columns alphabets

In [197]:
df_sorted_cols = df.sort_index(axis=1)
# sort_index(axis=1) → Sorts the columns 
# alphabetically by their names (because axis=1 means columns).
df_sorted_cols.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


# Changing Column order in Pandas

In [198]:
titanic_dataset_github_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

df = pd.read_csv(titanic_dataset_github_url)

In [202]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [204]:
new_order = ['Name', 'Sex', 'Age', 'Pclass', 'Fare', 'Survived']
df_reordered = df[new_order]
print(df_reordered.head())

                                                Name     Sex   Age  Pclass  \
0                            Braund, Mr. Owen Harris    male  22.0       3   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0       1   
2                             Heikkinen, Miss. Laina  female  26.0       3   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0       1   
4                           Allen, Mr. William Henry    male  35.0       3   

      Fare  Survived  
0   7.2500         0  
1  71.2833         1  
2   7.9250         1  
3  53.1000         1  
4   8.0500         0  


In [205]:
sorted(df.columns)

['Age',
 'Cabin',
 'Embarked',
 'Fare',
 'Name',
 'Parch',
 'PassengerId',
 'Pclass',
 'Sex',
 'SibSp',
 'Survived',
 'Ticket']

In [206]:
df_reordered = df[sorted(df.columns)]
df_reordered.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


# working with text data in pandas

In [207]:
print(df[['Name', 'Sex', 'Cabin']].head(3))

                                                Name     Sex Cabin
0                            Braund, Mr. Owen Harris    male   NaN
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   C85
2                             Heikkinen, Miss. Laina  female   NaN


In [209]:
df['Name'].str.lower()
# df['Name'].str.upper()
# df['Name'].str.capitalize()

0                                braund, mr. owen harris
1      cumings, mrs. john bradley (florence briggs th...
2                                 heikkinen, miss. laina
3           futrelle, mrs. jacques heath (lily may peel)
4                               allen, mr. william henry
                             ...                        
886                                montvila, rev. juozas
887                         graham, miss. margaret edith
888             johnston, miss. catherine helen "carrie"
889                                behr, mr. karl howell
890                                  dooley, mr. patrick
Name: Name, Length: 891, dtype: object

In [210]:
df['Name'].str.len()

0      23
1      51
2      22
3      44
4      24
       ..
886    21
887    28
888    40
889    21
890    19
Name: Name, Length: 891, dtype: int64

In [211]:
df['Name'].str.replace("Mr.", "Sir", regex=False).head()
# regular expressions (regex) regex=False tells Pandas to treat "Mr." as a literal string, not a regular expression
# If you forget regex=False, Pandas might misinterpret "." as "any character"
# it could match "MrX", "Mra", "Mr " 
# because "." is a regex wildcard.

0                              Braund, Sir Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Sir William Henry
Name: Name, dtype: object

In [212]:
df['Name'].str.replace(r"\(.*\)", "", regex=True).head()
# r"\(.*\)"
# Part	        Meaning
# r"..."	Raw string (so \ is not interpreted by Python)
# \(	Matches a literal opening parenthesis (
# .*	Matches any characters (except newlines), zero or more times
# \)	Matches a literal closing parenthesis )
# So \(.*\) matches: " (Mr.)" / " (CEO)" / " (Head of HR)"
# r"\s*\(.*\)"	Also removes spaces before ()

0          Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley 
2           Heikkinen, Miss. Laina
3    Futrelle, Mrs. Jacques Heath 
4         Allen, Mr. William Henry
Name: Name, dtype: object

In [213]:
df['Title'] = df['Name'].str.extract(r'(\(.*\))', expand=False)
# expand=False returns a Series instead of a DataFrame (simpler for one column).
df['Title']

0                           NaN
1      (Florence Briggs Thayer)
2                           NaN
3               (Lily May Peel)
4                           NaN
                 ...           
886                         NaN
887                         NaN
888                         NaN
889                         NaN
890                         NaN
Name: Title, Length: 891, dtype: object

# Split and Access Elements

In [215]:
df['Name'].str.split(",", expand=True)[0]

0         Braund
1        Cumings
2      Heikkinen
3       Futrelle
4          Allen
         ...    
886     Montvila
887       Graham
888     Johnston
889         Behr
890       Dooley
Name: 0, Length: 891, dtype: object

In [217]:
df['Name'].str.strip() 
# df['Name'].str.lstrip()
# df['Name'].str.rstrip()

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

# Find position of substring

In [219]:
df['Name'].str.find("Mr")

0       8
1       9
2      -1
3      10
4       7
       ..
886    -1
887    -1
888    -1
889     6
890     8
Name: Name, Length: 891, dtype: int64

In [222]:
# Count passengers with “Miss” in name
count = df['Name'].str.contains("Miss").sum()
print("Number of 'Miss':", count)

Number of 'Miss': 182


In [225]:
df = pd.read_csv(titanic_dataset_github_url)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [227]:
data = {
    'Name' : [
        'Monal K', 'Bhuvika', 'Niranjan', 'riyan'
    ],
    'Email' : [
        'm@gmail.com', 'b@gmail.com', 'n@gmail.com', 'r@gmail.com'
    ]
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Email
0,Monal K,m@gmail.com
1,Bhuvika,b@gmail.com
2,Niranjan,n@gmail.com
3,riyan,r@gmail.com


In [229]:
df['Name'].str.replace("K", "Kumar", regex=False)

0    Monal Kumar
1        Bhuvika
2       Niranjan
3          riyan
Name: Name, dtype: object

In [230]:
df['Name'].str.pad(width=20, side='left', fillchar='-')
# df['Name'].str.pad(width=20, side='right', fillchar='-')

0    -------------Monal K
1    -------------Bhuvika
2    ------------Niranjan
3    ---------------riyan
Name: Name, dtype: object

# Options and customizations

In [232]:
df = pd.read_csv(titanic_dataset_github_url)

In [239]:
pd.set_option('display.max_rows', 3)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [240]:
pd.set_option('display.max_columns', None)
# display all columns when printing a DataFrame — instead of truncating them with ... when there are too many.
# pd.reset_option('display.max_columns') # to reset
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [None]:
pd.set_option('display.max_rows', 30)
df[['Fare', 'Age']].head() # because of head only 5 rows displayed

Unnamed: 0,Fare,Age
0,7.25,22.0
1,71.2833,38.0
2,7.925,26.0
3,53.1,35.0
4,8.05,35.0


In [242]:
# Check Current Value of an Option
pd.get_option('display.max_rows')

30

In [243]:
# Temporarily Set Option (Context Manager)
with pd.option_context('display.max_rows', 3):
    print(df)

     PassengerId  Survived  Pclass                     Name   Sex   Age  \
0              1         0       3  Braund, Mr. Owen Harris  male  22.0   
..           ...       ...     ...                      ...   ...   ...   
890          891         0       3      Dooley, Mr. Patrick  male  32.0   

     SibSp  Parch     Ticket  Fare Cabin Embarked  
0        1      0  A/5 21171  7.25   NaN        S  
..     ...    ...        ...   ...   ...      ...  
890      0      0     370376  7.75   NaN        Q  

[891 rows x 12 columns]


In [244]:
pd.reset_option('display.max_rows') # Resetting to Default
pd.get_option('display.max_rows')

60

In [253]:
# You can also reset all options:
pd.reset_option('all') # it will throw an error / Warning

Matplotlib is building the font cache; this may take a moment.


In [254]:
import warnings


with warnings.catch_warnings():
    warnings.simplefilter('ignore', FutureWarning)
    pd.reset_option('all')

In [252]:
import matplotlib

# Categorical Data

In [256]:
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [257]:
print(df['Sex'].dtype)

object


In [258]:
# Convert a column to categorical
# pandas Categorical type, - new column called 'Sex_C'
# which is more memory- and performance-efficient.
df['Sex_C'] = df['Sex'].astype('category')
df[['Sex', 'Sex_C']].head()

Unnamed: 0,Sex,Sex_C
0,male,male
1,female,female
2,female,female
3,female,female
4,male,male


In [259]:
print(df['Sex'].dtype)
print(df['Sex_C'].dtype)

object
category


In [None]:
df['Sex_C_categorical_codes'] = df['Sex_C'].cat.codes
df[['Sex', 'Sex_C', , 'Sex_C_categorical_codes']].head()

Unnamed: 0,Sex,Sex_C,Sex_C_categorical_codes
0,male,male,1
1,female,female,0
2,female,female,0
3,female,female,0
4,male,male,1


In [262]:
df['Sex_apply_category'] = df['Sex'].apply(lambda x : 0 if x == 'male' else 1)
# creates a new column Sex_apply_category in a pandas DataFrame df 
# by applying a lambda function to the Sex column


In [263]:
df[['Sex', 'Sex_C', 'Sex_C_categorical_codes', 'Sex_apply_category']].head()

Unnamed: 0,Sex,Sex_C,Sex_C_categorical_codes,Sex_apply_category
0,male,male,1,0
1,female,female,0,1
2,female,female,0,1
3,female,female,0,1
4,male,male,1,0


In [264]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_C,Sex_C_categorical_codes,Sex_apply_category
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,male,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,female,0,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,female,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,female,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,male,1,0


In [265]:
df['Pclass'].unique()

array([3, 1, 2])

In [266]:
df['Pclass_new_weird'] = df['Pclass'].apply(lambda x : 'adf' if x == 3 else ('wer' if x == 2 else 'tyu'))
df[['Pclass', 'Pclass_new_weird']].head(10)

Unnamed: 0,Pclass,Pclass_new_weird
0,3,adf
1,1,tyu
2,3,adf
3,1,tyu
4,3,adf
5,3,adf
6,1,tyu
7,3,adf
8,3,adf
9,2,wer


In [267]:
print(df['Pclass'].dtype)
print(df['Pclass_new_weird'].dtype)

int64
object


In [268]:
df['Pclass_new_weird_ordered'] = pd.Categorical(
    df['Pclass_new_weird'],
    categories=['wer','adf', 'tyu'], # Custom order (ascending)
    ordered=True
)

print(df['Pclass_new_weird_ordered'].dtype)

# pandas.Categorical to convert a column into an ordered categorical type with a custom order of categories
# Converts df['Pclass_new_weird'] into a categorical variable with a specific order: 
# 'wer' < 'adf' < 'tyu'.
# ordered=True flag makes comparison operators like <, > meaningful
# Any value not in that list becomes NaN.


category


In [270]:
df['Pclass_new_weird_ordered'].cat.categories

Index(['wer', 'adf', 'tyu'], dtype='object')

In [269]:
df['Pclass_new_weird_ordered'].isna().sum()

np.int64(0)

# Date Functionality in pandas

In [271]:
df = pd.DataFrame({
    'event': ['Concert', 'Conference', 'Wedding', 'Reception', 'last_day'],
    'date': ['2025-01-01', '2025-03-15', '2025-07-20', '2025-07-21', '2025-07-31']
})
df

Unnamed: 0,event,date
0,Concert,2025-01-01
1,Conference,2025-03-15
2,Wedding,2025-07-20
3,Reception,2025-07-21
4,last_day,2025-07-31


In [273]:
df['date_pandas'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,event,date,date_pandas
0,Concert,2025-01-01,2025-01-01
1,Conference,2025-03-15,2025-03-15
2,Wedding,2025-07-20,2025-07-20
3,Reception,2025-07-21,2025-07-21
4,last_day,2025-07-31,2025-07-31


In [274]:
print(df.dtypes[['event', 'date', 'date_pandas']])

event                  object
date                   object
date_pandas    datetime64[ns]
dtype: object


In [None]:
df = df.drop('date', axis=1) # will drop the series named "date"
df

Unnamed: 0,event,date_pandas
0,Concert,2025-01-01
1,Conference,2025-03-15
2,Wedding,2025-07-20
3,Reception,2025-07-21
4,last_day,2025-07-31


In [278]:
# df['date_pandas'].dt.year
df['date_pandas'].dt.month

0    1
1    3
2    7
3    7
4    7
Name: date_pandas, dtype: int32

In [279]:
df['date_pandas'].dt.day

0     1
1    15
2    20
3    21
4    31
Name: date_pandas, dtype: int32

In [280]:
df['date_pandas'].dt.weekday

0    2
1    5
2    6
3    0
4    3
Name: date_pandas, dtype: int32

In [281]:
df['date_pandas'].dt.day_name()

0    Wednesday
1     Saturday
2       Sunday
3       Monday
4     Thursday
Name: date_pandas, dtype: object

In [282]:
df['date_pandas'].dt.is_month_end

0    False
1    False
2    False
3    False
4     True
Name: date_pandas, dtype: bool

In [283]:
df[df['date_pandas'] > '2025-04-01']

Unnamed: 0,event,date_pandas
2,Wedding,2025-07-20
3,Reception,2025-07-21
4,last_day,2025-07-31


In [284]:
df[df['date_pandas'].between('2025-01-01', '2025-06-01')]

Unnamed: 0,event,date_pandas
0,Concert,2025-01-01
1,Conference,2025-03-15


# Date range

In [285]:
pd.date_range(start='2024-01-01', end='2024-01-10', freq='D')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10'],
              dtype='datetime64[ns]', freq='D')

In [287]:
pd.date_range(start='2024-01-01', end='2024-03-10', freq='ME') # Month end

DatetimeIndex(['2024-01-31', '2024-02-29'], dtype='datetime64[ns]', freq='ME')

In [288]:
pd.date_range(start='2024-01-01', end='2024-03-10', freq='MS')

DatetimeIndex(['2024-01-01', '2024-02-01', '2024-03-01'], dtype='datetime64[ns]', freq='MS')

# Date Arithmetic

In [290]:
df['next_week'] = df['date_pandas'] + pd.Timedelta(days=7)
df[['date_pandas', 'next_week']]

Unnamed: 0,date_pandas,next_week
0,2025-01-01,2025-01-08
1,2025-03-15,2025-03-22
2,2025-07-20,2025-07-27
3,2025-07-21,2025-07-28
4,2025-07-31,2025-08-07


In [291]:
df['days_to_event'] = df['date_pandas'] - pd.Timestamp.today()
df.head()

Unnamed: 0,event,date_pandas,next_week,days_to_event
0,Concert,2025-01-01,2025-01-08,-190 days +07:50:10.836513
1,Conference,2025-03-15,2025-03-22,-117 days +07:50:10.836513
2,Wedding,2025-07-20,2025-07-27,10 days 07:50:10.836513
3,Reception,2025-07-21,2025-07-28,11 days 07:50:10.836513
4,last_day,2025-07-31,2025-08-07,21 days 07:50:10.836513


In [292]:
# Assignment - 1

# TODO - While creating a task, ask user about the deadline

# Start date - Pick automatically from today's date
# Deadline - 7, 10, 10

# # to-DO

# 1. Task-1 | Days left  | Green-color
# 2. Task-2 | Days left  | Red-color
# 3. Task-3 | Days left  | Green-color
# 4. Task-4 | Days left  | Red-color