# Pandas
Pandas is a tool (or library) in Python that helps you work with data easily.
It's fast, powerful, and makes working with data way easier — especially for data science, machine learning, and reports.

Imagine you have a big table of data — like an Excel sheet with rows and columns. Pandas helps you:

- Read data (from files like CSV or Excel)

- Organize it

- Clean it (remove or fix bad data)

- Analyze it (find patterns or answers)

- Visualize it (make simple charts)

## Accessing file

In [2]:
# Importing pandas library
import pandas as pd


In [4]:
# Reading our csv file and printing it out
df = pd.read_csv('tested.csv')

# printing it to get all the rows and columns
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [7]:
# If we want to see all the rows of our data
pd.set_option('display.max.rows', 419)

# Displaying after changing settings
df


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,0,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,1,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,0,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,1,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,0,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [None]:
# Printing the first 5 rows using head
# head shows first 5 rows by default, however can show as many rows as you want by providing that value to head
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
# Displays the last 7 rows
df.tail(7)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
411,1303,1,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37.0,1,0,19928,90.0,C78,Q
412,1304,1,3,"Henriksson, Miss. Jenny Lovisa",female,28.0,0,0,347086,7.775,,S
413,1305,0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
414,1306,1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
415,1307,0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
416,1308,0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
417,1309,0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


## Exploring data

In [None]:
# Shape tells about the column and rows of data
df.shape

# So it has 418 rows and 12 columns

(418, 12)

In [None]:
# info() Tells the datatypes and non-null values in each column, total cols & rows and index from start to end
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [None]:
# Shows the stats of each column values like count, max, min etc
df.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [10]:
# Can get all the rows for  any single column however can't get value of specific index like this
df['Survived']


Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
5,0
6,1
7,0
8,1
9,0


In [16]:
# We will now get value of a specific index using loc
# Use loc when you're working with names (like column names or row labels).
df.loc[410]


Unnamed: 0,410
PassengerId,1302
Survived,1
Pclass,3
Name,"Naughton, Miss. Hannah"
Sex,female
Age,
SibSp,0
Parch,0
Ticket,365237
Fare,7.75


In [13]:
# Use iloc when you're working with positions (like row 0, column 1).
df.iloc[410]


Unnamed: 0,410
PassengerId,1302
Survived,1
Pclass,3
Name,"Naughton, Miss. Hannah"
Sex,female
Age,
SibSp,0
Parch,0
Ticket,365237
Fare,7.75


## Handling Missing Values

In [None]:
# Checking missing values in each column, shows 86 missing values for age
# Taking out the missing values using isnull() and then adding them to see total missing values using sum()
df.isnull().sum()


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1


In [None]:
# Filling the missing values using mean()
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Cabin'].fillna(df['Age'].mean(), inplace=True)

# Checking the null values again
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        418 non-null    object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna(df['Age'].mean(), inplace=True)


In [None]:
# If there are alot of missing values, you can also drop them out
# dropna() deletes the missing values
df.dropna()


## Dropping Columns

In [None]:
# We can drop vcolumn that are not needed as it might make it complicated to keep them all
df.drop(columns = ['SibSp', 'Parch', 'Cabin', 'Embarked', 'Ticket', 'PassengerId' ], inplace = True)


In [None]:
df.info()
# Columns have been removed

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Name      418 non-null    object 
 3   Sex       418 non-null    object 
 4   Age       418 non-null    float64
 5   Fare      417 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 19.7+ KB


In [None]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare
0,0,3,"Kelly, Mr. James",male,34.5,7.8292
1,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,7.0
2,0,2,"Myles, Mr. Thomas Francis",male,62.0,9.6875
3,0,3,"Wirz, Mr. Albert",male,27.0,8.6625
4,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,12.2875


In [None]:
# iloc means "index location". It is used to pick rows and columns by their position (number) — not by name.
x = df.iloc[:, 1:]
x


Unnamed: 0,Pclass,Name,Sex,Age,Fare
0,3,"Kelly, Mr. James",male,34.50000,7.8292
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.00000,7.0000
2,2,"Myles, Mr. Thomas Francis",male,62.00000,9.6875
3,3,"Wirz, Mr. Albert",male,27.00000,8.6625
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.00000,12.2875
...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,30.27259,8.0500
414,1,"Oliva y Ocana, Dona. Fermina",female,39.00000,108.9000
415,3,"Saether, Mr. Simon Sivertsen",male,38.50000,7.2500
416,3,"Ware, Mr. Frederick",male,30.27259,8.0500


In [None]:
# Showimg survived column as y
y = df['Survived']
y


Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0
