## Pandas
> handling and manipulating structured (tabular) data

In [174]:
import pandas as pd # import pandas

# it is built on top of NumPy

### data structures

#### Series
A Series is like a 1-d NumPy array, but every element also carries a label (its index).

In [175]:
# build a Series from a plain list; no index is given, so pandas labels the values 0..3
a = pd.Series(data=[10, 20, 30, 40])
a

0    10
1    20
2    30
3    40
dtype: int64

In [176]:
a.ndim # number of dimensions (1 for a Series)

1

In [177]:
a.shape # tuple with the length along each dimension; (4,) means one axis holding 4 elements

(4,)

In [178]:
a.size # total number of elements (a plain int, unlike the shape tuple)

4

In [179]:
a.dtype # data type of the stored values (int64 here, inferred from the integer list)

dtype('int64')

In [180]:
# custom indexing: supply our own labels instead of the default 0..n-1 index
b = pd.Series(data=[100, 200, 300], index=list('abc'))
b

a    100
b    200
c    300
dtype: int64

In [181]:
# accessing elements
b['a'] # label-based access: returns the value stored under index label 'a'

100

In [182]:
# position-based access: use .iloc explicitly.
# On a Series with string labels, plain b[0] only works through a deprecated
# positional fallback of [] indexing; .iloc always means "by position".
b.iloc[0]

100

#### DataFrame
a DataFrame is a 2-d table (rows and columns)

> from list of lists

In [183]:
# every inner list is one row; the column names are passed separately
data = [['A', 20], ['B', 21], ['C', 22]]
df = pd.DataFrame(data=data, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,A,20
1,B,21
2,C,22


> from dictionary

In [184]:
# every key becomes a column name, every list holds that column's values
data = dict(
    Name=['A', 'B', 'C'],
    Age=[20, 21, 22],
    Marks=[85, 90, 88],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age,Marks
0,A,20,85
1,B,21,90
2,C,22,88


In [185]:
df.ndim # number of dimensions (2: rows and columns)

2

In [186]:
df.shape # (number of rows, number of columns)

(3, 3)

In [187]:
df.size # total elements = rows * columns (3 * 3 here)

9

In [188]:
df.columns # column names, returned as an Index object

Index(['Name', 'Age', 'Marks'], dtype='object')

In [189]:
df.index # row index (the row labels); here the default RangeIndex 0, 1, 2

RangeIndex(start=0, stop=3, step=1)

In [190]:
df.dtypes # data type of each column; the text column 'Name' is reported as object

Name     object
Age       int64
Marks     int64
dtype: object

### basic inspection methods

In [191]:
df.head() # first 5 rows by default (this frame only has 3, so all of them are shown)

Unnamed: 0,Name,Age,Marks
0,A,20,85
1,B,21,90
2,C,22,88


In [192]:
df.tail() # last 5 rows by default (this frame only has 3, so all of them are shown)

Unnamed: 0,Name,Age,Marks
0,A,20,85
1,B,21,90
2,C,22,88


In [193]:
df.info() # prints a summary: index range, columns, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Marks   3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes


In [194]:
df.describe() # summary statistics; by default only the numeric columns (Age, Marks) are included

Unnamed: 0,Age,Marks
count,3.0,3.0
mean,21.0,87.666667
std,1.0,2.516611
min,20.0,85.0
25%,20.5,86.5
50%,21.0,88.0
75%,21.5,89.0
max,22.0,90.0


#### selecting a column

In [195]:
df['Age'] # a single column label returns that column as a Series

0    20
1    21
2    22
Name: Age, dtype: int64

#### selecting multiple columns

In [196]:
df[['Name', 'Marks']] # a list of labels returns a DataFrame with just those columns

Unnamed: 0,Name,Marks
0,A,85
1,B,90
2,C,88


#### selecting rows

> using loc (label-based)

In [197]:
df.loc[0] # row with label 0, returned as a Series keyed by column name

Name      A
Age      20
Marks    85
Name: 0, dtype: object

In [198]:
df.loc[0, 'Name'] # specific element: row label 0, column label 'Name'

'A'

> using iloc (position-based)

In [199]:
df.iloc[1] # second row (positions start at 0)

Name      B
Age      21
Marks    90
Name: 1, dtype: object

In [200]:
df.iloc[1, 2] # row position 1, column position 2 (0-based): second row, third column ('Marks')

90

#### slicing

In [201]:
df[0:2] # first two rows: a slice inside [] selects rows by position (end excluded), not columns

Unnamed: 0,Name,Age,Marks
0,A,20,85
1,B,21,90


In [202]:
df.iloc[0:2, 0:2] # row slice, column slice (both by position, end excluded)

Unnamed: 0,Name,Age
0,A,20
1,B,21


#### adding a column

In [203]:
# assigning to a new column label appends that column; the list supplies one value per row
df['Grade'] = ['A', 'A', 'B']
df

Unnamed: 0,Name,Age,Marks,Grade
0,A,20,85,A
1,B,21,90,A
2,C,22,88,B


#### deleting a column

In [204]:
# drop() returns a new frame without the column, so the result is assigned back to df
df = df.drop(columns = 'Grade')
df

Unnamed: 0,Name,Age,Marks
0,A,20,85
1,B,21,90
2,C,22,88


#### renaming a column

In [205]:
# rename() takes an {old name: new name} mapping and returns a new frame, assigned back to df
df = df.rename(columns = {'Marks' : 'Score'})
df

Unnamed: 0,Name,Age,Score
0,A,20,85
1,B,21,90
2,C,22,88


#### sorting

In [206]:
df.sort_values('Age') # rows ordered by Age (ascending by default); returns a new frame, df is not modified

Unnamed: 0,Name,Age,Score
0,A,20,85
1,B,21,90
2,C,22,88


#### boolean filtering

In [207]:
df[df['Age'] > 20]
# the comparison builds a boolean Series; indexing with it keeps only the rows where it is True

Unnamed: 0,Name,Age,Score
1,B,21,90
2,C,22,88


#### creating missing values

In [208]:
# a missing value is stored as NaN, which is a float, so an int64 column cannot hold it.
# cast the column to float64 explicitly instead of relying on pandas to upcast it
# implicitly during the assignment (implicit upcasting in setitem is deprecated, PDEP-6)
df['Age'] = df['Age'].astype('float64')
df.loc[1, 'Age'] = None # None is stored as NaN in a float column
df

Unnamed: 0,Name,Age,Score
0,A,20.0,85
1,B,,90
2,C,22.0,88


#### checking missing values

In [209]:
df.isna() # True wherever a value is missing (comparable to the ISNA check in MS Excel)

Unnamed: 0,Name,Age,Score
0,False,False,False
1,False,True,False
2,False,False,False


In [210]:
df.isna().sum() # number of missing values per column (each True counts as 1)

Name     0
Age      1
Score    0
dtype: int64

#### dropping missing values

In [211]:
df.dropna() # returns a copy without the rows that contain a missing value; df itself is unchanged

Unnamed: 0,Name,Age,Score
0,A,20.0,85
2,C,22.0,88


#### filling missing values

In [212]:
df.fillna(0) # returns a copy with every missing value replaced by 0; df itself is unchanged

Unnamed: 0,Name,Age,Score
0,A,20.0,85
1,B,0.0,90
2,C,22.0,88


### csv support

#### reading csv file

In [213]:
# example only, left commented out ('file.csv' is presumably a placeholder name — point it at a real file before running):
# df = pd.read_csv('file.csv')

#### saving csv file

In [214]:
df.to_csv('output.csv', index = False)
# index = False --> the row index (0, 1, 2) is not written to the file as an extra column

chapter 1 completed...

to be continued...