# Pandas

Python library for data analysis and manipulation

Two main data structures
- 1D Series (list)
- 2D DataFrame (matrix)

Why ?
- Support for CSV, Excel, SQL, JSON etc
- Powerful data aggregation and grouping
- Easy handling of missing
- Convenient for filtering, joining, and reshaping

In [1]:
import numpy as np
import pandas as pd

In [3]:
friends = {
    'name': ['Nitin', 'Shyam', 'Pavan'],
    'city': ['Bangalore', 'Delhi', 'Vishakha'],
    'marks': [98, 99, 95]
}

In [4]:
friends

{'name': ['Nitin', 'Shyam', 'Pavan'],
 'city': ['Bangalore', 'Delhi', 'Vishakha'],
 'marks': [98, 99, 95]}

In [5]:
# Create dataframe
df = pd.DataFrame(friends)
df

Unnamed: 0,name,city,marks
0,Nitin,Bangalore,98
1,Shyam,Delhi,99
2,Pavan,Vishakha,95


In [7]:
# Saving dataframe to files
df.to_csv('friends.csv')

In [8]:
# Saving dataframe to files without index
df.to_csv('friends.csv', index=False)

In [9]:
# Top of dataframe
df.head()

Unnamed: 0,name,city,marks
0,Nitin,Bangalore,98
1,Shyam,Delhi,99
2,Pavan,Vishakha,95


In [10]:
df.head(2)

Unnamed: 0,name,city,marks
0,Nitin,Bangalore,98
1,Shyam,Delhi,99


In [11]:
# Last of dataframe
df.tail(2)

Unnamed: 0,name,city,marks
1,Shyam,Delhi,99
2,Pavan,Vishakha,95


In [12]:
df.describe()

Unnamed: 0,marks
count,3.0
mean,97.333333
std,2.081666
min,95.0
25%,96.5
50%,98.0
75%,98.5
max,99.0


In [13]:
df.describe(include=object)

Unnamed: 0,name,city
count,3,3
unique,3,3
top,Nitin,Bangalore
freq,1,1


In [20]:
# Read a csv file
friendsRead = pd.read_csv('friends-read.csv')

In [21]:
friendsRead

Unnamed: 0,name,city,marks
0,Nitin,Bangalore,98
1,Shyam,Delhi,99
2,PAVAN,Vishakha,100
3,Anjali,Mumbai,95
4,Ravi,Chennai,88
5,Meena,Hyderabad,92
6,Aman,Pune,85
7,Sneha,Kolkata,93
8,Raj,Ahmedabad,90
9,Divya,Jaipur,97


In [22]:
friendsRead.describe()

Unnamed: 0,marks
count,10.0
mean,93.7
std,4.98999
min,85.0
25%,90.5
50%,94.0
75%,97.75
max,100.0


In [23]:
# Access values - [col][row]
friendsRead['name']

0     Nitin
1     Shyam
2     PAVAN
3    Anjali
4      Ravi
5     Meena
6      Aman
7     Sneha
8       Raj
9     Divya
Name: name, dtype: object

In [24]:
friendsRead['name'][3]

'Anjali'

In [25]:
friendsRead['name'][3] = 'Anjalika'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  friendsRead['name'][3] = 'Anjalika'


In [26]:
friendsRead['name'][3]

'Anjalika'

In [29]:
friendsRead.to_csv('friends-read.csv', index=False)

In [28]:
friendsRead

Unnamed: 0,name,city,marks
0,Nitin,Bangalore,98
1,Shyam,Delhi,99
2,PAVAN,Vishakha,100
3,Anjalika,Mumbai,95
4,Ravi,Chennai,88
5,Meena,Hyderabad,92
6,Aman,Pune,85
7,Sneha,Kolkata,93
8,Raj,Ahmedabad,90
9,Divya,Jaipur,97


In [30]:
friendsRead.index = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',]

In [31]:
friendsRead

Unnamed: 0,name,city,marks
a,Nitin,Bangalore,98
b,Shyam,Delhi,99
c,PAVAN,Vishakha,100
d,Anjalika,Mumbai,95
e,Ravi,Chennai,88
f,Meena,Hyderabad,92
g,Aman,Pune,85
h,Sneha,Kolkata,93
i,Raj,Ahmedabad,90
j,Divya,Jaipur,97


In [33]:
friendsRead.to_csv('friends-read.csv')

### Series Data Structure

In [35]:
sr = pd.Series([1,2,3,4,5])
sr

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [36]:
type(sr)

pandas.core.series.Series

In [37]:
type(friendsRead)

pandas.core.frame.DataFrame

In [41]:
# random series
randomSr = pd.Series( np.random.randint(1,10, size=10) )
randomSr

0    3
1    2
2    9
3    7
4    3
5    2
6    6
7    5
8    8
9    7
dtype: int64

In [42]:
randomDf = pd.DataFrame(np.random.randint(1,101, size=(5,5)))
randomDf

Unnamed: 0,0,1,2,3,4
0,60,3,24,5,10
1,1,74,77,56,62
2,92,65,10,70,31
3,13,79,91,11,85
4,34,65,18,71,25


In [44]:
randomDf.describe()

Unnamed: 0,0,1,2,3,4
count,5.0,5.0,5.0,5.0,5.0
mean,40.0,57.2,44.0,42.6,42.6
std,36.70831,30.890128,37.181985,32.207142,30.336447
min,1.0,3.0,10.0,5.0,10.0
25%,13.0,65.0,18.0,11.0,25.0
50%,34.0,65.0,24.0,56.0,31.0
75%,60.0,74.0,77.0,70.0,62.0
max,92.0,79.0,91.0,71.0,85.0


In [53]:
randomDf1 = pd.DataFrame(np.random.randint(1,101, size=(200,5)))
randomDf1.head()

Unnamed: 0,0,1,2,3,4
0,70,10,50,8,2
1,90,63,82,14,97
2,48,13,35,45,56
3,11,80,56,9,58
4,84,70,19,11,48


In [49]:
randomDf1.head()

Unnamed: 0,0,1,2,3,4
0,30,78,49,46,66
1,48,46,7,95,75
2,42,52,33,47,21
3,78,53,90,7,43
4,77,79,57,74,50


In [47]:
randomDf1.tail()

Unnamed: 0,0,1,2,3,4
195,28,73,72,65,75
196,24,76,26,95,37
197,81,9,26,61,42
198,78,75,69,7,64
199,67,20,41,3,26


In [50]:
randomDf1.index

RangeIndex(start=0, stop=200, step=1)

In [51]:
randomDf1.columns

RangeIndex(start=0, stop=5, step=1)

In [52]:
randomDf1.to_numpy()

array([[ 30,  78,  49,  46,  66],
       [ 48,  46,   7,  95,  75],
       [ 42,  52,  33,  47,  21],
       [ 78,  53,  90,   7,  43],
       [ 77,  79,  57,  74,  50],
       [  1,  52,  62,  29,  42],
       [ 59,  34,  87,  44,  80],
       [ 84,  31,  56,  78,  90],
       [ 85,  15,  25,  99,  97],
       [ 56,  99,  51,  25,  88],
       [ 68,  71,  14,  22,  74],
       [ 43,  34,  72,  38,  25],
       [ 66,  87,  65,  19,  92],
       [ 21,  21,  38,  57,  54],
       [  3,  27,  68,  96,  40],
       [ 10,  73,  62,  74,  48],
       [  2,  21,  71,  77,  84],
       [ 64,  15,  29,  83,  29],
       [ 90,  28,  59,  45,  41],
       [ 75,  50,  54,  81,  31],
       [ 31,  36,  46,  58,  60],
       [ 89,  65,  72,  88,  64],
       [ 82,  53,   8,   4,  19],
       [ 47,  68,  24,  38,  55],
       [ 54,  24,  38,  43,  17],
       [ 62,  67,  35,  36,  30],
       [ 64,  81,  87,  47,  94],
       [ 24,  65,   9,  43,  27],
       [ 14,  22,  79,  92,  17],
       [ 46,  

In [55]:
sr2 = pd.Series([1,2,3], index=['a', 'b', 'c'])
sr2

a    1
b    2
c    3
dtype: int64

In [56]:
sr2['c']

3

In [57]:
df2 = pd.DataFrame([
    ['Tom', 28],
    ['Jerry', 31]
], columns=['Name', 'Age'])
df2

Unnamed: 0,Name,Age
0,Tom,28
1,Jerry,31


In [58]:
df2['City'] = ['Delhi', 'Mumbai']

In [59]:
df2

Unnamed: 0,Name,Age,City
0,Tom,28,Delhi
1,Jerry,31,Mumbai


In [60]:
df2.drop('Age', axis=1)

Unnamed: 0,Name,City
0,Tom,Delhi
1,Jerry,Mumbai


In [61]:
df2

Unnamed: 0,Name,Age,City
0,Tom,28,Delhi
1,Jerry,31,Mumbai


In [62]:
df2.drop('Age', axis=1, inplace=True)

In [63]:
df2

Unnamed: 0,Name,City
0,Tom,Delhi
1,Jerry,Mumbai
