## Installation

In [1]:
import pandas as pd
import numpy as np
pd.__version__

'2.0.3'

In [2]:
pd.<Tab> # display all the contents of the pandas namespace

SyntaxError: invalid syntax (170531711.py, line 1)

In [3]:
pd?

# The pandas Series Object

A pandas series is a one-dimensional array of indexed data.It can be created from a list of array as follows:

In [4]:
data = pd.Series([0.25,0.5,0.75,1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [5]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [6]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
data[1]

0.5

In [8]:
data[0]

0.25

In [9]:
data[1:3]

1    0.50
2    0.75
dtype: float64

The difference of pandas sereis and numpy object is that while the Numpy Array has an impicitly defined integer index used to access the values, the pandas series has an explicitly defined associated with the values. 

In [10]:
data = pd.Series([0.25,0.5,0.75,1.0],index=['a','b','c','d'])

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [11]:
data['b']

0.5

In [12]:
# we can also use non-contiguous or non-sequential indices:

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                index = [2,5,3,7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

## Series as specialized dictionary

In [13]:
population_dict = {'Dhaka':3833252,
                  'Chittagong':2644819,
                  'Rajshahi':1954321,
                  'Khulna':2829102}
population = pd.Series(population_dict)
population

Dhaka         3833252
Chittagong    2644819
Rajshahi      1954321
Khulna        2829102
dtype: int64

In [14]:
population['Dhaka']

3833252

In [15]:
population['Chittagong':'Khulna']

Chittagong    2644819
Rajshahi      1954321
Khulna        2829102
dtype: int64

## Constructing series objects

In [16]:
# data can be scalar
pd.Series(5,index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [17]:
# data can be dictionary , in which index defaults to the sorted dictionary keys:

pd.Series({2:'a',1:'b',3:'c'})

2    a
1    b
3    c
dtype: object

In [18]:
# the index can be explicitly set if a different result is preferred:

pd.Series({2:'a',1:'b',3:'c'},index=[3,2])

3    c
2    a
dtype: object

## The Pandas DataFrame Object

In [19]:
population

Dhaka         3833252
Chittagong    2644819
Rajshahi      1954321
Khulna        2829102
dtype: int64

In [20]:
area_dict = {'Dhaka':232,'Chittagong':111,'Rajshahi':145,'Khulna':543}
area = pd.Series(area_dict)
area

Dhaka         232
Chittagong    111
Rajshahi      145
Khulna        543
dtype: int64

In [21]:
# creating 2-d object 

states = pd.DataFrame({'population':population,
                      'area':area})
states

Unnamed: 0,population,area
Dhaka,3833252,232
Chittagong,2644819,111
Rajshahi,1954321,145
Khulna,2829102,543


In [22]:
states.index

Index(['Dhaka', 'Chittagong', 'Rajshahi', 'Khulna'], dtype='object')

In [23]:
states.columns

Index(['population', 'area'], dtype='object')

In [24]:
states['area'] # as specialized dictionary

Dhaka         232
Chittagong    111
Rajshahi      145
Khulna        543
Name: area, dtype: int64

so, A dataframe is a collection of series objects , and a single-column Dataframe can be constructed from a single series:

In [25]:
pd.DataFrame(population,columns=['population'])

Unnamed: 0,population
Dhaka,3833252
Chittagong,2644819
Rajshahi,1954321
Khulna,2829102


### From a list of dicts

In [26]:
data = [{'a':i,'b':2*i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [27]:
pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


### From a dictionary of series objects

In [29]:
pd.DataFrame({'population':population,'Area':area})

Unnamed: 0,population,Area
Dhaka,3833252,232
Chittagong,2644819,111
Rajshahi,1954321,145
Khulna,2829102,543


### From a 2-d numpy array

In [38]:
pd.DataFrame(np.random.rand(3,2),
            columns = ['foo','bar'],
            index = ['a','b','c'])

Unnamed: 0,foo,bar
a,0.693623,0.184737
b,0.460908,0.574532
c,0.998346,0.857621


### From a numpy structured array

In [31]:
A = np.zeros(3,dtype=[('A','i8'),('B','f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [32]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# The Pandas Index Object

This index object is an interesting structure in itself, and it can be thought of either as an immutable array or as an ordered set

In [33]:
ind = pd.Index([2,3,5,7,11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

## Index as immutable array

In [34]:
ind[1]

3

In [35]:
ind[::2]

Index([2, 5, 11], dtype='int64')

In [36]:
print(ind.size , ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [37]:
# diff between index objects and numpy arrays is that indices are immutable 
ind[1] = 0

TypeError: Index does not support mutable operations

## Index as ordered set

In [38]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])
indA & indB # intersection

Index([0, 3, 5, 7, 9], dtype='int64')

In [39]:
indA | indB #union

Index([3, 3, 5, 7, 11], dtype='int64')

In [41]:
indA ^ indB #symmetric difference

Index([3, 0, 0, 0, 2], dtype='int64')

# Data Selection is Series

In [42]:
data = pd.Series ([0.25,0.5,0.75,1.0],
                 index=['a','b','c','d'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [43]:
data['b']

0.5

In [44]:
'a' in data

True

In [45]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [46]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [47]:
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

## Series as 1-d array

In [48]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [49]:
data[0:2]

a    0.25
b    0.50
dtype: float64

In [50]:
data[(data > 0.3) & (data<0.8)] # masking

b    0.50
c    0.75
dtype: float64

In [51]:
# fancy indexing
data[['a','e']]

a    0.25
e    1.25
dtype: float64

## Indexers: loc,iloc,ix

In [52]:
data = pd.Series(['a','b','c'],index=[1,3,5])
data

1    a
3    b
5    c
dtype: object

In [53]:
data[1]

'a'

In [54]:
data[1:3]

3    b
5    c
dtype: object

loc attribute allows indexing and slicing that always references the explicit index

In [55]:
data.loc[1]

'a'

In [56]:
data.loc[1:3]

1    a
3    b
dtype: object

iloc attribute allows indexing and slicing that always references the implicit python - style index:

In [57]:
data.iloc[1]

'b'

In [58]:
data.iloc[1:3]

3    b
5    c
dtype: object

## DataFrame as a dictionary

In [59]:
area = pd.Series({'California':423967, 'Texas':695662,
                 'New York':141297, 'Florida': 170312,
                 'Illinois':149995})
pop = pd.Series({'California':38332521,'Texas':26448193,
                'New York':19651127,'Florida':19552860,
                'Illinois':12882135})
data = pd.DataFrame({'area':area,'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [60]:
data['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [61]:
data['pop']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [62]:
data.area # attribute style access

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [63]:
data.pop

<bound method DataFrame.pop of               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135>

In [64]:
data.area is data['area']

True

The attribute style access always doesn't work. Like if the column name is not string or conflicts with method name

In [65]:
data.pop is data['pop']

False

In [66]:
# making a new column

data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


## DataFrame as 2-D array

In [67]:
data.values

array([[4.23967000e+05, 3.83325210e+07, 9.04139261e+01],
       [6.95662000e+05, 2.64481930e+07, 3.80187404e+01],
       [1.41297000e+05, 1.96511270e+07, 1.39076746e+02],
       [1.70312000e+05, 1.95528600e+07, 1.14806121e+02],
       [1.49995000e+05, 1.28821350e+07, 8.58837628e+01]])

In [68]:
# transposing
data.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967.0,695662.0,141297.0,170312.0,149995.0
pop,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
density,90.41393,38.01874,139.0767,114.8061,85.88376


In [69]:
data.values[0]

array([4.23967000e+05, 3.83325210e+07, 9.04139261e+01])

In [70]:
data.iloc[:3,:2]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127


In [71]:
data.loc[:'Illinois',:'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [74]:
# ix indexer allows a hybrid of these two appraoches
data.ix[:3,:'pop']

AttributeError: 'DataFrame' object has no attribute 'ix'

In [75]:
# masking + fancy indexing

data.loc[data.density >100,['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


In [76]:
data.iloc[0,2]=90

In [77]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Texas,695662,26448193,38.01874
New York,141297,19651127,139.076746
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
