In this textbook, we will focus on the mechanics of using Series, DataFrame, and related structures effectively. 

In [0]:
import pandas as pd

The Pandas Series Object 

A Pandas Series is a one-dimensional array of indexed data. It can be created from a list or array as follows:


In [3]:
# One-dimensional Series built from a plain list; no index is supplied,
# so pandas assigns default integer labels.
data = pd.Series(data=[0, 0.24, 0.45, 1.0])
data

0    0.00
1    0.24
2    0.45
3    1.00
dtype: float64

In [4]:
# .values exposes the underlying data as a NumPy array.
data.values

array([0.  , 0.24, 0.45, 1.  ])

In [5]:
# .index holds the labels; since none were given, it is a default RangeIndex.
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Same kind of Series, but with explicit string labels instead of the
# default integer positions.
data = pd.Series(
    [0.25, 0.35, 0.45, 0.56],
    index=['a', 'b', 'c', 'd'],
)
data

a    0.25
b    0.35
c    0.45
d    0.56
dtype: float64

In [8]:
# Look up a single value by its index label.
data['b']

0.35

In [10]:
# Index labels do not have to be contiguous or sorted integers.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
data


2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

Creating a Series from a dictionary

In [13]:
# A dict maps directly onto a Series: keys become index labels, values the data.
# NOTE(review): 'londaon' looks like a typo, and 'california' is cased
# differently from the 'California' key used in area_dict later on -- confirm.
population_dict = {
    'california': 76895353,
    'Texas': 8232829282,
    'londaon': 9392323,
}
pop = pd.Series(population_dict)
pop

california      76895353
Texas         8232829282
londaon          9392323
dtype: int64

In [14]:
# Dictionary-style lookup of one entry by its index label.
pop['california']

76895353

Constructing Series objects

In [15]:
# A scalar value is repeated to fill every label of the given index.
pd.Series(5,index=[100,200,300])

100    5
200    5
300    5
dtype: int64

In [16]:
# Built from a dict: the keys become the index labels, the values the data.
pd.Series({2:'a',3:'b'})

2    a
3    b
dtype: object

In [17]:
 # An explicit index selects and orders entries from the dict: only keys 3 and 2
 # are kept, in that order, and key 1 is left out of the result.
 pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])


3    c
2    a
dtype: object

In [19]:
# Area values keyed by state name, wrapped in a Series.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(area_dict)
area


California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [21]:
 # Combine the two Series into a DataFrame; rows are aligned on index labels.
 # pop and area share only the 'Texas' label, so every other row ends up with a
 # missing value in one column (note 'california' vs 'California').
 states = pd.DataFrame({'population': pop,'area': area})        
 states


Unnamed: 0,population,area
California,,423967.0
Florida,,170312.0
Illinois,,149995.0
New York,,141297.0
Texas,8232829000.0,695662.0
california,76895350.0,
londaon,9392323.0,


In [22]:
# Selecting a column by name returns that column as a Series.
states['area']

California    423967.0
Florida       170312.0
Illinois      149995.0
New York      141297.0
Texas         695662.0
california         NaN
londaon            NaN
Name: area, dtype: float64

From a list of dicts. Any list of dictionaries can be made into a DataFrame. We’ll use a simple list comprehension to create some data:


In [24]:
# Each dict is one row of the frame; its keys name the columns.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [25]:
# Keys that are absent from a record are left as missing values (NaN) in that row.
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])


Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


The Pandas Index Object

In [27]:
 # Build an Index directly from a list of integers and display it.
 ind = pd.Index([2, 3, 5, 7, 11])        
 ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [28]:
# An Index supports positional access, like an array.
ind[1]

3

In [29]:
# Slicing with a step returns a new Index (every second label here).
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [32]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

# Symmetric difference: labels that appear in exactly one of the two indexes.
# The explicit method is used because the ^ operator form of this set
# operation was deprecated and, from pandas 2.0, performs an element-wise
# XOR instead of a set operation.
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

In [33]:
# Intersection: labels present in both indexes. The explicit method is used
# because the & operator form was deprecated and, from pandas 2.0, performs an
# element-wise AND instead of a set operation.
indA.intersection(indB)


Int64Index([3, 5, 7], dtype='int64')

In [34]:
# Union: labels present in either index. The explicit method is used because
# the | operator form was deprecated and, from pandas 2.0, performs an
# element-wise OR instead of a set operation.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

**Data Selection in Series**

In [35]:
# Label-indexed Series used by the selection examples that follow.
# pandas is already imported as pd in the notebook's first code cell, so it is
# not imported again here.
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data


a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [36]:
# Membership is tested against the index labels, like the keys of a dict.
'a' in data

True

In [37]:
# keys() returns the index labels of the Series.
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [46]:
# Make a copy of the Series and display it.
data_2=data.copy()
data_2

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [50]:
# One-row frame with a float column and an int column; show its column labels.
dff = pd.DataFrame({
    'float': [1.0],
    'int': [1],
})
dff.columns

Index(['float', 'int'], dtype='object')

In [51]:
# The dtype of each column, reported as a Series keyed by column name.
dff.dtypes

float    float64
int        int64
dtype: object

In [52]:
# A frame that has a single column 'A' but no rows.
df_empty = pd.DataFrame(data={'A': []})
df_empty

Unnamed: 0,A


In [53]:
# .empty is True here because the frame holds no rows.
df_empty.empty

True

In [55]:
# Two rows of integers with the columns named a, b and c.
df1 = pd.DataFrame(
    data=[[0, 2, 3], [0, 9, 8]],
    columns=['a', 'b', 'c'],
)
df1

Unnamed: 0,a,b,c
0,0,2,3
1,0,9,8


In [57]:
# Read the single value at integer position (row 0, column 0) with .iat.
df1.iat[0,0]

0

In [59]:
# Set the value at integer position (row 0, column 0); this modifies df1 in
# place, so the frame displayed below differs from the one shown earlier.
df1.iat[0,0]=90
df1

Unnamed: 0,a,b,c
0,90,2,3
1,0,9,8


In [0]:
# Three row records sharing the keys a-d, turned into a DataFrame.
# (Despite its name, mydict is a list of dicts, not a dict.)
mydict = [
    {'a': 1, 'b': 2, 'c': 3, 'd': 4},
    {'a': 100, 'b': 200, 'c': 300, 'd': 400},
    {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000},
]
df = pd.DataFrame(mydict)

In [61]:
# Display the frame built from mydict.
df

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


INDEXING JUST THE ROWS

In [62]:
# A single integer selects one row by position; it comes back as a Series.
df.iloc[0]

a    1
b    2
c    3
d    4
Name: 0, dtype: int64

In [63]:
# A row position and a column position together select a single value.
df.iloc[0,1]

2

In [64]:
# A list of positions selects several rows and returns a DataFrame.
df.iloc[[0,1]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400


With a slice object:

In [65]:
# Positional slice; the end is exclusive, so :3 covers rows 0-2 (the whole frame here).
df.iloc[:3]

Unnamed: 0,a,b,c,d
0,1,2,3,4
1,100,200,300,400
2,1000,2000,3000,4000


In [66]:
# A boolean list with one entry per row keeps only the rows marked True.
df.iloc[[True,False,True]]

Unnamed: 0,a,b,c,d
0,1,2,3,4
2,1000,2000,3000,4000


ValueError: ignored