In [1]:
import pandas as pd
import numpy as np

In [2]:
# A DataFrame is a collection of Series that share the same index:
# a tabular (rows x columns) data structure.

In [3]:
from numpy.random import randn

# Fix the seed of NumPy's global RNG so every run draws the same numbers.
np.random.seed(101)

# 5x4 matrix of standard-normal samples; used as the DataFrame's data below.
n_rows, n_cols = 5, 4
rand_mat = randn(n_rows, n_cols)
print(rand_mat)

[[ 2.70684984  0.62813271  0.90796945  0.50382575]
 [ 0.65111795 -0.31931804 -0.84807698  0.60596535]
 [-2.01816824  0.74012206  0.52881349 -0.58900053]
 [ 0.18869531 -0.75887206 -0.93323722  0.95505651]
 [ 0.19079432  1.97875732  2.60596728  0.68350889]]


In [4]:
# Build the DataFrame from the random matrix: rows are labelled a-e,
# columns are named w-z.
dframe = pd.DataFrame(
    data=rand_mat,
    index=list('abcde'),
    columns=list('wxyz'),
)

In [5]:
# show the whole frame as plain text
print(dframe)

          w         x         y         z
a  2.706850  0.628133  0.907969  0.503826
b  0.651118 -0.319318 -0.848077  0.605965
c -2.018168  0.740122  0.528813 -0.589001
d  0.188695 -0.758872 -0.933237  0.955057
e  0.190794  1.978757  2.605967  0.683509


In [6]:
# check the type of the object we just built
type(dframe)

pandas.core.frame.DataFrame

In [7]:
# select a single column by name -> returns a Series
dframe['w']

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [8]:
# select a row by its index label (.loc is label-based) -> returns a Series
dframe.loc['a']

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [9]:
# select a row by integer position (.iloc is position-based); position 0 is row 'a'
dframe.iloc[0]

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [10]:
# custom selection with .loc: a list of row labels ['', ''], then a list of column labels ['', '']
dframe.loc[['a', 'b'],['y', 'z']]

Unnamed: 0,y,z
a,0.907969,0.503826
b,-0.848077,0.605965


In [11]:
# element-wise comparison -> boolean DataFrame with the same index and columns
df_bool = dframe > 0

In [12]:
# filter by logical condition: with a boolean DataFrame as the mask, values where
# the mask is False become NaN (the frame keeps its shape)
dframe[df_bool]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [13]:
# comparing a single column gives a boolean Series, one value per row
dframe['w'] > 0

a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [14]:
# a boolean Series selects whole rows: only rows where 'w' > 0 are kept
dframe[dframe['w'] > 0]


Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [15]:
# multiple conditional selection: use & and | instead of 'and' / 'or',
# and wrap each comparison in parentheses
dframe[(dframe['w']>0) & (dframe['y']>1)]

Unnamed: 0,w,x,y,z
e,0.190794,1.978757,2.605967,0.683509


In [16]:
# select a specific element from the filtered rows:
# build the row mask once, pick the rows and the 'w' column in a single .loc
# call (instead of chaining [] lookups), then take the value at label 'e'
row_mask = (dframe['w'] > 0) & (dframe['y'] > 1)
dframe.loc[row_mask, 'w'].loc['e']

0.19079432237171562

In [17]:
# return the index (row labels)
dframe.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [18]:
# move the index into a regular column named 'index';
# by default the new index is integers starting from 0.
# Returns a new DataFrame -- dframe itself is not modified.
dframe.reset_index()

Unnamed: 0,index,w,x,y,z
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057
4,e,0.190794,1.978757,2.605967,0.683509


In [19]:
# changing the index: add a 'States' column (this modifies dframe in place),
# then display a copy of the frame indexed by that column.
# set_index returns a new DataFrame; the result is not assigned here,
# so dframe keeps its original a-e index.
states = ['CO', 'CT', 'DC', 'TX', 'NY']
dframe['States'] = states
dframe.set_index('States')

Unnamed: 0_level_0,w,x,y,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,2.70685,0.628133,0.907969,0.503826
CT,0.651118,-0.319318,-0.848077,0.605965
DC,-2.018168,0.740122,0.528813,-0.589001
TX,0.188695,-0.758872,-0.933237,0.955057
NY,0.190794,1.978757,2.605967,0.683509


In [20]:
# summary of the df
# NOTE(review): without parentheses this only displays the bound method's repr;
# the method is not called. Use dframe.info() to get the actual summary.
dframe.info

<bound method DataFrame.info of           w         x         y         z States
a  2.706850  0.628133  0.907969  0.503826     CO
b  0.651118 -0.319318 -0.848077  0.605965     CT
c -2.018168  0.740122  0.528813 -0.589001     DC
d  0.188695 -0.758872 -0.933237  0.955057     TX
e  0.190794  1.978757  2.605967  0.683509     NY>

In [21]:
# call info() to print the summary: index range, columns with non-null counts
# and dtypes, and memory usage
dframe.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   w       5 non-null      float64
 1   x       5 non-null      float64
 2   y       5 non-null      float64
 3   z       5 non-null      float64
 4   States  5 non-null      object 
dtypes: float64(4), object(1)
memory usage: 412.0+ bytes


In [22]:
# dtype of each column
dframe.dtypes

w         float64
x         float64
y         float64
z         float64
States     object
dtype: object

In [23]:
# descriptive statistics for the numeric columns
# (the object-dtype 'States' column is left out of the result)
dframe.describe()

Unnamed: 0,w,x,y,z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


In [24]:
# count booleans to summarize: how many 'w' values are positive vs. not
ser_w = dframe['w'].gt(0)
print(ser_w.value_counts())

w
True     4
False    1
Name: count, dtype: int64
