In [2]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import randn

In [4]:
np.random.seed(101)  # fix the seed of NumPy's global RNG so the same random numbers are generated each time

In [5]:
# Build a 5x4 frame of random draws with labelled rows (A-E) and columns (W-Z).
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [6]:
df['W']  # selecting one column returns a Series; a DataFrame is essentially a set of Series that share the same index

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [7]:
df[['W','X']]  # passing a list of column names returns a DataFrame with just those columns

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [8]:
# Create a new column as the element-wise sum of two existing columns.
df['New'] = df['W'].add(df['X'])
df

Unnamed: 0,W,X,Y,Z,New
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [9]:
# drop() looks for the given label in the row index by default (axis=0), i.e.
# among [A, B, C, D, E]. 'New' is not a row label but a column label
# ([W, X, Y, Z, New]), so the column axis has to be requested explicitly;
# `columns='New'` is the keyword spelling of `axis=1`.
# The result is reassigned instead of using inplace=True: the outcome is the
# same, but the statement stays chainable and does not mutate the object that
# earlier cells displayed.
df = df.drop(columns='New')

In [10]:
df  # display the frame to confirm the 'New' column is gone

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [11]:
# Drop the row labelled 'E'. Rows are the default axis for drop(); `index='E'`
# makes that explicit. The result is reassigned rather than dropped in place
# so the statement stays chainable and avoids inplace=True.
df = df.drop(index='E')

In [12]:
df  # display the frame to confirm row 'E' is gone

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [50]:
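# ROWS -- selecting rows with .loc (by label) and .iloc (by position).
# NOTE(review): the df values shown from here on differ from the frame displayed above, and the
# execution counts jump (12 -> 50); df appears to have been regenerated in a cell not shown -- confirm.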

In [52]:
df  # the frame used for the row-selection examples that follow

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [54]:
df.loc['A']  # label-based row selection; a single row is returned as a Series

W    0.093628
X    1.240813
Y   -1.097693
Z   -1.908009
Name: A, dtype: float64

In [56]:
df.iloc[2]  # position-based row selection: row 'C' sits at position 2, so this returns row C as a Series

W    0.178009
X   -0.626805
Y   -0.391089
Z    1.743477
Name: C, dtype: float64

In [58]:
df.loc['B','Y']  # row label and column label together return a single scalar value

-2.7369945956467303

In [73]:
df.loc[['A','B'],['Z','Y', 'X']]  # lists of row and column labels return a sub-DataFrame, in the order requested

Unnamed: 0,Z,Y,X
A,-1.908009,-1.097693,1.240813
B,1.522562,-2.736995,-1.666059


In [75]:
 booldf = df > 0  # element-wise comparison: a boolean DataFrame that is True where the value is greater than 0

In [76]:
booldf  # display the boolean mask

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,False,False,False,True
C,True,False,False,True
D,True,True,True,False


In [78]:
df[booldf]  # keeps the values where the mask is True; every other cell becomes NaN

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,,
B,,,,1.522562
C,0.178009,,,1.743477
D,1.130018,0.897796,0.330866,


In [79]:
df[df>0]  # boolean-frame mask written inline: values that are not greater than 0 become NaN

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,,
B,,,,1.522562
C,0.178009,,,1.743477
D,1.130018,0.897796,0.330866,


In [82]:
df["W"] > 0  # boolean Series: True for the rows whose W value is greater than 0

A     True
B    False
C     True
D     True
Name: W, dtype: bool

In [84]:
df[df['W']>0]  # conditional selection: indexing with a boolean Series keeps only the rows where it is True

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [85]:
df[df['Z']<0]  # all rows where Z is less than 0

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
D,1.130018,0.897796,0.330866,-1.063049


In [86]:
df  # the selections above returned new objects; df itself is unchanged

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
B,-0.380104,-1.666059,-2.736995,1.522562
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [87]:
resultdf = df[df['W']>0]  # keep the filtered rows (W > 0) in a variable so they can be indexed further

In [90]:
resultdf  # display the filtered frame

Unnamed: 0,W,X,Y,Z
A,0.093628,1.240813,-1.097693,-1.908009
C,0.178009,-0.626805,-0.391089,1.743477
D,1.130018,0.897796,0.330866,-1.063049


In [92]:
resultdf['X']  # column X of the filtered rows, returned as a Series

A    1.240813
C   -0.626805
D    0.897796
Name: X, dtype: float64

In [94]:
# Column X for the rows where W > 0, selected in one step.
# A single .loc[row_mask, column] call replaces the chained form
# df[mask]['X']: it gives the same Series without building an intermediate
# frame, and it is the form that also works safely for assignment.
df.loc[df['W'] > 0, 'X']

A    1.240813
C   -0.626805
D    0.897796
Name: X, dtype: float64

In [97]:
# Columns X and Z for the rows where W > 0, selected in one step.
# .loc[row_mask, [columns]] replaces the chained df[mask][[...]] form and
# returns the same sub-DataFrame without an intermediate copy.
df.loc[df['W'] > 0, ['X', 'Z']]

Unnamed: 0,X,Z
A,1.240813,-1.908009
C,-0.626805,1.743477
D,0.897796,-1.063049


In [103]:
# Combining conditions: Python's `and` / `or` keywords cannot be used here.
# They work on a single boolean at a time (e.g. True and False -> False), but
# df['W']>0 is a whole Series of booleans, so its truth value is ambiguous.
# Use `&` for "and" and `|` for "or" instead, and wrap each condition in
# parentheses.
df[(df['W']>0) & (df['Y']>0)]

Unnamed: 0,W,X,Y,Z
D,1.130018,0.897796,0.330866,-1.063049


In [110]:
df.reset_index()  # moves the old index into a column named 'index' and numbers the rows from 0; the result is not assigned here

Unnamed: 0,index,W,X,Y,Z
0,A,0.093628,1.240813,-1.097693,-1.908009
1,B,-0.380104,-1.666059,-2.736995,1.522562
2,C,0.178009,-0.626805,-0.391089,1.743477
3,D,1.130018,0.897796,0.330866,-1.063049


In [114]:
# One label per row, to be used as a replacement index.
newind = ['CA', 'NY', 'WY', 'OR']

In [116]:
df['States'] = newind  # add the labels as a regular column, one value per row

In [121]:
df.set_index('States')  # use the 'States' column as the index; the result is only displayed, not assigned back to df

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.093628,1.240813,-1.097693,-1.908009
NY,-0.380104,-1.666059,-2.736995,1.522562
WY,0.178009,-0.626805,-0.391089,1.743477
OR,1.130018,0.897796,0.330866,-1.063049


In [14]:
#DataFrames - Part 3

In [15]:
import numpy as np
import pandas as pd

In [19]:
# Index Levels: build a two-level index by pairing each outer group label
# with an inner number -> (G1, 1), (G1, 2), (G1, 3), (G2, 1), ...
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [21]:
hier_index  # display the MultiIndex built from the (group, number) pairs

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [22]:
# 6x2 frame of random draws, indexed by the two-level hier_index.
df = pd.DataFrame(
    np.random.randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [24]:
df  # display the frame with its two-level row index

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [26]:
df.loc['G1']  # selects everything under the outer label G1; the result is indexed by the inner level only

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [28]:
df.loc['G1'].loc[1]  # go one level deeper: outer label G1 first, then inner label 1 -> a Series for that row

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [31]:
df.index.names = ['Groups', 'Num']  # name the two index levels (outer level first, then inner)

In [33]:
df  # display the frame; the index levels now carry the names Groups and Num

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [35]:
# Single value at outer label G2, inner label 2, column B.
# One .loc call with an (outer, inner) tuple plus the column label replaces
# the three chained lookups .loc['G2'].loc[2]['B'], which build an
# intermediate frame and an intermediate Series just to reach one scalar.
df.loc[('G2', 2), 'B']

0.07295967531703869

In [38]:
df['A']  # NOTE: columns are selected with plain [column_name], not .loc; the returned Series keeps the two-level index

Groups  Num
G1      1      0.302665
        2     -1.706086
        3     -0.134841
G2      1      0.166905
        2      0.807706
        3      0.638787
Name: A, dtype: float64

In [39]:
df.xs(1,level='Num')  # cross-section: the rows whose 'Num' level equals 1, taken from every group

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
