In [1]:
import numpy as np
import pandas as pd

In [2]:
my_data_frame = pd.DataFrame(np.random.randn(5,5), 
                             ['first row', 'second row', 'third row', 'fourth row', 'fitfth row'],
                             ['1st col', '2nd col', '3rd col', '4th col', '5th col'])
print(my_data_frame)

             1st col   2nd col   3rd col   4th col   5th col
first row  -1.066436  0.664521 -0.479841  0.617402 -0.011785
second row -1.042316  0.752747  0.208166  0.335706 -0.773191
third row   0.956853 -1.682262 -0.150610  0.266887 -1.672178
fourth row -0.277181  0.120764  1.242129  0.384651 -0.506904
fitfth row  0.215965 -0.026273 -1.011498  0.850856 -0.387502


In [3]:
print(my_data_frame['2nd col']) # With [], we will only access columns
print(my_data_frame[['2nd col', '3rd col']])

first row     0.664521
second row    0.752747
third row    -1.682262
fourth row    0.120764
fitfth row   -0.026273
Name: 2nd col, dtype: float64
             2nd col   3rd col
first row   0.664521 -0.479841
second row  0.752747  0.208166
third row  -1.682262 -0.150610
fourth row  0.120764  1.242129
fitfth row -0.026273 -1.011498


In [10]:
print(my_data_frame.loc['first row']) 

1st col   -1.066436
2nd col    0.664521
3rd col   -0.479841
4th col    0.617402
5th col   -0.011785
Name: first row, dtype: float64


In [11]:
# Even though we labled the data frame, the index number is still exists behind the scenes
print(my_data_frame.iloc[0])

1st col   -1.066436
2nd col    0.664521
3rd col   -0.479841
4th col    0.617402
5th col   -0.011785
Name: first row, dtype: float64


In [12]:
# To access certain rows  and certain columns
print(my_data_frame.loc[['second row', 'third row'],['3rd col', '4th col']])

             3rd col   4th col
second row  0.208166  0.335706
third row  -0.150610  0.266887


In [18]:
# Logical indexing of dataframes
print (my_data_frame > 0)

            1st col  2nd col  3rd col  4th col  5th col
first row      True     True     True     True     True
second row     True     True    False     True    False
third row     False    False    False    False    False
fourth row    False     True     True     True    False
fitfth row     True     True    False     True    False


In [19]:
print(my_data_frame[my_data_frame > 0]) # in Numpy, this will convert a matrix to vector, but in pandas it will keep the
# original data structure and put the values as  'NAN' where it doesn't meet the condition

             1st col   2nd col   3rd col   4th col   5th col
first row   0.471175  0.959021  0.836000  0.219969  0.315774
second row  0.371254  0.300764       NaN  0.562478       NaN
third row        NaN       NaN       NaN       NaN       NaN
fourth row       NaN  0.678928  0.883617  1.754514       NaN
fitfth row  0.571990  0.641257       NaN  0.086236       NaN


In [22]:
#Adding additional column to a data frame
my_data_frame['6th col'] = np.random.randn(5,1)
print(my_data_frame)

             1st col   2nd col   3rd col   4th col   5th col   6th col
first row   0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.338523
third row  -1.845972 -0.084987 -0.087259 -0.435929 -0.626105  0.256997
fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511  0.121534
fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589  0.846489


In [25]:
# Removing/dropping a columns
print(my_data_frame.drop('1st col', axis=1)) # removes the column from my_data_frame (axis=1 means column, 0 refers to row) and 
# returns a new data frame. It doesn't change the original dataframe
print(my_data_frame)

#Removing a row
print(my_data_frame.drop('third row', axis=0))

             2nd col   3rd col   4th col   5th col   6th col
first row   0.959021  0.836000  0.219969  0.315774 -0.463640
second row  0.300764 -0.038665  0.562478 -1.281660  0.338523
third row  -0.084987 -0.087259 -0.435929 -0.626105  0.256997
fourth row  0.678928  0.883617  1.754514 -0.155511  0.121534
fitfth row  0.641257 -0.631506  0.086236 -0.676589  0.846489
             1st col   2nd col   3rd col   4th col   5th col   6th col
first row   0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.338523
third row  -1.845972 -0.084987 -0.087259 -0.435929 -0.626105  0.256997
fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511  0.121534
fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589  0.846489
             1st col   2nd col   3rd col   4th col   5th col   6th col
first row   0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.33

In [26]:
# Removing an index
print(my_data_frame.reset_index()) # this resets the indexes to 0, 1,2,.. and makes the previous labels as a series with
# column name as index. Again this doesn't change the original dataframe.
print (my_data_frame)

        index   1st col   2nd col   3rd col   4th col   5th col   6th col
0   first row  0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
1  second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.338523
2   third row -1.845972 -0.084987 -0.087259 -0.435929 -0.626105  0.256997
3  fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511  0.121534
4  fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589  0.846489
             1st col   2nd col   3rd col   4th col   5th col   6th col
first row   0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.338523
third row  -1.845972 -0.084987 -0.087259 -0.435929 -0.626105  0.256997
fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511  0.121534
fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589  0.846489


In [27]:
my_data_frame.reset_index(inplace=True) # inplace=true will make the changes apply to original data frame
print(my_data_frame)

        index   1st col   2nd col   3rd col   4th col   5th col   6th col
0   first row  0.471175  0.959021  0.836000  0.219969  0.315774 -0.463640
1  second row  0.371254  0.300764 -0.038665  0.562478 -1.281660  0.338523
2   third row -1.845972 -0.084987 -0.087259 -0.435929 -0.626105  0.256997
3  fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511  0.121534
4  fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589  0.846489


In [31]:
# adding a new index
# first we need to add a new series and set the index to the newly created series
my_data_frame['new series'] = ['data1','data2','data3','data4','data5']
print(my_data_frame)


                 index   1st col   2nd col   3rd col   4th col   5th col  \
new series                                                                 
data1        first row  0.471175  0.959021  0.836000  0.219969  0.315774   
data2       second row  0.371254  0.300764 -0.038665  0.562478 -1.281660   
data3        third row -1.845972 -0.084987 -0.087259 -0.435929 -0.626105   
data4       fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511   
data5       fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589   

             6th col new serries new series  
new series                                   
data1      -0.463640       data1      data1  
data2       0.338523       data2      data2  
data3       0.256997       data3      data3  
data4       0.121534       data4      data4  
data5       0.846489       data5      data5  


In [40]:
my_data_frame.drop('new series', axis=1, inplace=True)
#my_data_frame.set_index('new series', inplace=True)
print(my_data_frame)

                 index   1st col   2nd col   3rd col   4th col   5th col  \
new series                                                                 
data1        first row  0.471175  0.959021  0.836000  0.219969  0.315774   
data2       second row  0.371254  0.300764 -0.038665  0.562478 -1.281660   
data3        third row -1.845972 -0.084987 -0.087259 -0.435929 -0.626105   
data4       fourth row -0.692193  0.678928  0.883617  1.754514 -0.155511   
data5       fitfth row  0.571990  0.641257 -0.631506  0.086236 -0.676589   

             6th col  
new series            
data1      -0.463640  
data2       0.338523  
data3       0.256997  
data4       0.121534  
data5       0.846489  


In [41]:
my_data_frame.mean()

1st col   -0.224749
2nd col    0.498996
3rd col    0.192437
4th col    0.437454
5th col   -0.484818
6th col    0.219980
dtype: float64