In [1]:
import pandas as pd
import numpy as np

# Column-oriented data for the five BRICS countries: every key becomes a
# DataFrame column and the rows receive the default 0..4 integer index.
brics = pd.DataFrame(
    {
        'country': ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
        'capital': ['Brasilia', 'Moscow', 'New Delhi', 'Beijing', 'Pretoria'],
        'area': [8.516, 17.10, 3.286, 9.597, 1.221],
        'population': [200.4, 143.5, 1252, 1357, 52.98],
    }
)

In [2]:
# Quick overview of the DataFrame.
print(brics.head()) # first 5 rows of the dataframe
print(brics.tail()) # last 5 rows of the dataframe

# info() writes its report straight to stdout and returns None, so it is called
# on its own; wrapping it in print() would emit a stray "None" after the report.
print()
brics.info() # index range, column dtypes, non-null counts and memory usage

print('\n',brics.describe()) # summary statistics of the dataframe. Only for numeric columns.

        country    capital    area  population
0        Brazil   Brasilia   8.516      200.40
1        Russia     Moscow  17.100      143.50
2         India  New Delhi   3.286     1252.00
3         China    Beijing   9.597     1357.00
4  South Africa   Pretoria   1.221       52.98
        country    capital    area  population
0        Brazil   Brasilia   8.516      200.40
1        Russia     Moscow  17.100      143.50
2         India  New Delhi   3.286     1252.00
3         China    Beijing   9.597     1357.00
4  South Africa   Pretoria   1.221       52.98
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     5 non-null      object 
 1   capital     5 non-null      object 
 2   area        5 non-null      float64
 3   population  5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes

 None

             area   popul

In [3]:
# to_numpy() returns the cell values of the dataframe as a 2d numpy array.
# It is the accessor pandas recommends in place of the older `.values` attribute.
print(brics.to_numpy()) # values of the dataframe as a 2d numpy array
print(type(brics.to_numpy())) # type of the returned object (numpy.ndarray)

[['Brazil' 'Brasilia' 8.516 200.4]
 ['Russia' 'Moscow' 17.1 143.5]
 ['India' 'New Delhi' 3.286 1252.0]
 ['China' 'Beijing' 9.597 1357.0]
 ['South Africa' 'Pretoria' 1.221 52.98]]
<class 'numpy.ndarray'>


In [4]:
# Show the column labels first and the row labels second, one object per line.
print(brics.columns, brics.index, sep='\n')

Index(['country', 'capital', 'area', 'population'], dtype='object')
RangeIndex(start=0, stop=5, step=1)


In [5]:
# np.array() accepts a DataFrame directly and returns its cell values as a
# 2d numpy array; the column names and the row index are not part of the array.
arr = np.array(brics)
print(arr) # values of the dataframe as a 2d numpy array

[['Brazil' 'Brasilia' 8.516 200.4]
 ['Russia' 'Moscow' 17.1 143.5]
 ['India' 'New Delhi' 3.286 1252.0]
 ['China' 'Beijing' 9.597 1357.0]
 ['South Africa' 'Pretoria' 1.221 52.98]]


In [6]:
# Sorting the rows of a DataFrame

# sort_values() orders the rows by the given column. `ascending` defaults to True,
# so ascending=False is needed here to put the largest area first.
# The method returns a new, sorted DataFrame and leaves the original untouched, so
# the result is assigned back to `brics`. Passing inplace=True would mutate the
# frame instead, but reassigning is preferred because it allows method chaining.
brics = brics.sort_values('area', ascending=False)
print(brics)

# To sort the dataframe by multiple columns, pass a list of column names to the sort_values() method.
# `ascending` can then be a list of booleans, one per column, in the same order as the column list.
brics = brics.sort_values(['area', 'population'], ascending=[True, True])
print('\n',brics)

        country    capital    area  population
1        Russia     Moscow  17.100      143.50
3         China    Beijing   9.597     1357.00
0        Brazil   Brasilia   8.516      200.40
2         India  New Delhi   3.286     1252.00
4  South Africa   Pretoria   1.221       52.98

         country    capital    area  population
4  South Africa   Pretoria   1.221       52.98
2         India  New Delhi   3.286     1252.00
0        Brazil   Brasilia   8.516      200.40
3         China    Beijing   9.597     1357.00
1        Russia     Moscow  17.100      143.50


In [7]:
# subsetting dataframes

# Single brackets around one column label return that column as a pandas Series.
print(brics['country'])
# A list of labels (double brackets) keeps the result a DataFrame, even for a single column.
print(brics[['country']])

# A boolean Series used as the indexer keeps only the rows where it is True;
# here that is every row whose area is greater than 8. The result is a DataFrame.
print(brics[brics['area'].gt(8)])

4    South Africa
2           India
0          Brazil
3           China
1          Russia
Name: country, dtype: object
        country
4  South Africa
2         India
0        Brazil
3         China
1        Russia
  country   capital    area  population
0  Brazil  Brasilia   8.516       200.4
3   China   Beijing   9.597      1357.0
1  Russia    Moscow  17.100       143.5


In [8]:
# isin() checks every country against the list and yields a boolean Series:
# True where the value equals any of the listed names (an OR over equality tests).
condition = brics['country'].isin(['Brazil', 'Russia', 'India'])
# Boolean indexing keeps only the rows flagged True.
print(brics.loc[condition])

  country    capital    area  population
2   India  New Delhi   3.286      1252.0
0  Brazil   Brasilia   8.516       200.4
1  Russia     Moscow  17.100       143.5


In [9]:
# Small example frame, one row per dog.
dogs = pd.DataFrame(
    [('Bella', 1, 40), ('Charlie', 2, 60), ('Lucy', 3, 50)],
    columns=['name', 'height', 'weight'],
)
print(dogs)

# Element-wise weight / height**2; the result is an unnamed float Series.
print(dogs['weight'].div(dogs['height'].pow(2)))

# Assigning to a new label adds that column to `dogs` itself.
dogs['bmi'] = dogs['weight'].div(dogs['height'].pow(2))
print(dogs)

# `del` removes the column again, restoring the original three columns.
del dogs['bmi']
print(dogs)

      name  height  weight
0    Bella       1      40
1  Charlie       2      60
2     Lucy       3      50
0    40.000000
1    15.000000
2     5.555556
dtype: float64
      name  height  weight        bmi
0    Bella       1      40  40.000000
1  Charlie       2      60  15.000000
2     Lucy       3      50   5.555556
      name  height  weight
0    Bella       1      40
1  Charlie       2      60
2     Lucy       3      50


In [10]:
# Position-based selection with iloc: a list of positions keeps the result a
# DataFrame, while a bare integer position returns the row as a Series.
print(
    dogs.iloc[[0]],  # first row of the dataframe, as a dataframe
    dogs.iloc[0],    # first row of the dataframe, as a series
    sep='\n',
)

    name  height  weight
0  Bella       1      40
name      Bella
height        1
weight       40
Name: 0, dtype: object


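Working with indexes in pandas. The advantage of setting a meaningful index is that rows can be looked up by label with the `loc` accessor, which makes lookups easier; the `iloc` accessor remains available for position-based lookups.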

In [39]:
# Setting a column as the index of a DataFrame
# set_index() returns a new dataframe whose row labels are the 'name' values;
# `dogs` itself keeps its 'name' column.
dogs_name = dogs.set_index(keys='name')
print(dogs_name)

# .loc with a single label returns the row labelled 'Bella' as a series.
dogs_bella = dogs_name.loc['Bella']
print('\n', dogs_bella)

# .loc with a list holding one label returns the same row as a one-row dataframe.
dogs_bella = dogs_name.loc[['Bella']]
print('\n', dogs_bella)

# reset_index() returns a new dataframe in which the old index is an ordinary
# column again and the rows carry a fresh integer index.
dogs_reset = dogs_name.reset_index()
print('\n', dogs_reset)

# With drop=True the old index is discarded instead of being turned back into a column.
dogs_reset = dogs_name.reset_index(drop=True)
print('\n', dogs_reset)

# Several columns can be set as the index at once. The result is a multi-level
# index in which the inner level ('height') is nested inside the outer level ('name').
dogs_indes = dogs.set_index(keys=['name', 'height'])
print('\n', dogs_indes)

# A list of plain labels selects on the outer level only. Note that `dogs_indes`
# is overwritten here, so the steps below operate on the Bella/Lucy subset.
dogs_indes = dogs_indes.loc[['Bella', 'Lucy']]
print('\n', dogs_indes)

# To select on the inner level as well, pass (outer, inner) tuples.
print('\n', dogs_indes.loc[[('Bella', 1), ('Lucy', 3)]])

# sort_index() returns a new dataframe with the rows ordered by their index labels.
dogs_indes = dogs_indes.sort_index()
print('\n', dogs_indes)

# `level` names the index levels to sort by and `ascending` gives the direction
# for each of them, in the same order. A new dataframe is returned here too.
dogs_indes = dogs_indes.sort_index(
    level=['name', 'height'],
    ascending=[True, False],
)
print('\n', dogs_indes)

         height  weight
name                   
Bella         1      40
Charlie       2      60
Lucy          3      50

 height     1
weight    40
Name: Bella, dtype: int64

        height  weight
name                 
Bella       1      40

       name  height  weight
0    Bella       1      40
1  Charlie       2      60
2     Lucy       3      50

    height  weight
0       1      40
1       2      60
2       3      50

                 weight
name    height        
Bella   1           40
Charlie 2           60
Lucy    3           50

               weight
name  height        
Bella 1           40
Lucy  3           50

               weight
name  height        
Bella 1           40
Lucy  3           50

               weight
name  height        
Bella 1           40
Lucy  3           50

               weight
name  height        
Bella 1           40
Lucy  3           50
