#  Chapter 5 : Data Selection - DataFrames

In [1]:
import pandas as pd

## Introduction to DataFrames

### The need for data selection methods

In [3]:
GDP_data = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv')
GDP_data

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,31917.8
1,2015_Q2,All industries,32266.2
2,2015_Q3,All industries,32406.6
3,2015_Q4,All industries,32298.7
4,2016_Q1,All industries,32303.8
...,...,...,...
2129,2019_Q2,Government enterprises,371.4
2130,2019_Q3,Government enterprises,373.5
2131,2019_Q4,Government enterprises,375.1
2132,2020_Q1,Government enterprises,372.8


## Data selection in pandas DataFrames

### The index and it's forms

In [4]:
GDP_data.index

RangeIndex(start=0, stop=2134, step=1)

In [5]:
range(0,2134)

range(0, 2134)

In [6]:
for i in range(0,2134,1):
    if(i>0 and i<10):
        print(i)

1
2
3
4
5
6
7
8
9


In [7]:
print(list(range(0,2134,1))[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [8]:
for i in GDP_data.index:
    if(i>0 and i<10):
        print(i)

1
2
3
4
5
6
7
8
9


In [9]:
GDP_data.columns

Index(['period', 'Industry', 'GDP'], dtype='object')

## Exercise 5.01 - identifying the row and columns indices in a dataset

In [10]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')
gt_columns = list(gas_turbine_data.columns)
print(gt_columns,'\n (', len(gt_columns), ' columns )')

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO', 'NOX'] 
 ( 11  columns )


In [11]:
gas_turbine_data.index

RangeIndex(start=0, stop=7384, step=1)

In [12]:
gas_turbine_data['NOX'].max()

119.68

## Saving the index or columns

In [13]:
GDP_data_index = GDP_data.index
GDP_data_cols = GDP_data.columns
print('the index is type ',type(GDP_data_cols),'\n while the columns are type ',type(GDP_data_cols))
print('the second itme in the list is ', GDP_data_index[1], '\n and the second column is ', GDP_data_cols[1])

the index is type  <class 'pandas.core.indexes.base.Index'> 
 while the columns are type  <class 'pandas.core.indexes.base.Index'>
the second itme in the list is  1 
 and the second column is  Industry


In [14]:
bare_data = pd.read_csv('Chapter5-Datasets/bare_csv.csv', header = None)
print(bare_data.index, bare_data.columns)

RangeIndex(start=0, stop=19, step=1) Int64Index([0, 1], dtype='int64')


## Slicing and Indexing Methods

In [16]:
labels_vs_integers=pd.DataFrame({'values': [6,1,5,2], 'names':['oranges','apples','bananas', 'pears']})
labels_vs_integers

Unnamed: 0,values,names
0,6,oranges
1,1,apples
2,5,bananas
3,2,pears


In [17]:
labels_vs_integers.index = ['citrus','non-citrus','non_citrus','non_citrus']
labels_vs_integers

Unnamed: 0,values,names
citrus,6,oranges
non-citrus,1,apples
non_citrus,5,bananas
non_citrus,2,pears


In [19]:
print(labels_vs_integers.iloc[1:4,1])
print()
print(labels_vs_integers.loc['non_citrus', 'names'])

non-citrus     apples
non_citrus    bananas
non_citrus      pears
Name: names, dtype: object

non_citrus    bananas
non_citrus      pears
Name: names, dtype: object


In [21]:
print('using .iloc alone: ', labels_vs_integers.iloc[1,1])
print('using .loc to subset first, then using .iloc: ',labels_vs_integers.loc['non_citrus', :].iloc[1,1])

using .iloc alone:  apples
using .loc to subset first, then using .iloc:  pears


In [23]:
int_labels_vs_integers = pd.DataFrame({'species': ['feline','canine', 'canine','feline'], 'name': ['housecat','wolf', 'dingo', 'tiger']})
int_labels_vs_integers

Unnamed: 0,species,name
0,feline,housecat
1,canine,wolf
2,canine,dingo
3,feline,tiger


In [25]:
int_labels_vs_integers = int_labels_vs_integers.iloc[[0,2,3],:]
int_labels_vs_integers

Unnamed: 0,species,name
0,feline,housecat
2,canine,dingo
3,feline,tiger


In [26]:
int_labels_vs_integers.iloc[2,:]

species    feline
name        tiger
Name: 3, dtype: object

In [27]:
GDP_summary = GDP_data.iloc[:,[0,2]]
GDP_summary.head()

Unnamed: 0,period,GDP
0,2015_Q1,31917.8
1,2015_Q2,32266.2
2,2015_Q3,32406.6
3,2015_Q4,32298.7
4,2016_Q1,32303.8


In [28]:
GDP_summary=GDP_data.loc[:, ['period','GDP']]
GDP_summary

Unnamed: 0,period,GDP
0,2015_Q1,31917.8
1,2015_Q2,32266.2
2,2015_Q3,32406.6
3,2015_Q4,32298.7
4,2016_Q1,32303.8
...,...,...
2129,2019_Q2,371.4
2130,2019_Q3,373.5
2131,2019_Q4,375.1
2132,2020_Q1,372.8


In [29]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')
gas_turbine_data.columns

Index(['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO',
       'NOX'],
      dtype='object')

In [30]:
non_emissions_turbine_data = gas_turbine_data.loc[:,'AT':'CDP']
non_emissions_turbine_data.head()

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP
0,1.9532,1020.1,84.985,2.5304,20.116,1048.7,544.92,116.27,10.799
1,1.2191,1020.1,87.523,2.3937,18.584,1045.5,548.5,109.18,10.347
2,0.94915,1022.2,78.335,2.7789,22.264,1068.8,549.95,125.88,11.256
3,1.0075,1021.7,76.942,2.817,23.358,1075.2,549.63,132.21,11.702
4,1.2858,1021.6,76.732,2.8377,23.483,1076.2,549.68,133.58,11.737


### Boolean indexing

In [31]:
gas_turbine_data.loc[:, (gas_turbine_data.columns=='NOX') | (gas_turbine_data.columns=='CO')]

Unnamed: 0,CO,NOX
0,7.4491,113.250
1,6.4684,112.020
2,3.6335,88.147
3,3.1972,87.078
4,2.3833,82.515
...,...,...
7379,10.9930,89.172
7380,11.1440,88.849
7381,11.4140,96.147
7382,3.3134,64.738


## Exercise 5.02 - subsetting rows and columns

In [32]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')

In [33]:
print(gas_turbine_data.shape)
print(gas_turbine_data.columns)

(7384, 11)
Index(['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO',
       'NOX'],
      dtype='object')


In [34]:
print(gas_turbine_data.iloc[:100, :].loc[:,['CO','NOX']].describe())
print(gas_turbine_data.loc[:,['CO','NOX']].describe())

               CO         NOX
count  100.000000  100.000000
mean     3.774012   77.661970
std      1.774795   13.708632
min      0.475440   58.432000
25%      2.656625   64.672000
50%      3.501650   78.084000
75%      4.078250   85.121250
max     12.659000  118.270000
                CO          NOX
count  7384.000000  7384.000000
mean      3.129986    59.890509
std       2.234962    11.132464
min       0.212800    25.905000
25%       1.808175    52.399000
50%       2.533400    56.838500
75%       3.702550    65.093250
max      41.097000   119.680000


## Using labels as the index and the pandas multi-index

In [36]:
species = pd.Series(['cat','dog','pig','chicken'])
species = species.sample(100, replace = True, random_state=1).reset_index(drop=True)
location = pd.Series(['city','town','farm'])
location = location.sample(100, replace = True, random_state=2).reset_index(drop=True)
weight = pd.Series(range(10,110)).reset_index(drop=True)
fur = pd.Series(['long','short'])
fur = fur.sample(100,replace = True, random_state = 3).reset_index(drop=True)
color = pd.Series(['solid','spotted','striped'])
color = color.sample(100, replace = True, random_state =42).reset_index(drop = True)
animals = pd.DataFrame({'species':species, 'location':location, 'weight': weight, 'color': color, 'fur': fur})

In [38]:
pd.options.display.float_format = '{:,.2f}'.format
animals.sort_values(['location','species']).pivot_table(index = ['location','species'], columns = ['color','fur'], aggfunc = 'mean', fill_value='')

Unnamed: 0_level_0,Unnamed: 1_level_0,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_1,color,solid,solid,spotted,spotted,striped,striped
Unnamed: 0_level_2,fur,long,short,long,short,long,short
location,species,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
city,cat,74.0,23.5,33.0,84.0,20.0,12.0
city,chicken,44.0,91.33,63.67,,103.0,87.0
city,dog,15.0,44.0,64.0,75.0,39.0,51.0
city,pig,,86.0,40.0,,39.0,
farm,cat,49.0,85.0,85.0,82.0,69.0,24.33
farm,chicken,59.0,,81.0,43.0,16.0,75.67
farm,dog,102.0,99.0,49.2,58.0,44.5,
farm,pig,30.0,36.0,65.0,54.5,47.0,49.0
town,cat,,108.0,60.5,64.75,37.0,
town,chicken,52.5,81.0,,,,44.25


In [39]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv', index_col = [0,1]).sort_index(level = [0,1])
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,Accommodation,256.20
2015_Q1,Accommodation and food services,973.60
2015_Q1,Administrative and support services,795.70
2015_Q1,Administrative and waste management services,883.00
2015_Q1,"Agriculture, forestry, fishing, and hunting",466.30
...,...,...
2020_Q2,Warehousing and storage,135.10
2020_Q2,Waste management and remediation services,100.60
2020_Q2,Water transportation,32.00
2020_Q2,Wholesale trade,1810.90


In [40]:
GDP_by_industry.loc[('2017_Q2','Farms'):('2017_Q2','Finance and insurance')]

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2017_Q2,Farms,399.7
2017_Q2,Federal,1124.6
2017_Q2,"Federal Reserve banks, credit intermediation, and related activities",926.5
2017_Q2,Finance and insurance,2827.1


In [41]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv')
GDP_by_industry.loc[((GDP_by_industry['Industry']=='Farms')|(GDP_by_industry['Industry']=='Federal')| (GDP_by_industry['Industry']=='Federal Reserve banks, credit intermediation, and related activities')| (GDP_by_industry['Industry']=='Finance and insurance')) & (GDP_by_industry['period']=='2017_Q2'),:]

Unnamed: 0,period,Industry,GDP
75,2017_Q2,Farms,399.7
1197,2017_Q2,Finance and insurance,2827.1
1219,2017_Q2,"Federal Reserve banks, credit intermediation, ...",926.5
1967,2017_Q2,Federal,1124.6


In [42]:
GDP_by_industry.iloc[[75,1197,1219,1967],:]

Unnamed: 0,period,Industry,GDP
75,2017_Q2,Farms,399.7
1197,2017_Q2,Finance and insurance,2827.1
1219,2017_Q2,"Federal Reserve banks, credit intermediation, ...",926.5
1967,2017_Q2,Federal,1124.6


## Creating a multi-index from columns

In [43]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_industry.csv')
GDP_by_industry.head()

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,31917.8
1,2015_Q2,All industries,32266.2
2,2015_Q3,All industries,32406.6
3,2015_Q4,All industries,32298.7
4,2016_Q1,All industries,32303.8


In [44]:
GDP_index = pd.MultiIndex.from_frame(GDP_by_industry[['period','Industry']])
GDP_index

MultiIndex([('2015_Q1',         'All industries'),
            ('2015_Q2',         'All industries'),
            ('2015_Q3',         'All industries'),
            ('2015_Q4',         'All industries'),
            ('2016_Q1',         'All industries'),
            ('2016_Q2',         'All industries'),
            ('2016_Q3',         'All industries'),
            ('2016_Q4',         'All industries'),
            ('2017_Q1',         'All industries'),
            ('2017_Q2',         'All industries'),
            ...
            ('2018_Q1', 'Government enterprises'),
            ('2018_Q2', 'Government enterprises'),
            ('2018_Q3', 'Government enterprises'),
            ('2018_Q4', 'Government enterprises'),
            ('2019_Q1', 'Government enterprises'),
            ('2019_Q2', 'Government enterprises'),
            ('2019_Q3', 'Government enterprises'),
            ('2019_Q4', 'Government enterprises'),
            ('2020_Q1', 'Government enterprises'),
            ('2

In [45]:
GDP_by_industry.drop(columns = ['period','Industry'],inplace=True)
GDP_by_industry.set_index(GDP_index, drop=True, inplace=True)
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,All industries,31917.80
2015_Q2,All industries,32266.20
2015_Q3,All industries,32406.60
2015_Q4,All industries,32298.70
2016_Q1,All industries,32303.80
...,...,...
2019_Q2,Government enterprises,371.40
2019_Q3,Government enterprises,373.50
2019_Q4,Government enterprises,375.10
2020_Q1,Government enterprises,372.80


In [46]:
GDP_by_industry.sort_values(by = ['period', 'Industry'], inplace=True)
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,Accommodation,256.20
2015_Q1,Accommodation and food services,973.60
2015_Q1,Administrative and support services,795.70
2015_Q1,Administrative and waste management services,883.00
2015_Q1,"Agriculture, forestry, fishing, and hunting",466.30
...,...,...
2020_Q2,Warehousing and storage,135.10
2020_Q2,Waste management and remediation services,100.60
2020_Q2,Water transportation,32.00
2020_Q2,Wholesale trade,1810.90
