#  Chapter 5 : Data Selection - DataFrames

In [1]:
import pandas as pd

## Introduction to DataFrames

### The need for data selection methods

In [2]:
GDP_data = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv')
GDP_data

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,31917.8
1,2015_Q2,All industries,32266.2
2,2015_Q3,All industries,32406.6
3,2015_Q4,All industries,32298.7
4,2016_Q1,All industries,32303.8
...,...,...,...
2129,2019_Q2,Government enterprises,371.4
2130,2019_Q3,Government enterprises,373.5
2131,2019_Q4,Government enterprises,375.1
2132,2020_Q1,Government enterprises,372.8


## Data selection in pandas DataFrames

### The index and it's forms

In [3]:
GDP_data.index

RangeIndex(start=0, stop=2134, step=1)

In [4]:
range(0,2134)

range(0, 2134)

In [5]:
for i in range(0,2134,1):
    if(i>0 and i<10):
        print(i)

1
2
3
4
5
6
7
8
9


In [6]:
print(list(range(0,2134,1))[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [7]:
for i in GDP_data.index:
    if(i>0 and i<10):
        print(i)

1
2
3
4
5
6
7
8
9


In [8]:
GDP_data.columns

Index(['period', 'Industry', 'GDP'], dtype='object')

## Exercise 5.01 - identifying the row and columns indices in a dataset

In [9]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')
gt_columns = list(gas_turbine_data.columns)
print(gt_columns,'\n (', len(gt_columns), ' columns )')

['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO', 'NOX'] 
 ( 11  columns )


In [10]:
gas_turbine_data.index

RangeIndex(start=0, stop=7384, step=1)

In [11]:
gas_turbine_data['NOX'].max()

119.68

## Saving the index or columns

In [12]:
GDP_data_index = GDP_data.index
GDP_data_cols = GDP_data.columns
print('the index is type ',type(GDP_data_cols),'\n while the columns are type ',type(GDP_data_cols))
print('the second itme in the list is ', GDP_data_index[1], '\n and the second column is ', GDP_data_cols[1])

the index is type  <class 'pandas.core.indexes.base.Index'> 
 while the columns are type  <class 'pandas.core.indexes.base.Index'>
the second itme in the list is  1 
 and the second column is  Industry


In [13]:
bare_data = pd.read_csv('Chapter5-Datasets/bare_csv.csv', header = None)
print(bare_data.index, bare_data.columns)

RangeIndex(start=0, stop=19, step=1) Int64Index([0, 1], dtype='int64')


## Slicing and Indexing Methods

In [14]:
labels_vs_integers=pd.DataFrame({'values': [6,1,5,2], 'names':['oranges','apples','bananas', 'pears']})
labels_vs_integers

Unnamed: 0,values,names
0,6,oranges
1,1,apples
2,5,bananas
3,2,pears


In [15]:
labels_vs_integers.index = ['citrus','non-citrus','non_citrus','non_citrus']
labels_vs_integers

Unnamed: 0,values,names
citrus,6,oranges
non-citrus,1,apples
non_citrus,5,bananas
non_citrus,2,pears


In [16]:
print(labels_vs_integers.iloc[1:4,1])
print()
print(labels_vs_integers.loc['non_citrus', 'names'])

non-citrus     apples
non_citrus    bananas
non_citrus      pears
Name: names, dtype: object

non_citrus    bananas
non_citrus      pears
Name: names, dtype: object


In [17]:
print('using .iloc alone: ', labels_vs_integers.iloc[1,1])
print('using .loc to subset first, then using .iloc: ',labels_vs_integers.loc['non_citrus', :].iloc[1,1])

using .iloc alone:  apples
using .loc to subset first, then using .iloc:  pears


In [18]:
int_labels_vs_integers = pd.DataFrame({'species': ['feline','canine', 'canine','feline'], 'name': ['housecat','wolf', 'dingo', 'tiger']})
int_labels_vs_integers

Unnamed: 0,species,name
0,feline,housecat
1,canine,wolf
2,canine,dingo
3,feline,tiger


In [19]:
int_labels_vs_integers = int_labels_vs_integers.iloc[[0,2,3],:]
int_labels_vs_integers

Unnamed: 0,species,name
0,feline,housecat
2,canine,dingo
3,feline,tiger


In [20]:
int_labels_vs_integers.iloc[2,:]

species    feline
name        tiger
Name: 3, dtype: object

In [21]:
GDP_summary = GDP_data.iloc[:,[0,2]]
GDP_summary.head()

Unnamed: 0,period,GDP
0,2015_Q1,31917.8
1,2015_Q2,32266.2
2,2015_Q3,32406.6
3,2015_Q4,32298.7
4,2016_Q1,32303.8


In [22]:
GDP_summary=GDP_data.loc[:, ['period','GDP']]
GDP_summary

Unnamed: 0,period,GDP
0,2015_Q1,31917.8
1,2015_Q2,32266.2
2,2015_Q3,32406.6
3,2015_Q4,32298.7
4,2016_Q1,32303.8
...,...,...
2129,2019_Q2,371.4
2130,2019_Q3,373.5
2131,2019_Q4,375.1
2132,2020_Q1,372.8


In [23]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')
gas_turbine_data.columns

Index(['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO',
       'NOX'],
      dtype='object')

In [24]:
non_emissions_turbine_data = gas_turbine_data.loc[:,'AT':'CDP']
non_emissions_turbine_data.head()

Unnamed: 0,AT,AP,AH,AFDP,GTEP,TIT,TAT,TEY,CDP
0,1.9532,1020.1,84.985,2.5304,20.116,1048.7,544.92,116.27,10.799
1,1.2191,1020.1,87.523,2.3937,18.584,1045.5,548.5,109.18,10.347
2,0.94915,1022.2,78.335,2.7789,22.264,1068.8,549.95,125.88,11.256
3,1.0075,1021.7,76.942,2.817,23.358,1075.2,549.63,132.21,11.702
4,1.2858,1021.6,76.732,2.8377,23.483,1076.2,549.68,133.58,11.737


### Boolean indexing

In [25]:
gas_turbine_data.loc[:, (gas_turbine_data.columns=='NOX') | (gas_turbine_data.columns=='CO')]

Unnamed: 0,CO,NOX
0,7.4491,113.250
1,6.4684,112.020
2,3.6335,88.147
3,3.1972,87.078
4,2.3833,82.515
...,...,...
7379,10.9930,89.172
7380,11.1440,88.849
7381,11.4140,96.147
7382,3.3134,64.738


## Exercise 5.02 - subsetting rows and columns

In [26]:
gas_turbine_data = pd.read_csv('Chapter5-Datasets/gt_2015.csv')

In [27]:
print(gas_turbine_data.shape)
print(gas_turbine_data.columns)

(7384, 11)
Index(['AT', 'AP', 'AH', 'AFDP', 'GTEP', 'TIT', 'TAT', 'TEY', 'CDP', 'CO',
       'NOX'],
      dtype='object')


In [28]:
print(gas_turbine_data.iloc[:100, :].loc[:,['CO','NOX']].describe())
print(gas_turbine_data.loc[:,['CO','NOX']].describe())

               CO         NOX
count  100.000000  100.000000
mean     3.774012   77.661970
std      1.774795   13.708632
min      0.475440   58.432000
25%      2.656625   64.672000
50%      3.501650   78.084000
75%      4.078250   85.121250
max     12.659000  118.270000
                CO          NOX
count  7384.000000  7384.000000
mean      3.129986    59.890509
std       2.234962    11.132464
min       0.212800    25.905000
25%       1.808175    52.399000
50%       2.533400    56.838500
75%       3.702550    65.093250
max      41.097000   119.680000


## Using labels as the index and the pandas multi-index

In [29]:
species = pd.Series(['cat','dog','pig','chicken'])
species = species.sample(100, replace = True, random_state=1).reset_index(drop=True)
location = pd.Series(['city','town','farm'])
location = location.sample(100, replace = True, random_state=2).reset_index(drop=True)
weight = pd.Series(range(10,110)).reset_index(drop=True)
fur = pd.Series(['long','short'])
fur = fur.sample(100,replace = True, random_state = 3).reset_index(drop=True)
color = pd.Series(['solid','spotted','striped'])
color = color.sample(100, replace = True, random_state =42).reset_index(drop = True)
animals = pd.DataFrame({'species':species, 'location':location, 'weight': weight, 'color': color, 'fur': fur})

In [30]:
pd.options.display.float_format = '{:,.2f}'.format
animals.sort_values(['location','species']).pivot_table(index = ['location','species'], columns = ['color','fur'], aggfunc = 'mean', fill_value='')

Unnamed: 0_level_0,Unnamed: 1_level_0,weight,weight,weight,weight,weight,weight
Unnamed: 0_level_1,color,solid,solid,spotted,spotted,striped,striped
Unnamed: 0_level_2,fur,long,short,long,short,long,short
location,species,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
city,cat,74.0,23.5,33.0,84.0,20.0,12.0
city,chicken,44.0,91.33,63.67,,103.0,87.0
city,dog,15.0,44.0,64.0,75.0,39.0,51.0
city,pig,,86.0,40.0,,39.0,
farm,cat,49.0,85.0,85.0,82.0,69.0,24.33
farm,chicken,59.0,,81.0,43.0,16.0,75.67
farm,dog,102.0,99.0,49.2,58.0,44.5,
farm,pig,30.0,36.0,65.0,54.5,47.0,49.0
town,cat,,108.0,60.5,64.75,37.0,
town,chicken,52.5,81.0,,,,44.25


In [31]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv', index_col = [0,1]).sort_index(level = [0,1])
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,Accommodation,256.20
2015_Q1,Accommodation and food services,973.60
2015_Q1,Administrative and support services,795.70
2015_Q1,Administrative and waste management services,883.00
2015_Q1,"Agriculture, forestry, fishing, and hunting",466.30
...,...,...
2020_Q2,Warehousing and storage,135.10
2020_Q2,Waste management and remediation services,100.60
2020_Q2,Water transportation,32.00
2020_Q2,Wholesale trade,1810.90


In [32]:
GDP_by_industry.loc[('2017_Q2','Farms'):('2017_Q2','Finance and insurance')]

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2017_Q2,Farms,399.7
2017_Q2,Federal,1124.6
2017_Q2,"Federal Reserve banks, credit intermediation, and related activities",926.5
2017_Q2,Finance and insurance,2827.1


In [33]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv')
GDP_by_industry.loc[((GDP_by_industry['Industry']=='Farms')|(GDP_by_industry['Industry']=='Federal')| (GDP_by_industry['Industry']=='Federal Reserve banks, credit intermediation, and related activities')| (GDP_by_industry['Industry']=='Finance and insurance')) & (GDP_by_industry['period']=='2017_Q2'),:]

Unnamed: 0,period,Industry,GDP
75,2017_Q2,Farms,399.7
1197,2017_Q2,Finance and insurance,2827.1
1219,2017_Q2,"Federal Reserve banks, credit intermediation, ...",926.5
1967,2017_Q2,Federal,1124.6


In [34]:
GDP_by_industry.iloc[[75,1197,1219,1967],:]

Unnamed: 0,period,Industry,GDP
75,2017_Q2,Farms,399.7
1197,2017_Q2,Finance and insurance,2827.1
1219,2017_Q2,"Federal Reserve banks, credit intermediation, ...",926.5
1967,2017_Q2,Federal,1124.6


## Creating a multi-index from columns

In [35]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_industry.csv')
GDP_by_industry.head()

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,31917.8
1,2015_Q2,All industries,32266.2
2,2015_Q3,All industries,32406.6
3,2015_Q4,All industries,32298.7
4,2016_Q1,All industries,32303.8


In [36]:
GDP_index = pd.MultiIndex.from_frame(GDP_by_industry[['period','Industry']])
GDP_index

MultiIndex([('2015_Q1',         'All industries'),
            ('2015_Q2',         'All industries'),
            ('2015_Q3',         'All industries'),
            ('2015_Q4',         'All industries'),
            ('2016_Q1',         'All industries'),
            ('2016_Q2',         'All industries'),
            ('2016_Q3',         'All industries'),
            ('2016_Q4',         'All industries'),
            ('2017_Q1',         'All industries'),
            ('2017_Q2',         'All industries'),
            ...
            ('2018_Q1', 'Government enterprises'),
            ('2018_Q2', 'Government enterprises'),
            ('2018_Q3', 'Government enterprises'),
            ('2018_Q4', 'Government enterprises'),
            ('2019_Q1', 'Government enterprises'),
            ('2019_Q2', 'Government enterprises'),
            ('2019_Q3', 'Government enterprises'),
            ('2019_Q4', 'Government enterprises'),
            ('2020_Q1', 'Government enterprises'),
            ('2

In [37]:
GDP_by_industry.drop(columns = ['period','Industry'],inplace=True)
GDP_by_industry.set_index(GDP_index, drop=True, inplace=True)
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,All industries,31917.80
2015_Q2,All industries,32266.20
2015_Q3,All industries,32406.60
2015_Q4,All industries,32298.70
2016_Q1,All industries,32303.80
...,...,...
2019_Q2,Government enterprises,371.40
2019_Q3,Government enterprises,373.50
2019_Q4,Government enterprises,375.10
2020_Q1,Government enterprises,372.80


In [38]:
GDP_by_industry.sort_values(by = ['period', 'Industry'], inplace=True)
GDP_by_industry

Unnamed: 0_level_0,Unnamed: 1_level_0,GDP
period,Industry,Unnamed: 2_level_1
2015_Q1,Accommodation,256.20
2015_Q1,Accommodation and food services,973.60
2015_Q1,Administrative and support services,795.70
2015_Q1,Administrative and waste management services,883.00
2015_Q1,"Agriculture, forestry, fishing, and hunting",466.30
...,...,...
2020_Q2,Warehousing and storage,135.10
2020_Q2,Waste management and remediation services,100.60
2020_Q2,Water transportation,32.00
2020_Q2,Wholesale trade,1810.90


## Bracket and Dot Notation

In [39]:
GDP_by_industry = pd.read_csv('Chapter5-Datasets/US_GDP_Industry.csv')
print(type(GDP_by_industry['GDP']),'\n',GDP_by_industry['GDP'])

<class 'pandas.core.series.Series'> 
 0      31,917.80
1      32,266.20
2      32,406.60
3      32,298.70
4      32,303.80
          ...   
2129      371.40
2130      373.50
2131      375.10
2132      372.80
2133      346.00
Name: GDP, Length: 2134, dtype: float64


In [40]:
print(type(GDP_by_industry.GDP),'\n',GDP_by_industry.GDP)

<class 'pandas.core.series.Series'> 
 0      31,917.80
1      32,266.20
2      32,406.60
3      32,298.70
4      32,303.80
          ...   
2129      371.40
2130      373.50
2131      375.10
2132      372.80
2133      346.00
Name: GDP, Length: 2134, dtype: float64


In [41]:
print(type(GDP_by_industry[['period','GDP']]),'\n',GDP_by_industry[['period','GDP']])

<class 'pandas.core.frame.DataFrame'> 
        period       GDP
0     2015_Q1 31,917.80
1     2015_Q2 32,266.20
2     2015_Q3 32,406.60
3     2015_Q4 32,298.70
4     2016_Q1 32,303.80
...       ...       ...
2129  2019_Q2    371.40
2130  2019_Q3    373.50
2131  2019_Q4    375.10
2132  2020_Q1    372.80
2133  2020_Q2    346.00

[2134 rows x 2 columns]


In [42]:
GDP_by_industry[3:10]

Unnamed: 0,period,Industry,GDP
3,2015_Q4,All industries,32298.7
4,2016_Q1,All industries,32303.8
5,2016_Q2,All industries,32696.4
6,2016_Q3,All industries,33070.7
7,2016_Q4,All industries,33457.8
8,2017_Q1,All industries,33984.6
9,2017_Q2,All industries,34167.9


## Exercise 5.03 - interger row numbers versus labels

In [43]:
sales_data=pd.DataFrame({'date':['2017-03-31','2017-06-30','2017-09-30', '2017-12-31', '2018-03-30', '2018-06-30', '2018-09-30', '2018-12-31','2019-03-31','2019-06-30', '2019-09-30','2019-12-31'], 'sales': [199190.4, 194356.6, 191611.7, 198918.9, 200163.2, 201510.2, 209749.8, 201897.8, 200098.8, 219340.3, 211542.5, 211729.1]})
sales_data

Unnamed: 0,date,sales
0,2017-03-31,199190.4
1,2017-06-30,194356.6
2,2017-09-30,191611.7
3,2017-12-31,198918.9
4,2018-03-30,200163.2
5,2018-06-30,201510.2
6,2018-09-30,209749.8
7,2018-12-31,201897.8
8,2019-03-31,200098.8
9,2019-06-30,219340.3


In [44]:
sales_data['month']=[sales_data.loc[i,'date'][5:7] for i in sales_data.index]
sales_data.set_index('month', drop=True, inplace=True)
sales_data

Unnamed: 0_level_0,date,sales
month,Unnamed: 1_level_1,Unnamed: 2_level_1
3,2017-03-31,199190.4
6,2017-06-30,194356.6
9,2017-09-30,191611.7
12,2017-12-31,198918.9
3,2018-03-30,200163.2
6,2018-06-30,201510.2
9,2018-09-30,209749.8
12,2018-12-31,201897.8
3,2019-03-31,200098.8
6,2019-06-30,219340.3


In [45]:
print('using .iloc with index 3: ', sales_data.iloc[3:])
print('using .loc with index 03: ', sales_data.loc['03',:])

using .iloc with index 3:               date      sales
month                       
12     2017-12-31 198,918.90
03     2018-03-30 200,163.20
06     2018-06-30 201,510.20
09     2018-09-30 209,749.80
12     2018-12-31 201,897.80
03     2019-03-31 200,098.80
06     2019-06-30 219,340.30
09     2019-09-30 211,542.50
12     2019-12-31 211,729.10
using .loc with index 03:               date      sales
month                       
03     2017-03-31 199,190.40
03     2018-03-30 200,163.20
03     2019-03-31 200,098.80


In [46]:
sales_data.groupby('month').mean()

Unnamed: 0_level_0,sales
month,Unnamed: 1_level_1
3,199817.47
6,205069.03
9,204301.33
12,204181.93


## Using Extended Indexing

In [47]:
GDP_by_industry[0:50:3]

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,31917.8
3,2015_Q4,All industries,32298.7
6,2016_Q3,All industries,33070.7
9,2017_Q2,All industries,34167.9
12,2018_Q1,All industries,35838.6
15,2018_Q4,All industries,37205.3
18,2019_Q3,All industries,37991.1
21,2020_Q2,All industries,34260.0
24,2015_Q3,Private industries,28826.0
27,2016_Q2,Private industries,29058.3


In [48]:
GDP_by_industry[::-1]

Unnamed: 0,period,Industry,GDP
2133,2020_Q2,Government enterprises,346.00
2132,2020_Q1,Government enterprises,372.80
2131,2019_Q4,Government enterprises,375.10
2130,2019_Q3,Government enterprises,373.50
2129,2019_Q2,Government enterprises,371.40
...,...,...,...
4,2016_Q1,All industries,32303.80
3,2015_Q4,All industries,32298.70
2,2015_Q3,All industries,32406.60
1,2015_Q2,All industries,32266.20


In [49]:
GDP_by_industry[100:50:-3]

Unnamed: 0,period,Industry,GDP
100,2018_Q1,"Forestry, fishing, and related activities",56.0
97,2017_Q2,"Forestry, fishing, and related activities",55.7
94,2016_Q3,"Forestry, fishing, and related activities",51.8
91,2015_Q4,"Forestry, fishing, and related activities",52.8
88,2015_Q1,"Forestry, fishing, and related activities",54.6
85,2019_Q4,Farms,405.9
82,2019_Q1,Farms,392.3
79,2018_Q2,Farms,405.0
76,2017_Q3,Farms,395.8
73,2016_Q4,Farms,375.1


### Type Exceptions

In [50]:
print(type(GDP_by_industry[['period','GDP']].iloc[0,:]),'\n', GDP_by_industry[['period','GDP']].iloc[0,:])

<class 'pandas.core.series.Series'> 
 period     2015_Q1
GDP      31,917.80
Name: 0, dtype: object


In [51]:
new_df = GDP_by_industry[0:1]
print(type(new_df),'\n',new_df,'\n')

<class 'pandas.core.frame.DataFrame'> 
     period        Industry       GDP
0  2015_Q1  All industries 31,917.80 



In [52]:
print(type(new_df[['period','GDP']]),'\n',new_df[['period','GDP']],'\n')

<class 'pandas.core.frame.DataFrame'> 
     period       GDP
0  2015_Q1 31,917.80 



In [53]:
print(type(new_df['GDP']),'\n',new_df['GDP'],'\n')
print(type(new_df.GDP),'\n', new_df.GDP,'\n')

<class 'pandas.core.series.Series'> 
 0   31,917.80
Name: GDP, dtype: float64 

<class 'pandas.core.series.Series'> 
 0   31,917.80
Name: GDP, dtype: float64 



In [54]:
print(type(GDP_by_industry.iloc[0,:].GDP),'\n', GDP_by_industry.iloc[0,:].GDP)

<class 'numpy.float64'> 
 31917.8


In [55]:
print(type(GDP_by_industry[0:1:1][['GDP','period']]))
print(type(GDP_by_industry[0:1:1][['GDP']]))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


## Changing DataFrame values using bracket or dot notation

In [56]:
GDP_2015=GDP_by_industry.loc[( (GDP_by_industry['period']=='2015_Q1') | (GDP_by_industry['period']=='2015_Q2') | (GDP_by_industry['period']=='2015_Q3') | (GDP_by_industry['period']=='2015_Q4')),['period','Industry','GDP']]
GDP_2015['GDP']=GDP_2015['GDP']+5000
GDP_2015

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,36917.80
1,2015_Q2,All industries,37266.20
2,2015_Q3,All industries,37406.60
3,2015_Q4,All industries,37298.70
22,2015_Q1,Private industries,33392.60
...,...,...,...
2093,2015_Q4,General government,7164.70
2112,2015_Q1,Government enterprises,5324.40
2113,2015_Q2,Government enterprises,5326.40
2114,2015_Q3,Government enterprises,5328.70


In [57]:
# This is supposed to not work
#GDP_2015[GDP_2015['GDP']>25000]['GDP']=0

In [58]:
GDP_2015.loc[GDP_2015.GDP>25000,'GDP']=0

In [59]:
GDP_2015

Unnamed: 0,period,Industry,GDP
0,2015_Q1,All industries,0.00
1,2015_Q2,All industries,0.00
2,2015_Q3,All industries,0.00
3,2015_Q4,All industries,0.00
22,2015_Q1,Private industries,0.00
...,...,...,...
2093,2015_Q4,General government,7164.70
2112,2015_Q1,Government enterprises,5324.40
2113,2015_Q2,Government enterprises,5326.40
2114,2015_Q3,Government enterprises,5328.70


## Exercise 5.04 - selecting data using bracket and dot notation

In [60]:
soybean_diseases=pd.read_csv('Chapter5-Datasets/soybean.csv')
soybean_diseases

Unnamed: 0,condition,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,...,int-discolor,sclerotia,fruit-pods,fruitspots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots
0,diaporthe-stem-canker,6.00,0.00,2.00,1.00,0.00,1.00,1.00,1.00,0.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00
1,diaporthe-stem-canker,4.00,0.00,2.00,1.00,0.00,2.00,0.00,2.00,1.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00
2,diaporthe-stem-canker,3.00,0.00,2.00,1.00,0.00,1.00,0.00,2.00,1.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00
3,diaporthe-stem-canker,3.00,0.00,2.00,1.00,0.00,1.00,0.00,2.00,0.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00
4,diaporthe-stem-canker,6.00,0.00,2.00,1.00,0.00,2.00,0.00,1.00,0.00,...,0.00,0.00,0.00,4.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,2-4-d-injury,,,,,,,,,,...,,,,,,,,,,1.00
303,herbicide-injury,1.00,1.00,,0.00,,1.00,0.00,,,...,,,3.00,,,,,,,1.00
304,herbicide-injury,0.00,1.00,,0.00,,0.00,3.00,,,...,,,3.00,,,,,,,1.00
305,herbicide-injury,1.00,1.00,,0.00,,0.00,0.00,,,...,,,3.00,,,,,,,1.00


In [61]:
print('there are', len(soybean_diseases['condition'].unique()), 'unique conditions', [soybean_diseases['condition'].unique() [i] for i in range(len(soybean_diseases['condition'].unique()))])

there are 19 unique conditions ['diaporthe-stem-canker', 'charcoal-rot', 'rhizoctonia-root-rot', 'phytophthora-rot', 'brown-stem-rot', 'powdery-mildew', 'downy-mildew', 'brown-spot', 'bacterial-blight', 'bacterial-pustule', 'purple-seed-stain', 'anthracnose', 'phyllosticta-leaf-spot', 'alternarialeaf-spot', 'frog-eye-leaf-spot', 'diaporthe-pod-&-stem-blight', 'cyst-nematode', '2-4-d-injury', 'herbicide-injury']


In [62]:
brown_spots = soybean_diseases.loc[soybean_diseases['condition']== 'brown-spot',:]
print('there are', brown_spots.shape[0], 'instances having brown-spot')

there are 40 instances having brown-spot


In [63]:
hail_related = soybean_diseases.loc[soybean_diseases['hail']==1, 'condition'].unique()
print('there are', len(hail_related),'conditions associated with hail damamge out of', len(soybean_diseases['condition'].unique()), 'total conditions')

there are 14 conditions associated with hail damamge out of 19 total conditions


In [64]:
hail_cases = soybean_diseases.loc[soybean_diseases['hail']==1,:]
print('there are', hail_cases.shape[0],'hail-related case out of', soybean_diseases.shape[0],'total cases')

there are 55 hail-related case out of 307 total cases


In [65]:
hail_cases.index

Int64Index([  7,  11,  13,  15,  16,  19,  22,  30,  43,  48,  55,  76,  77,
             80,  90,  91,  95, 100, 101, 102, 104, 110, 114, 116, 127, 128,
            133, 137, 142, 150, 151, 155, 157, 159, 161, 164, 166, 168, 170,
            172, 173, 176, 177, 181, 183, 185, 190, 195, 197, 201, 203, 205,
            206, 208, 214],
           dtype='int64')

In [66]:
for i in hail_cases.index:
    if hail_cases.loc[i, 'severity']==2:
        print('case',i,'with condition', hail_cases.loc[i,'condition'], 'is severe')

case 22 with condition rhizoctonia-root-rot is severe
case 43 with condition phytophthora-rot is severe
case 48 with condition phytophthora-rot is severe
case 55 with condition phytophthora-rot is severe
