# Chapter 6: Data Selection - Series

In [1]:
import pandas as pd

##  Introduction to pandas Series

### The Series Index

In [2]:
# Build a labelled Series: twelve monthly income values indexed by month abbreviation.
income = pd.Series(
    data=[100, 125, 105, 111, 275, 137, 99, 10, 250, 100, 175, 200],
    index=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    name='income',
)
income

Jan    100
Feb    125
Mar    105
Apr    111
May    275
Jun    137
Jul     99
Aug     10
Sep    250
Oct    100
Nov    175
Dec    200
Name: income, dtype: int64

In [3]:
# Show the same values with the month labels discarded in favour of a default 0..n-1 index.
# The result is only displayed; `income` itself is not reassigned here.
income.reset_index(drop=True)

0     100
1     125
2     105
3     111
4     275
5     137
6      99
7      10
8     250
9     100
10    175
11    200
Name: income, dtype: int64

## Data Selection in a pandas Series

### Brackets, dots, Series.loc, and Series.iloc

In [5]:
# Load only the CSV column at position 1 (usecols=[1]) and squeeze the resulting
# single-column DataFrame down to a Series, then confirm the type and preview it.
UK_energy = (
    pd.read_csv(
        'Chapter6-Datasets/UK_energy.csv',
        index_col=None,
        usecols=[1],
    )
    .squeeze("columns")
)
print(type(UK_energy))
print(UK_energy.head())

<class 'pandas.core.series.Series'>
0    288.177459
1    316.485721
2    338.565899
3    336.866984
4    332.844765
Name: annual_cost, dtype: float64


In [6]:
# Label-based selection: hand .loc a list of index labels.
print(
    'UK_energy.loc[[2,4,6]]\n\n',
    UK_energy.loc[[2, 4, 6]],
)

UK_energy.loc[[2,4,6]]

 2    338.565899
4    332.844765
6    341.909881
Name: annual_cost, dtype: float64


In [7]:
# Plain-bracket slice written as start:stop:step (start 2, stop 7, step 2).
print(
    'UK_energy[2:7:2]\n\n',
    UK_energy[2:7:2],
)

UK_energy[2:7:2]

 2    338.565899
4    332.844765
6    341.909881
Name: annual_cost, dtype: float64


In [8]:
# Plain brackets given a list of keys.
print(
    'UK_energy[[2,4,6]]\n\n',
    UK_energy[[2, 4, 6]],
)

UK_energy[[2,4,6]]

 2    338.565899
4    332.844765
6    341.909881
Name: annual_cost, dtype: float64


In [9]:
# Position-based selection: hand .iloc a list of integer positions.
# The printed label mirrors the expression actually evaluated.
print('UK_energy.iloc[[2,4,6]]\n\n', UK_energy.iloc[[2,4,6]])

UK_energy.iloc[[2,4,6]]

 2    338.565899
4    332.844765
6    341.909881
Name: annual_cost, dtype: float64


In [10]:
# Position-based slice with .iloc: start 2, stop 7 (exclusive), step 2.
# The printed label mirrors the expression actually evaluated (single brackets).
print('UK_energy.iloc[2:7:2]\n\n', UK_energy.iloc[2:7:2])

UK_energy.iloc[2:7:2]

 2    338.565899
4    332.844765
6    341.909881
Name: annual_cost, dtype: float64


In [11]:
# Swap the integer index for string labels 'year_1990' ... 'year_2019'.
UK_energy.index = [
    'year_' + str(year)
    for year in range(1990, 2020)
]
UK_energy.index

Index(['year_1990', 'year_1991', 'year_1992', 'year_1993', 'year_1994',
       'year_1995', 'year_1996', 'year_1997', 'year_1998', 'year_1999',
       'year_2000', 'year_2001', 'year_2002', 'year_2003', 'year_2004',
       'year_2005', 'year_2006', 'year_2007', 'year_2008', 'year_2009',
       'year_2010', 'year_2011', 'year_2012', 'year_2013', 'year_2014',
       'year_2015', 'year_2016', 'year_2017', 'year_2018', 'year_2019'],
      dtype='object')

In [12]:
# Attribute-style (dot) access: fetch the value stored under the 'year_1997' label.
UK_energy.year_1997

326.4184542

In [13]:
# Slice between two index labels; the displayed result runs from 'year_1997'
# through 'year_2011', i.e. both endpoints are included.
UK_energy['year_1997':'year_2011']

year_1997    326.418454
year_1998    306.393163
year_1999    295.687501
year_2000    290.333333
year_2001    283.333333
year_2002    281.666667
year_2003    283.666667
year_2004    291.666667
year_2005    323.666667
year_2006    382.000000
year_2007    423.111111
year_2008    487.333333
year_2009    498.666667
year_2010    484.000000
year_2011    523.181818
Name: annual_cost, dtype: float64

## Exercise 6.01 - basic Series data selection

In [15]:
# Read the patient file and collapse it to a Series via squeeze('columns').
BOLD = (
    pd.read_csv('Chapter6-Datasets/PLOS_BOLD_S1_patient_1.csv')
    .squeeze('columns')
)
BOLD

0      0.783670
1      0.293040
2      0.111169
3     -0.169703
4     -0.147029
         ...   
139    0.723983
140    0.687518
141    0.515671
142    0.432008
143    0.146747
Name: Y, Length: 144, dtype: float64

In [16]:
# Replace the default index with the even numbers 0, 2, 4, ... (one label per row).
BOLD.index = range(0, len(BOLD) * 2, 2)
BOLD

0      0.783670
2      0.293040
4      0.111169
6     -0.169703
8     -0.147029
         ...   
278    0.723983
280    0.687518
282    0.515671
284    0.432008
286    0.146747
Name: Y, Length: 144, dtype: float64

In [17]:
# Keep every second row of BOLD (step of 2 from the first row) and display the result.
B2=BOLD[::2]
B2

0      0.783670
4      0.111169
8     -0.147029
12    -0.032271
16    -0.202202
         ...   
268   -0.014538
272    0.180167
276    0.382172
280    0.687518
284    0.432008
Name: Y, Length: 72, dtype: float64

In [18]:
# Walk backwards from the end of B2 with a step of -1, stopping before position len(B2)-10.
# NOTE(review): the start position len(B2) is one past the last row, so the displayed
# result has 9 rows rather than 10 — confirm whether the last 10 rows were intended.
B2[len(B2):(len(B2)-10):-1]

284    0.432008
280    0.687518
276    0.382172
272    0.180167
268   -0.014538
264   -0.080900
260    0.069567
256    0.153728
252    0.220703
Name: Y, dtype: float64

## Preparing Series from DataFrames and vice versa

In [19]:
# Load the water-treatment data and substitute the sentinel -9999 for every missing value.
water_data = pd.read_csv('Chapter6-Datasets/water-treatment.csv').fillna(-9999)
water_data

Unnamed: 0,date,input_flow,input_Zinc,input_pH,input_BOD,input_COD,input_SS,input_VSS,input_SED,input_CON,...,output_COND,RD-DBO-P,RD-SS-P,RD-SED-P,RD-DBO-S,RD-DQO-S,RD-DBO-G,RD-DQO-G,RD-SS-G,RD-SED-G
0,1/1/1990,41230.0,0.35,7.6,120.0,344.0,136.0,54.4,4.5,993,...,903.0,-9999.0,62.8,93.3,-9999.0,62.5,86.7,71.8,87.5,99.4
1,1/2/1990,37386.0,1.40,7.9,165.0,470.0,170.0,76.5,4.0,1365,...,1481.0,-9999.0,50.0,94.4,85.9,73.6,86.7,79.4,89.4,100.0
2,1/3/1990,34535.0,1.00,7.8,232.0,518.0,220.0,65.5,5.5,1617,...,1492.0,32.6,62.4,95.0,81.3,59.9,87.5,71.8,85.9,99.8
3,1/4/1990,32527.0,3.00,7.8,187.0,460.0,180.0,67.8,5.2,1832,...,1590.0,13.2,57.6,95.5,85.3,70.4,85.0,77.2,83.3,100.0
4,1/7/1990,27760.0,1.20,7.6,199.0,466.0,186.0,74.2,4.5,1220,...,1411.0,38.2,46.6,95.0,84.9,61.1,89.4,73.8,86.6,99.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,10/25/1991,35400.0,0.70,7.6,156.0,364.0,194.0,63.9,5.5,1680,...,1840.0,47.3,61.3,94.0,76.4,-9999.0,86.5,82.4,90.7,99.8
523,10/26/1991,30964.0,3.30,7.7,220.0,540.0,184.0,62.0,3.5,1445,...,1337.0,-9999.0,38.6,93.3,-9999.0,87.0,92.7,95.0,91.8,95.7
524,10/27/1991,35573.0,7.30,7.6,176.0,333.0,178.0,64.0,3.5,1627,...,1799.0,-9999.0,40.4,95.0,-9999.0,72.9,90.9,79.9,-9999.0,98.6
525,10/29/1991,29801.0,1.60,7.7,172.0,400.0,136.0,70.1,1.5,1402,...,1468.0,32.4,40.4,88.0,87.8,77.6,91.3,85.3,83.8,96.7


In [20]:
# Promote the 'date' column to the row index (the column itself is dropped from the data).
water_data = water_data.set_index('date')

In [21]:
# Select a single column from the DataFrame by name and check what type comes back.
type(water_data['input_flow'])

pandas.core.series.Series

In [22]:
# Keep the rows whose input_pH is below 7.5, then pull out that pH column as a Series.
# NOTE(review): missing values were filled with -9999 when water_data was loaded, and
# -9999 is also below 7.5 — confirm rows with a missing input_pH should not be excluded.
acidity = water_data.loc[water_data['input_pH'].lt(7.5), :]
pH = acidity['input_pH']

In [23]:
# Wrap the pH Series in a one-column DataFrame whose column is named 'pH'.
pH_data = pd.DataFrame(data={'pH': pH})
pH_data.head()

Unnamed: 0_level_0,pH
date,Unnamed: 1_level_1
3/20/1990,7.4
4/13/1990,7.2
6/4/1990,7.3
6/8/1990,7.4
7/1/1990,7.3


In [24]:
# Swap the date index for a default integer index, discarding the dates.
pH_data = pH_data.reset_index(drop=True)

In [29]:
# Combine two Series into one DataFrame. pH_data['pH'] carries an integer index while
# acidity['input_flow'] is still indexed by date, so the result holds the union of both
# sets of labels and every row has a NaN in one of the two columns.
flow_hypothesis = pd.DataFrame(
    data={
        'pH': pH_data['pH'],
        'flow': acidity['input_flow'],
    }
)
print(flow_hypothesis.head())
print(flow_hypothesis.tail())

    pH  flow
0  7.4   NaN
1  7.2   NaN
2  7.3   NaN
3  7.4   NaN
4  7.3   NaN
           pH     flow
8/21/1990 NaN  34352.0
8/24/1990 NaN  32802.0
8/28/1991 NaN  32922.0
8/29/1991 NaN  32190.0
8/4/1991  NaN  24978.0


In [31]:
# Print the row labels of both objects to see why the columns did not line up.
print(pH_data.index.tolist())
print(acidity.index)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]
Index(['3/20/1990', '4/13/1990', '6/4/1990', '6/8/1990', '7/1/1990',
       '7/23/1990', '7/29/1990', '8/21/1990', '8/24/1990', '10/7/1990',
       '3/26/1991', '4/12/1991', '5/9/1991', '5/23/1991', '6/14/1991',
       '6/24/1991', '7/1/1991', '7/5/1991', '7/19/1991', '7/21/1991',
       '7/30/1991', '8/1/1991', '8/4/1991', '8/18/1991', '8/28/1991',
       '8/29/1991', '10/5/1991'],
      dtype='object', name='date')


In [32]:
# Give acidity the same integer index as pH_data so the two columns match row by row,
# then rebuild the combined frame.
acidity = acidity.set_index(pH_data.index)
flow_hypothesis = pd.DataFrame(
    data={
        'pH': pH_data['pH'],
        'flow': acidity['input_flow'],
    }
)
print(flow_hypothesis.head())
print(flow_hypothesis.tail())

    pH     flow
0  7.4  39165.0
1  7.2  34667.0
2  7.3  51520.0
3  7.4  35789.0
4  7.3  30201.0
     pH     flow
22  7.3  24978.0
23  7.3  27527.0
24  7.4  32922.0
25  7.3  32190.0
26  7.3  33695.0


In [35]:
# Start again from water_data: both Series are taken from the same filtered frame, so
# they share its date index and combine without any realignment.
# NOTE(review): missing values were filled with -9999 when water_data was loaded, and
# -9999 is also below 7.5 — confirm rows with a missing input_pH should not be excluded.
acidity = water_data.loc[water_data['input_pH'].lt(7.5), :]
pH = acidity['input_pH']
flow = acidity['input_flow']
flow_hypothesis = pd.DataFrame(data={'pH': pH, 'flow': flow})
print(flow_hypothesis.head())
print(flow_hypothesis.tail())

            pH     flow
date                   
3/20/1990  7.4  39165.0
4/13/1990  7.2  34667.0
6/4/1990   7.3  51520.0
6/8/1990   7.4  35789.0
7/1/1990   7.3  30201.0
            pH     flow
date                   
8/4/1991   7.3  24978.0
8/18/1991  7.3  27527.0
8/28/1991  7.4  32922.0
8/29/1991  7.3  32190.0
10/5/1991  7.3  33695.0
