In [1]:
import pandas as pd
counts = pd.Series([632, 1638,456,112])
print(counts)

0     632
1    1638
2     456
3     112
dtype: int64


In [2]:
counts.values

array([ 632, 1638,  456,  112], dtype=int64)

In [3]:
counts.index

RangeIndex(start=0, stop=4, step=1)

In [4]:
bacteria = pd.Series([632, 1638,569,115],
                index=['Firmicutes','Proteobacteria', 'Actinobacteria','Bacteroidetes'])
print(bacteria)

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64


In [5]:
bacteria['Actinobacteria']

569

In [6]:
bacteria[bacteria.index.str.endswith('bacteria')]

Proteobacteria    1638
Actinobacteria     569
dtype: int64

In [7]:
'Bacteroidetes' in bacteria

True

In [8]:
bacteria[0]

632

In [9]:
bacteria.name = 'counts'
bacteria.index.name = 'phylum'
print(bacteria)

phylum
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
Name: counts, dtype: int64


In [10]:
import numpy as np
np.log(bacteria)

phylum
Firmicutes        6.448889
Proteobacteria    7.401231
Actinobacteria    6.343880
Bacteroidetes     4.744932
Name: counts, dtype: float64

In [11]:
bacteria[bacteria>1000]

phylum
Proteobacteria    1638
Name: counts, dtype: int64

In [12]:
bacteria_dict = {'Firmicutes': 632,'Proteobacteria': 1632, 'Actinobacteria': 569,
                 'Bacteroidetes': 115}

In [13]:
bact = pd.Series(bacteria_dict)
print(bact)

Firmicutes         632
Proteobacteria    1632
Actinobacteria     569
Bacteroidetes      115
dtype: int64


In [14]:
bacteria2 = pd.Series(bacteria_dict,
                     index=['Cyanobacteria','Firmicutes','Proteobacteria','Actinobacteria'])
print(bacteria2)

Cyanobacteria        NaN
Firmicutes         632.0
Proteobacteria    1632.0
Actinobacteria     569.0
dtype: float64


In [15]:
bacteria2.isnull()

Cyanobacteria      True
Firmicutes        False
Proteobacteria    False
Actinobacteria    False
dtype: bool

In [16]:
bacteria + bacteria2

Actinobacteria    1138.0
Bacteroidetes        NaN
Cyanobacteria        NaN
Firmicutes        1264.0
Proteobacteria    3270.0
dtype: float64

In [17]:
bacteria_data = pd.DataFrame({'value':[632, 1638, 569,115,433,1130,754,555],
                            'patient':[1,1,1,1,2,2,2,2],
                            'phylum':['Firmicutes', 'Proteobacteria', 'Actinobacteria',
                                     'Bacteroidetes', 'Firmicutes','Proteobacteria',
                                     'Actinobacteria', 'Bacteroidetes']})
bacteria_data

Unnamed: 0,value,patient,phylum
0,632,1,Firmicutes
1,1638,1,Proteobacteria
2,569,1,Actinobacteria
3,115,1,Bacteroidetes
4,433,2,Firmicutes
5,1130,2,Proteobacteria
6,754,2,Actinobacteria
7,555,2,Bacteroidetes


In [18]:
bacteria_data[['phylum','value','patient']]

Unnamed: 0,phylum,value,patient
0,Firmicutes,632,1
1,Proteobacteria,1638,1
2,Actinobacteria,569,1
3,Bacteroidetes,115,1
4,Firmicutes,433,2
5,Proteobacteria,1130,2
6,Actinobacteria,754,2
7,Bacteroidetes,555,2


In [19]:
bacteria_data.columns

Index(['value', 'patient', 'phylum'], dtype='object')

In [20]:
bacteria_data['value']

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64

In [21]:
type(bacteria_data['value'])

pandas.core.series.Series

In [22]:
bacteria_data['value']

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64

In [23]:
bacteria_data.loc[3]

value                115
patient                1
phylum     Bacteroidetes
Name: 3, dtype: object

In [24]:
bacteria_data[['value']]

Unnamed: 0,value
0,632
1,1638
2,569
3,115
4,433
5,1130
6,754
7,555


In [25]:
bacteria_data.head()

Unnamed: 0,value,patient,phylum
0,632,1,Firmicutes
1,1638,1,Proteobacteria
2,569,1,Actinobacteria
3,115,1,Bacteroidetes
4,433,2,Firmicutes


In [26]:
bacteria_data.tail(3)

Unnamed: 0,value,patient,phylum
5,1130,2,Proteobacteria
6,754,2,Actinobacteria
7,555,2,Bacteroidetes


In [27]:
bacteria_data.shape

(8, 3)

In [28]:
bacteria_data = pd.DataFrame([{'patient':1,'phylum': 'Firmicutes','value':632},
                              {'patient':1,'phylum': 'Proteobacteria','value':1638},
                              {'patient':1,'phylum': 'Actinobacteria','value':569}, 
                              {'patient':1,'phylum': 'Bacteroidetes','value':115},  
                              {'patient':2,'phylum': 'Firmicutes','value':433},
                              {'patient':2,'phylum': 'Proteobacteria','value':1130},
                              {'patient':2,'phylum': 'Bacteroidetes','value':754},  
                              {'patient':2,'phylum': 'Firmicutes','value':555}])

In [29]:
bacteria_data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Bacteroidetes,754
7,2,Firmicutes,555


In [30]:
vals = bacteria_data.value
vals

0     632
1    1638
2     569
3     115
4     433
5    1130
6     754
7     555
Name: value, dtype: int64

In [31]:
vals[5] = 0
vals

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0     632
1    1638
2     569
3     115
4     433
5       0
6     754
7     555
Name: value, dtype: int64

In [32]:
bacteria_data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,0
6,2,Bacteroidetes,754
7,2,Firmicutes,555


In [33]:
vals = bacteria_data.value.copy()
vals[5] = 1000

In [34]:
bacteria_data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,0
6,2,Bacteroidetes,754
7,2,Firmicutes,555


In [35]:
bacteria_data.value[5] = 1130

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [36]:
bacteria_data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Bacteroidetes,754
7,2,Firmicutes,555


In [37]:
bacteria_data['year'] = 2013
bacteria_data

Unnamed: 0,patient,phylum,value,year
0,1,Firmicutes,632,2013
1,1,Proteobacteria,1638,2013
2,1,Actinobacteria,569,2013
3,1,Bacteroidetes,115,2013
4,2,Firmicutes,433,2013
5,2,Proteobacteria,1130,2013
6,2,Bacteroidetes,754,2013
7,2,Firmicutes,555,2013


In [38]:
bacteria_data.phylum

0        Firmicutes
1    Proteobacteria
2    Actinobacteria
3     Bacteroidetes
4        Firmicutes
5    Proteobacteria
6     Bacteroidetes
7        Firmicutes
Name: phylum, dtype: object

In [39]:
treatment = pd.Series([0]*4 + [1]*2)
treatment

0    0
1    0
2    0
3    0
4    1
5    1
dtype: int64

In [40]:
bacteria_data['treatment'] = treatment
bacteria_data

Unnamed: 0,patient,phylum,value,year,treatment
0,1,Firmicutes,632,2013,0.0
1,1,Proteobacteria,1638,2013,0.0
2,1,Actinobacteria,569,2013,0.0
3,1,Bacteroidetes,115,2013,0.0
4,2,Firmicutes,433,2013,1.0
5,2,Proteobacteria,1130,2013,1.0
6,2,Bacteroidetes,754,2013,
7,2,Firmicutes,555,2013,


In [41]:
bacteria_data.values

array([[1, 'Firmicutes', 632, 2013, 0.0],
       [1, 'Proteobacteria', 1638, 2013, 0.0],
       [1, 'Actinobacteria', 569, 2013, 0.0],
       [1, 'Bacteroidetes', 115, 2013, 0.0],
       [2, 'Firmicutes', 433, 2013, 1.0],
       [2, 'Proteobacteria', 1130, 2013, 1.0],
       [2, 'Bacteroidetes', 754, 2013, nan],
       [2, 'Firmicutes', 555, 2013, nan]], dtype=object)

In [42]:
df = pd.DataFrame({'foo':[1,2,3], 'bar':[0.4, -1.0, 4.5]})
df.values, df.values.dtype

(array([[ 1. ,  0.4],
        [ 2. , -1. ],
        [ 3. ,  4.5]]), dtype('float64'))

In [43]:
bacteria_data.index[0] = 15

TypeError: Index does not support mutable operations

In [44]:
bacteria2.index = bacteria.index
bacteria2

phylum
Firmicutes           NaN
Proteobacteria     632.0
Actinobacteria    1632.0
Bacteroidetes      569.0
dtype: float64

In [59]:
#with open('olympics.2018.txt') as f:
#    medals = pd.read_table(f, sep='\t', index_col = 0, header = None, lineterminator='\n')
medals = pd.read_table('olympics.2018.txt', sep='\t', index_col=0, header=None, names=['country', 'medals','population'])
medals.head()

Unnamed: 0_level_0,medals,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Tonga,1,96165
Bahamas,1,281584
Jamaica,6,2589043
Cuba,25,10952046
Australia,41,18348078


In [46]:
oecd_site = 'http://www.oecd.org/about/membersandpartners/list-oecd-member-countries.htm'

In [47]:
pd.read_html(oecd_site)

[                                                   0
 0  On 14 December 1960, 20Â countries originally s...
 1  Here is a list of the current Member countries...,
                                      0                1                  2   3
 0                                  NaN          Country               Date NaN
 1                                  NaN        AUSTRALIA        7 June 1971 NaN
 2                                  NaN          AUSTRIA  29 September 1961 NaN
 3                                  NaN          BELGIUM  13 September 1961 NaN
 4                                  NaN           CANADA      10 April 1961 NaN
 5                                  NaN            CHILE         7 May 2010 NaN
 6                                  NaN   CZECH REPUBLIC   21 December 1995 NaN
 7                                  NaN          DENMARK        30 May 1961 NaN
 8                                  NaN          ESTONIA    9 December 2010 NaN
 9                                  

In [48]:
oecd = pd.read_html(oecd_site, header=0)[1][['Country', 'Date']]
oecd.head()

Unnamed: 0,Country,Date
0,AUSTRALIA,7 June 1971
1,AUSTRIA,29 September 1961
2,BELGIUM,13 September 1961
3,CANADA,10 April 1961
4,CHILE,7 May 2010


In [49]:
oecd['year'] = pd.to_datetime(oecd.Date).apply(lambda x: x.year)
oecd_year = oecd.set_index(oecd.Country.str.title())['year'].dropna()
oecd_year

Country
Australia          1971.0
Austria            1961.0
Belgium            1961.0
Canada             1961.0
Chile              2010.0
Czech Republic     1995.0
Denmark            1961.0
Estonia            2010.0
Finland            1969.0
France             1961.0
Germany            1961.0
Greece             1961.0
Hungary            1996.0
Iceland            1961.0
Ireland            1961.0
Israel             2010.0
Italy              1962.0
Japan              1964.0
Korea              1996.0
Latvia             2016.0
Lithuania          2018.0
Luxembourg         1961.0
Mexico             1994.0
Netherlands        1961.0
New Zealand        1973.0
Norway             1961.0
Poland             1996.0
Portugal           1961.0
Slovak Republic    2000.0
Slovenia           2010.0
Spain              1961.0
Sweden             1961.0
Switzerland        1961.0
Turkey             1961.0
United Kingdom     1961.0
United States      1961.0
Name: year, dtype: float64

In [61]:
medals_data = medals.assign(oecd=medals.index.isin((oecd_year[oecd_year<1997]).index).astype(int))

In [62]:
medals_data = medals_data.assign(log_population=np.log(medals.population))

In [63]:
medals_data.head()

Unnamed: 0_level_0,medals,population,oecd,log_population
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tonga,1,96165,0,11.473821
Bahamas,1,281584,0,12.548186
Jamaica,6,2589043,0,14.766799
Cuba,25,10952046,0,16.209037
Australia,41,18348078,1,16.725035


In [64]:
mb = pd.read_csv("microbiome.csv")

In [65]:
pd.read_csv("microbiome.csv", skiprows=[3,4,6]).head()

Unnamed: 0,Taxon,Patient,Tissue,Stool
0,Firmicutes,1,632,305
1,Firmicutes,2,136,4182
2,Firmicutes,5,831,8605
3,Firmicutes,7,718,717
4,Firmicutes,8,173,33


In [66]:
few_recs = pd.read_csv("microbiome.csv", nrows=4)

In [67]:
few_recs

Unnamed: 0,Taxon,Patient,Tissue,Stool
0,Firmicutes,1,632,305
1,Firmicutes,2,136,4182
2,Firmicutes,3,1174,703
3,Firmicutes,4,408,3946


In [68]:
data_chunks = pd.read_csv("microbiome.csv", chunksize=5)
data_chunks

<pandas.io.parsers.TextFileReader at 0x17210ae96a0>

In [69]:
next(data_chunks)

Unnamed: 0,Taxon,Patient,Tissue,Stool
0,Firmicutes,1,632,305
1,Firmicutes,2,136,4182
2,Firmicutes,3,1174,703
3,Firmicutes,4,408,3946
4,Firmicutes,5,831,8605


In [70]:
next(data_chunks)

Unnamed: 0,Taxon,Patient,Tissue,Stool
5,Firmicutes,6,693,50
6,Firmicutes,7,718,717
7,Firmicutes,8,173,33
8,Firmicutes,9,228,80
9,Firmicutes,10,162,3196


In [71]:
mb=pd.read_csv("microbiome.csv", index_col=['Taxon','Patient'])
mb.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tissue,Stool
Taxon,Patient,Unnamed: 2_level_1,Unnamed: 3_level_1
Firmicutes,1,632,305
Firmicutes,2,136,4182
Firmicutes,3,1174,703
Firmicutes,4,408,3946
Firmicutes,5,831,8605


In [72]:
mb.index

MultiIndex(levels=[['Firmicutes'], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]],
           labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]],
           names=['Taxon', 'Patient'])

In [73]:
mb.loc[('Firmicutes',2)]

Tissue     136
Stool     4182
Name: (Firmicutes, 2), dtype: int64

In [74]:
mb.xs(1,level='Patient')

Unnamed: 0_level_0,Tissue,Stool
Taxon,Unnamed: 1_level_1,Unnamed: 2_level_1
Firmicutes,632,305


In [75]:
mb.swaplevel('Patient', 'Taxon').head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Tissue,Stool
Patient,Taxon,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Firmicutes,632,305
2,Firmicutes,136,4182
3,Firmicutes,1174,703
4,Firmicutes,408,3946
5,Firmicutes,831,8605
