In [13]:
import pandas as pd
import numpy as np

In [2]:
# Squares of the integers 0 through 5, held in a Series named 'squares'.
a = pd.Series([n ** 2 for n in range(6)], name='squares')

In [3]:
a

0     0
1     1
2     4
3     9
4    16
5    25
Name: squares, dtype: int64

In [4]:
# Extract the underlying data as a NumPy array. to_numpy() is the accessor
# the pandas documentation recommends over the older .values attribute.
a.to_numpy()

array([ 0,  1,  4,  9, 16, 25])

In [7]:
a.index

RangeIndex(start=0, stop=6, step=1)

The index labels of a pandas Series or DataFrame can be used to extract elements.

In [8]:
a[0]

0

In [9]:
a[2]

4

In [10]:
# Integer slices are end-exclusive: this returns the elements at 2 and 3, not 4.
a[2:4]

2    4
3    9
Name: squares, dtype: int64

In [11]:
# 2014 scores per language; values and index labels are passed as parallel
# lists, so the Series keeps exactly this ordering.
pop2014 = pd.Series(
    data=[100, 99.3, 95.5, 93.5, 92.4, 84.8, 84.5, 78.9, 74.3, 72.8],
    index=['Java', 'C', 'C++', 'Python', 'C#', 'PHP', 'JavaScript', 'Ruby', 'R', 'Matlab'],
)

In [12]:
# 2015 scores per language; the dict keys become the index labels.
pop2015 = pd.Series({
    'Java': 100,
    'C': 99.9,
    'C++': 99.4,
    'Python': 96.5,
    'C#': 91.3,
    'R': 84.8,
    'PHP': 84.5,
    'JavaScript': 83.0,
    'Ruby': 76.2,
    'Matlab': 72.4,
})

In [14]:
# Combine the two Series into one frame with a column per year. The Series
# list the languages in different orders; the constructor aligns them on
# their index labels, so each row pairs the 2014 and 2015 values of the
# same language.
twoyears = pd.DataFrame({'2014': pop2014, '2015': pop2015})
twoyears

Unnamed: 0,2014,2015
C,99.3,99.9
C#,92.4,91.3
C++,95.5,99.4
Java,100.0,100.0
JavaScript,84.5,83.0
Matlab,72.8,72.4
PHP,84.8,84.5
Python,93.5,96.5
R,74.3,84.8
Ruby,78.9,76.2


In [16]:
# Order the languages from highest to lowest 2015 score.
twoyears = twoyears.sort_values(by='2015', ascending=False)
twoyears

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9
C++,95.5,99.4
Python,93.5,96.5
C#,92.4,91.3
R,74.3,84.8
PHP,84.8,84.5
JavaScript,84.5,83.0
Ruby,78.9,76.2
Matlab,72.8,72.4


In [17]:
# Extract the frame's data as a 2-D NumPy array (one row per language).
# to_numpy() is what the pandas documentation recommends instead of .values.
twoyears.to_numpy()

array([[100. , 100. ],
       [ 99.3,  99.9],
       [ 95.5,  99.4],
       [ 93.5,  96.5],
       [ 92.4,  91.3],
       [ 74.3,  84.8],
       [ 84.8,  84.5],
       [ 84.5,  83. ],
       [ 78.9,  76.2],
       [ 72.8,  72.4]])

In [18]:
twoyears.index

Index(['Java', 'C', 'C++', 'Python', 'C#', 'R', 'PHP', 'JavaScript', 'Ruby',
       'Matlab'],
      dtype='object')

In [19]:
twoyears.columns

Index(['2014', '2015'], dtype='object')

Selecting a column from a DataFrame only requires putting the column name in square brackets. To select rows, it is best to use `loc` (selection by label) or `iloc` (selection by integer position).

In [20]:
twoyears['2015']

Java          100.0
C              99.9
C++            99.4
Python         96.5
C#             91.3
R              84.8
PHP            84.5
JavaScript     83.0
Ruby           76.2
Matlab         72.4
Name: 2015, dtype: float64

In [22]:
# iloc selects by integer position and excludes the end: rows 0 and 1 only.
twoyears.iloc[0:2]

Unnamed: 0,2014,2015
Java,100.0,100.0
C,99.3,99.9


In [23]:
# loc slices by label and includes BOTH endpoints, so 'Python' is part of
# the result. The rows returned follow the frame's current row order.
twoyears.loc['C':'Python']

Unnamed: 0,2014,2015
C,99.3,99.9
C++,95.5,99.4
Python,93.5,96.5


Creation of a new column from operations on other columns:

In [25]:
# Element-wise mean of the two yearly scores, stored as a new 'avg' column.
twoyears['avg'] = (twoyears['2014'] + twoyears['2015']) / 2
twoyears

Unnamed: 0,2014,2015,avg
Java,100.0,100.0,100.0
C,99.3,99.9,99.6
C++,95.5,99.4,97.45
Python,93.5,96.5,95.0
C#,92.4,91.3,91.85
R,74.3,84.8,79.55
PHP,84.8,84.5,84.65
JavaScript,84.5,83.0,83.75
Ruby,78.9,76.2,77.55
Matlab,72.8,72.4,72.6


In [28]:
# One row per president; the column labels are given once instead of being
# repeated as dict keys in every record.
presidents = pd.DataFrame(
    [
        ('Barack Obama', 2009, 1961),
        ('George W Bush', 2001, 1946),
        ('Bill Clinton', 1993, 1946),
        ('George H W Bush', 1989, 1924),
    ],
    columns=['name', 'inaguration', 'birthyear'],
)

In [29]:
presidents

Unnamed: 0,name,inaguration,birthyear
0,Barack Obama,2009,1961
1,George W Bush,2001,1946
2,Bill Clinton,1993,1946
3,George H W Bush,1989,1924


We can choose one of the columns to serve as the index. Let's choose the `name` column.

In [30]:
# set_index returns a new frame keyed by 'name'; `presidents` itself is left
# unchanged and still has 'name' as an ordinary column.
presidents_indexes = presidents.set_index('name')

In [31]:
presidents_indexes

Unnamed: 0_level_0,inaguration,birthyear
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,2009,1961
George W Bush,2001,1946
Bill Clinton,1993,1946
George H W Bush,1989,1924


In [32]:
presidents_indexes.loc['Bill Clinton']

inaguration    1993
birthyear      1946
Name: Bill Clinton, dtype: int64

In [37]:
# Look up a single cell with one .loc call (row label, column label) rather
# than chaining two [] lookups, which builds an intermediate Series first.
presidents_indexes.loc['Bill Clinton', 'inaguration']

1993

The same value can also be reached by selecting the column first and then the row label: `presidents_indexes['inaguration']['Bill Clinton']`.

# Join operations

In [38]:
# (son, father) pairs.
presidents_fathers = pd.DataFrame(
    [
        ('Barack Obama', 'Barack Obama Sr.'),
        ('George W Bush', 'George H W Bush'),
        ('George H W Bush', 'Prescott Bush'),
    ],
    columns=['son', 'father'],
)

In [39]:
# Inner join: keep only the rows whose 'name' matches a 'son' entry.
presidents.merge(presidents_fathers, how='inner', left_on='name', right_on='son')

Unnamed: 0,name,inaguration,birthyear,son,father
0,Barack Obama,2009,1961,Barack Obama,Barack Obama Sr.
1,George W Bush,2001,1946,George W Bush,George H W Bush
2,George H W Bush,1989,1924,George H W Bush,Prescott Bush


In [40]:
# Join on name == son, then discard 'son', which only repeats 'name'.
(
    presidents
    .merge(presidents_fathers, left_on='name', right_on='son')
    .drop(columns='son')
)

Unnamed: 0,name,inaguration,birthyear,father
0,Barack Obama,2009,1961,Barack Obama Sr.
1,George W Bush,2001,1946,George H W Bush
2,George H W Bush,1989,1924,Prescott Bush


The default (inner) join above dropped Bill Clinton because he has no matching row in `presidents_fathers`. We can keep him, with a null value in the `father` column, by specifying `how='left'`.

In [41]:
# Left join: every row of `presidents` is kept; rows with no match get a
# null 'father'. The right-hand key column 'son' is then dropped.
(
    presidents
    .merge(presidents_fathers, how='left', left_on='name', right_on='son')
    .drop(columns='son')
)

Unnamed: 0,name,inaguration,birthyear,father
0,Barack Obama,2009,1961,Barack Obama Sr.
1,George W Bush,2001,1946,George H W Bush
2,Bill Clinton,1993,1946,
3,George H W Bush,1989,1924,Prescott Bush
