In [2]:
import pandas as pd

In [3]:
# Toy account table: one row per customer, built from (name, account, balance) records.
account_info = pd.DataFrame(
    [
        ("Bob", 123846, 123),
        ("Mary", 123972, 3972),
        ("Mita", 347209, 7209),
    ],
    columns=["name", "account", "balance"],
)

In [4]:
# Selecting a single column by its label returns that column as a Series.
account_info["name"]

0     Bob
1    Mary
2    Mita
Name: name, dtype: object

In [5]:
# Overwrite the whole "name" column with a new list of values, one per row.
account_info["name"] = ["Smith", "Jane", "Patel"]

In [6]:
# Display the frame to confirm the "name" column now holds the new values.
account_info

Unnamed: 0,name,account,balance
0,Smith,123846,123
1,Jane,123972,3972
2,Patel,347209,7209


Example of creating a sub-DataFrame

In [7]:
# Rebuild the original table so this example does not depend on earlier edits.
account_info = pd.DataFrame(
    [
        ("Bob", 123846, 123),
        ("Mary", 123972, 3972),
        ("Mita", 347209, 7209),
    ],
    columns=["name", "account", "balance"],
)

# Indexing with a list of column labels returns a DataFrame holding just those columns.
account_info[["name", "balance"]]

Unnamed: 0,name,balance
0,Bob,123
1,Mary,3972
2,Mita,7209


The iloc method

In [8]:
# Fresh copy of the account table for the iloc examples.
account_info = pd.DataFrame(
    [
        ("Bob", 123846, 123),
        ("Mary", 123972, 3972),
        ("Mita", 347209, 7209),
    ],
    columns=["name", "account", "balance"],
)

# A single integer selects the row at that position and returns it as a Series.
account_info.iloc[1]

name         Mary
account    123972
balance      3972
Name: 1, dtype: object

In [9]:
# Slice by position: rows 0 and 1 are returned; the stop position (2) is excluded.
account_info.iloc[0:2]

Unnamed: 0,name,account,balance
0,Bob,123846,123
1,Mary,123972,3972


In [10]:
# A bare ":" slice selects every row.
account_info.iloc[:]

Unnamed: 0,name,account,balance
0,Bob,123846,123
1,Mary,123972,3972
2,Mita,347209,7209


`iloc` indexes a DataFrame by integer position. Inside the brackets, the first entry selects rows and the second entry selects columns.

In [11]:
# Scalar lookup: row position 1, column position 2 (the "balance" column).
account_info.iloc[1,2]

3972

In [13]:
# Assign through iloc to overwrite the single cell at row position 1, column position 2.
account_info.iloc[1,2] = 3975

In [14]:
# Read the same cell back to confirm the assignment took effect.
account_info.iloc[1,2]

3975

In [15]:
# All rows, but only the columns at positions 0 and 2 ("name" and "balance").
account_info.iloc[:, [0, 2]]

Unnamed: 0,name,balance
0,Bob,123
1,Mary,3975
2,Mita,7209


In [16]:
# Rebuild the table to discard the balance edit made in the previous cells.
account_info = pd.DataFrame(
    [
        ("Bob", 123846, 123),
        ("Mary", 123972, 3972),
        ("Mita", 347209, 7209),
    ],
    columns=["name", "account", "balance"],
)

`iloc` also accepts a Boolean array: the rows whose mask value is `True` are kept.

In [19]:
# Build a Boolean mask that is True for rows whose index value is odd,
# then hand it to iloc to keep only those rows.
odd_index_mask = account_info.index % 2 == 1
account_info.iloc[odd_index_mask]

Unnamed: 0,name,account,balance
1,Mary,123972,3972


The loc method

`loc` is similar to `iloc`, but it indexes into a DataFrame by row labels and column names instead of integer positions.

In [20]:
# Fresh copy of the account table for the loc examples.
account_info = pd.DataFrame(
    [
        ("Bob", 123846, 123),
        ("Mary", 123972, 3972),
        ("Mita", 347209, 7209),
    ],
    columns=["name", "account", "balance"],
)

# Label-based lookup: the row labelled 1 and the column named "balance".
account_info.loc[1, "balance"]

3972

In [21]:
# All rows, restricted to the columns named "name" and "balance".
account_info.loc[:, ["name", "balance"]]

Unnamed: 0,name,balance
0,Bob,123
1,Mary,3972
2,Mita,7209


Inner Merge

An inner merge is used when you want to find the intersection between two pandas DataFrames (Figure 2-1). In Listing 2-10, for example, we are trying to find the data that is present in both data sets — in this case, the buildings that were standing in 1844 and are still standing today.

In [23]:
# Buildings on record in 1844, indexed by building name.
building_records_1844 = pd.DataFrame(
    {
        "building": ["Grande Hotel", "Jone’s Farm", "Public Library", "Marietta House"],
        "established": [1830, 1842, 1836, 1823],
    }
).set_index("building")

print(building_records_1844)

                established
building                   
Grande Hotel           1830
Jone’s Farm            1842
Public Library         1836
Marietta House         1823


In [24]:
# Buildings on record in 2020, indexed by building name.
building_records_2020 = pd.DataFrame(
    {
        "building": ["Sam’s Bakery", "Grande Hotel", "Public Library", "Mayberry’s Factory"],
        "established": [1962, 1830, 1836, 1924],
    }
).set_index("building")

print(building_records_2020)

                    established
building                       
Sam’s Bakery               1962
Grande Hotel               1830
Public Library             1836
Mayberry’s Factory         1924


In [27]:
# Take from the 2020 frame only the columns the 1844 frame lacks, so the merged
# result does not carry two copies of a shared column. Both frames hold just
# "established", so here only the "building" index comes from the 2020 side.
cols = building_records_2020.columns.difference(building_records_1844.columns)

# The inner merge keeps only the buildings present in both frames.
surviving_buildings = building_records_1844.merge(
    building_records_2020[cols],
    how="inner",
    on=["building"],
)
print(surviving_buildings)


                established
building                   
Grande Hotel           1830
Public Library         1836


Outer Merge

In Listing 2-11, we are merging two data sets of gene samplings, meaning we want all the data from both in the same data set without duplication. We can achieve this by doing an outer merge.

In [28]:
# First gene sampling: FC1 / P1 measurements, indexed by gene id.
gene_group1 = pd.DataFrame(
    {
        "id": ["Myc", "BRCA1", "BRCA2"],
        "FC1": [2, 3, 8],
        "P1": [0.05, 0.01, 0.02],
    }
).set_index("id")

print(gene_group1)

       FC1    P1
id              
Myc      2  0.05
BRCA1    3  0.01
BRCA2    8  0.02


In [29]:
# Second gene sampling: FC2 / P2 measurements, indexed by gene id.
gene_group2 = pd.DataFrame(
    {
        "id": ["Myc", "BRCA1", "Notch1", "BRCA2"],
        "FC2": [2, 3, 2, 8],
        "P2": [0.05, 0.01, 0.03, 0.02],
    }
).set_index("id")

print(gene_group2)

        FC2    P2
id               
Myc       2  0.05
BRCA1     3  0.01
Notch1    2  0.03
BRCA2     8  0.02


In [30]:
# Outer merge on the shared "id" index: every gene from either group is kept,
# and measurements missing from one group are filled with NaN.
all_genes = gene_group1.merge(gene_group2, how="outer", on=["id"])
print(all_genes)

        FC1    P1  FC2    P2
id                          
Myc     2.0  0.05    2  0.05
BRCA1   3.0  0.01    3  0.01
BRCA2   8.0  0.02    8  0.02
Notch1  NaN   NaN    2  0.03
