# Pandas DataFrame operations

This is a reference notebook for useful operations on python pandas package's dataframe data type. 



In [11]:
import pandas as pd

## Initialization

In [12]:
new_df = pd.DataFrame()

#### Read Excel, way 1
This is a little faster, than way2

In [13]:
excel_obj = pd.ExcelFile("/Users/korolo/data/cssp/fungal_synonyms/Amy_ICTF/pathogenic_fungi_ICTF.xlsx")
excel_df = excel_obj.parse('original_data')  # sheet name

#### Read Excel, way 2:
A little slower than way1

In [14]:
df = pd.read_excel('/Users/korolo/data/cssp/fungal_synonyms/Amy_ICTF/pathogenic_fungi_ICTF.xlsx', sheet_name='original_data')

#### Read json from URL, way 1
This is brittle, since it's using local solr url as an example

In [15]:
query_url = 'http://localhost:8983/solr/CFIA_all/select?fl=id&q=title:grain'

In [16]:
df2 = pd.read_json(query_url)
df2.head()

URLError: <urlopen error [Errno 61] Connection refused>

#### Read json from url, way 1

In [17]:
import requests

query_url = 'http://localhost:8983/solr/CFIA_all/select?fl=id&q=title:grain'
r = requests.get(query_url)
query_response_df = pd.DataFrame(r.json()['response']['docs'])
query_response_df.head()

ConnectionError: HTTPConnectionPool(host='localhost', port=8983): Max retries exceeded with url: /solr/CFIA_all/select?fl=id&q=title:grain (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x10dedcac8>: Failed to establish a new connection: [Errno 61] Connection refused',))

#### Read json from url, way 2 (no requests)
Note that this method does not use request or any other library, but pandas. The downside is that it reads into dataframe exactly what the url returned, so headers and everything. If you need to pre-process the json before, then the requests way above is a better option.

In [18]:
df2 = pd.read_json(query_url)
df2

URLError: <urlopen error [Errno 61] Connection refused>

## Exploring /viewing

In [19]:
pd.set_option('display.max_rows', 150)
excel_df

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
0,Kingdom Fungi,,,
1,Chytridiomycota,,,
2,Blastocladiales,,,
3,Physodermataceae,,,
4,Physoderma,Physoderma alfalfae (Pat. & Lagerh.) Karling 1950,crown wart of alfalfa,
5,,≡ Cladochytrium alfalfae Pat. & Lagerh. 1895,,
6,,≡ Urophlyctis alfalfae (Pat. & Lagerh.) Magnus...,,
7,,Physoderma maydis (Miyabe) Miyabe 1909,brown spot of maize,
8,,≡ Cladochytrium maydis Miyabe 1903,,
9,,= Physoderma zeae-maydis F.J. Shaw 1912,,


In [20]:
excel_df.columns

Index(['GENERA', 'SPECIES NAME AND SYNONYMYS', 'DISEASE', 'REFERENCES'], dtype='object')

In [21]:
excel_df.dtypes

GENERA                        object
SPECIES NAME AND SYNONYMYS    object
DISEASE                       object
REFERENCES                    object
dtype: object

In [22]:
excel_df.shape

(2619, 4)

In [23]:
excel_df.describe()

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
count,577,2313.0,469,39
unique,568,2302.0,458,31
top,Eremothecium,,heart rot,"De Beer, Z.W., Duong, T.A., Barnes, I., Wingfi..."
freq,2,3.0,3,5


In [24]:
excel_df.head()   #excel_df.head(10)

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
0,Kingdom Fungi,,,
1,Chytridiomycota,,,
2,Blastocladiales,,,
3,Physodermataceae,,,
4,Physoderma,Physoderma alfalfae (Pat. & Lagerh.) Karling 1950,crown wart of alfalfa,


In [25]:
excel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2619 entries, 0 to 2618
Data columns (total 4 columns):
GENERA                        577 non-null object
SPECIES NAME AND SYNONYMYS    2313 non-null object
DISEASE                       469 non-null object
REFERENCES                    39 non-null object
dtypes: object(4)
memory usage: 81.9+ KB


In [26]:
#### Slicing and dicing

In [29]:
excel_df[:3] # first rows

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
0,Kingdom Fungi,,,
1,Chytridiomycota,,,
2,Blastocladiales,,,


In [31]:
excel_df[-4:] # last rows

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
2615,,≡ Sorosporium scabies (Berk.) A.A. Fisch. Wald...,,
2616,,= Spongospora solani Brunch. 1887,,
2617,,= Protomyces tuber-solani Mart. 1842,,
2618,"I’ve just reduced all to the order level, rat...",,,


In [32]:
excel_df[3:5]  #from to

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
3,Physodermataceae,,,
4,Physoderma,Physoderma alfalfae (Pat. & Lagerh.) Karling 1950,crown wart of alfalfa,


## Add data to df

In [35]:
df = pd.DataFrame()
data = pd.DataFrame({"A": range(3)})
df.append(data)

Unnamed: 0,A
0,0
1,1
2,2


In [36]:
df = pd.DataFrame()
df = df.append({'name': 'Zed', 'age': 9, 'height': 2}, ignore_index=True)
df

Unnamed: 0,age,height,name
0,9.0,2.0,Zed


#### Add column

In [6]:
#df['new_col'] = df['old_col']**2

#### Delete column

In [None]:
# del df['col_to_delete']

#### Add row

In [4]:
#df.loc[3] = [1,2,3,4]  # Adds a row of ints 1,2,3,4 as a 4th row in the df. df should contain 4 columns

#### Delete row

In [7]:
# df.drop(df.index[[2]])  # drops 3rd row

#### Group by

In [None]:
#pl.groupby('some_val').mean()

## Working with

Note that adding one row at a time to df is computationally expensive, so better to use other datastructures to collect data and add it all in one go.

In [37]:
excel_df[:5]

Unnamed: 0,GENERA,SPECIES NAME AND SYNONYMYS,DISEASE,REFERENCES
0,Kingdom Fungi,,,
1,Chytridiomycota,,,
2,Blastocladiales,,,
3,Physodermataceae,,,
4,Physoderma,Physoderma alfalfae (Pat. & Lagerh.) Karling 1950,crown wart of alfalfa,


### Create a new data frame with a subset of two columns from the original

In [38]:
species_disease_df = excel_df[['SPECIES NAME AND SYNONYMYS','DISEASE']]
species_disease_df

Unnamed: 0,SPECIES NAME AND SYNONYMYS,DISEASE
0,,
1,,
2,,
3,,
4,Physoderma alfalfae (Pat. & Lagerh.) Karling 1950,crown wart of alfalfa
5,≡ Cladochytrium alfalfae Pat. & Lagerh. 1895,
6,≡ Urophlyctis alfalfae (Pat. & Lagerh.) Magnus...,
7,Physoderma maydis (Miyabe) Miyabe 1909,brown spot of maize
8,≡ Cladochytrium maydis Miyabe 1903,
9,= Physoderma zeae-maydis F.J. Shaw 1912,


#### Create a new series from one column of a dataframe

In [39]:
species_sr = excel_df['SPECIES NAME AND SYNONYMYS']
species_sr

0                                                     NaN
1                                                     NaN
2                                                     NaN
3                                                     NaN
4       Physoderma alfalfae (Pat. & Lagerh.) Karling 1950
5           ≡ Cladochytrium alfalfae Pat. & Lagerh. 1895 
6       ≡ Urophlyctis alfalfae (Pat. & Lagerh.) Magnus...
7                  Physoderma maydis (Miyabe) Miyabe 1909
8                     ≡ Cladochytrium maydis Miyabe 1903 
9                = Physoderma zeae-maydis F.J. Shaw 1912 
10                                                    NaN
11                                                    NaN
12      Synchytrium endobioticum (Schilb.) Percival 19...
13            ≡ Chrysophlyctis endobioticum Schilb. 1896 
14                      = Synchytrium solani Massee 1910 
15               Synchytrium dolichi (Cooke) Gaeum. 1927 
16                         ≡ Aecidium dolichi Cooke 1882 
17            

In [40]:
arr = []
arr.append([])
arr[0].append('aa1')
arr[0].append('aa2')
arr

[['aa1', 'aa2']]

In [41]:
import numpy as np

In [42]:
df = np.DataFrame(arr)

AttributeError: module 'numpy' has no attribute 'DataFrame'

## Duplicates

In [48]:
df = pd.read_excel("test_data/duplicates.xlsx")
df

Unnamed: 0,taxon_name,current_taxon_name,authors,source
0,Abortiporus biennis,-,(Bull.) Singer,MycoBank
1,Polyporus biennis,Abortiporus biennis,(Bulliard) Fries,MycoBank
2,Absidia anomala,-,Hesseltine & J.J. Ellis,MycoBank
3,Absidia anomala,,H. Naganishi & Hirahara,IndexFungorum
4,Absidia blakesleeana,Lichtheimia hyalospora,Lendner,MycoBank
5,Absidia californica,Absidia californica,J.J. Ellis & Hesseltine,MycoBank
6,Abortiporus biennis,Absidia coerulea,Bainier,IndexFungorum
7,Absidia corymbifera,Lichtheimia corymbifera,(Cohn) Saccardo & Trotter,MycoBank
8,Absidia cuneospora,Absidia cuneospora,G.F. Orr & Plunkett,MycoBank
9,Absidia cylindrospora var. cylindrospora,Absidia cylindrospora var. cylindrospora,,MycoBank


In [49]:
df_dupl = df.duplicated('taxon_name', keep=False)

In [50]:
df.insert(loc=4, column='taxon_name duplicated', value=df_dupl)
df

Unnamed: 0,taxon_name,current_taxon_name,authors,source,taxon_name duplicated
0,Abortiporus biennis,-,(Bull.) Singer,MycoBank,True
1,Polyporus biennis,Abortiporus biennis,(Bulliard) Fries,MycoBank,False
2,Absidia anomala,-,Hesseltine & J.J. Ellis,MycoBank,True
3,Absidia anomala,,H. Naganishi & Hirahara,IndexFungorum,True
4,Absidia blakesleeana,Lichtheimia hyalospora,Lendner,MycoBank,False
5,Absidia californica,Absidia californica,J.J. Ellis & Hesseltine,MycoBank,True
6,Abortiporus biennis,Absidia coerulea,Bainier,IndexFungorum,True
7,Absidia corymbifera,Lichtheimia corymbifera,(Cohn) Saccardo & Trotter,MycoBank,False
8,Absidia cuneospora,Absidia cuneospora,G.F. Orr & Plunkett,MycoBank,False
9,Absidia cylindrospora var. cylindrospora,Absidia cylindrospora var. cylindrospora,,MycoBank,False


In [51]:
unique_names = df['taxon_name'].unique()
unique_names.sort()
unique_names

array(['Abortiporus biennis', 'Absidia anomala', 'Absidia blakesleeana',
       'Absidia californica', 'Absidia corymbifera', 'Absidia cuneospora',
       'Absidia cylindrospora var. cylindrospora',
       'Absidia cylindrospora var. nigra',
       'Absidia cylindrospora var. rhizomorpha', 'Absidia glauca',
       'Absidia gracilis', 'Absidia griseola', 'Absidia hesseltinei',
       'Polyporus biennis'], dtype=object)

In [56]:
for name in unique_names:
    group = df.loc[df['taxon_name'] == name]
    new_group = group.drop_duplicates('current_taxon_name')
    print(new_group)
    print('--------------------')

             taxon_name   current_taxon_name         authors         source  \
0   Abortiporus biennis                    -  (Bull.) Singer       MycoBank   
6   Abortiporus biennis     Absidia coerulea         Bainier  IndexFungorum   
17  Abortiporus biennis  Absidia heterospora         Y. Ling           ICTF   

    taxon_name duplicated  
0                    True  
6                    True  
17                   True  
--------------------
        taxon_name current_taxon_name                  authors         source  \
2  Absidia anomala                  -  Hesseltine & J.J. Ellis       MycoBank   
3  Absidia anomala                NaN  H. Naganishi & Hirahara  IndexFungorum   

   taxon_name duplicated  
2                   True  
3                   True  
--------------------
             taxon_name      current_taxon_name  authors    source  \
4  Absidia blakesleeana  Lichtheimia hyalospora  Lendner  MycoBank   

   taxon_name duplicated  
4                  False  
---------

In [None]:
# Convert column format
#pd.to_numeric(agr_land_area['Value'])