In [1]:
import pandas as pd
from AtlasScraper import scrape_fly_atlas2

# Scrape Fly Atlas 2

The command below reads a list of genes from a CSV file (example_gene_list.csv) and queries the Fly Atlas 2 website/database for each gene in turn. The collected gene data is then written to an output file, specified here as output_1.csv.

In [2]:
# Input gene list and destination for the raw scraped text; the destination
# file is what the later cells read back in.
gene_list_path = 'example_gene_list.csv'
scraped_output_path = 'output_1.csv'
scrape_fly_atlas2(gene_list_path, scraped_output_path)

1 of 12 requested. 8.3% complete.
2 of 12 requested. 16.7% complete.
3 of 12 requested. 25.0% complete.
4 of 12 requested. 33.3% complete.
5 of 12 requested. 41.7% complete.
6 of 12 requested. 50.0% complete.
7 of 12 requested. 58.3% complete.
8 of 12 requested. 66.7% complete.
9 of 12 requested. 75.0% complete.
10 of 12 requested. 83.3% complete.
11 of 12 requested. 91.7% complete.
12 of 12 requested. 100.0% complete.


# Read in data and clean up table

The scraped data needs to be tidied up before further analysis because it is not yet in a tabular format. First, the file is read in one line per row; each line is then split on the tab delimiter and the resulting fields are expanded into separate columns.

In [3]:
# Load the scraper output as a one-column table (column 0), one row per line
# of the file. The lines are read directly instead of via
# pd.read_csv(..., sep='\n'): recent pandas releases reject the line
# terminator as a separator and raise ValueError.
with open('output_1.csv', encoding='utf-8') as raw_file:
    # Drop the trailing newline and skip completely empty lines, which is what
    # read_csv did by default (skip_blank_lines=True).
    # NOTE(review): read_csv also interpreted double-quote characters; this
    # assumes the scraped text contains no quoted fields - confirm.
    raw_lines = [line.rstrip('\n') for line in raw_file if line.rstrip('\n')]
df = pd.DataFrame({0: raw_lines})

In [4]:
# Column 0 holds one whole raw line per row. Split every line on tabs and
# spread the pieces across separate columns (expand=True), replacing the
# single-column frame with the wide one.
raw_records = df[0]
df = raw_records.str.split('\t', expand=True)

## Clean up data table

### Rename Columns

In [5]:
# Labels for the split columns, in order: the tissue name, then
# FPKM / SD / Enrichment for adult males and for adult females, the
# male-versus-female comparison, and FPKM / SD / Enrichment for larvae.
measures = ("FPKM", "SD", "Enrichment")
column_names = (
    ["Tissue"]
    + [f"Adult Male {measure}" for measure in measures]
    + [f"Adult Female {measure}" for measure in measures]
    + ["Male v. Female M/F", "Male v. Female p value"]
    + [f"Larval {measure}" for measure in measures]
)
# Relabel the positional columns; the list length must equal the number of
# columns produced by the split.
df.columns = column_names

### Extract FlyBase ID, Annotation Symbol, Symbol and Name

In [6]:
# Gene metadata is stored as ordinary rows: the first field ("Tissue") holds
# the metadata label and the second field ("Adult Male FPKM") holds its value.
# For each label, copy that value into a new column of the same name, on the
# matching rows only; every other row is left empty in the new column.
metadata_fields = ("FlyBase ID", "Annotation Symbol", "Symbol", "Name")
for field in metadata_fields:
    is_field_row = df["Tissue"] == field
    df.loc[is_field_row, field] = df["Adult Male FPKM"]

### Fill forward the new columns

In [7]:
# The metadata columns are only populated on the rows they were extracted
# from. Carry each value forward so the rows that follow inherit the most
# recent FlyBase ID, annotation symbol, symbol and name.
columns_to_fill_forward = ['FlyBase ID', 'Annotation Symbol', 'Symbol', 'Name']
for name in columns_to_fill_forward:
    # Series.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in current pandas releases.
    df[name] = df[name].ffill()

### Remove errors and unnecessary rows

In [8]:
# "Tissue" values that mark rows to discard: the scraper's error message, the
# metadata rows already copied into their own columns, repeated header rows,
# and rows whose first field is empty.
remove_rows_containing = [
    "An error has occurred.",
    "FlyBase ID",
    "Annotation Symbol",
    "Symbol",
    "Name",
    "Tissue",
    "",
]
is_unwanted_row = df["Tissue"].isin(remove_rows_containing)
df = df.loc[~is_unwanted_row]

In [9]:
# Preview the first ten cleaned rows to check the tidy-up worked.
df.head(n=10)

Unnamed: 0,Tissue,Adult Male FPKM,Adult Male SD,Adult Male Enrichment,Adult Female FPKM,Adult Female SD,Adult Female Enrichment,Male v. Female M/F,Male v. Female p value,Larval FPKM,Larval SD,Larval Enrichment,FlyBase ID,Annotation Symbol,Symbol,Name
7,Whole body,656.92,18.34,1.0,1301.8,157.3,1.0,0.5,p > 0.05,1498.4,178.6,1,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
8,Head,853.25,77.94,1.3,1545.94,255.86,1.19,0.55,p > 0.05,-,-,-,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
9,Eye,708.8,121.91,1.08,1607.38,151.26,1.23,0.44,n.s.,-,-,-,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
10,Brain / CNS,380.67,27.05,0.58,467.4,92.1,0.36,0.81,n.s.,1068.12,57.11,0.71,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
11,Thoracicoabdominal ganglion,726.38,31.6,1.11,737.81,25.77,0.57,0.98,n.s.,-,-,-,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
12,Crop,1059.34,21.4,1.61,1359.95,6.46,1.04,0.78,n.s.,-,-,-,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
13,Midgut,1033.11,99.54,1.57,1533.4,174.08,1.18,0.67,p > 0.05,1026.52,66.24,0.69,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
14,Hindgut,1075.3,29.37,1.64,1263.62,81.04,0.97,0.85,p > 0.05,1470.42,290.4,0.98,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
15,Malpighian Tubules,1107.58,78.83,1.69,1499.63,85.33,1.15,0.74,p > 0.01,1159.4,71.25,0.77,FBgn0024733,CG17521,RpL10,Ribosomal protein L10
16,Fat body,949.83,33.19,1.45,1375.22,119.37,1.06,0.69,p > 0.05,3290.06,261.23,2.2,FBgn0024733,CG17521,RpL10,Ribosomal protein L10


# Output file to CSV

The below command outputs the processed datatable to a csv file for analysis in other software if needed.

In [10]:
# Save the cleaned table. The row index is written as the first, unnamed
# column (index=True is the pandas default, spelled out here).
df.to_csv('processed_data.csv', index=True)