In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pyreadr

In [6]:
# Load the 'proj_time' table stored in the longitudinal RData file and preview its first rows
rdata_path = "./Data/longitudinal.RData"
raw_data = pyreadr.read_r(rdata_path)['proj_time']
raw_data.head()

Unnamed: 0,otu_id,count,project_id,sample_id,run_id,classification,host_id,host_age,date_collection,experiment_day,samplesite,nreads
0,103181,21.0,ERP021896,ERS1579905,ERR1868749,oralcavity,M3,32.43,2009-04-20,181,Tongue,43753.0
1,219738,19.0,ERP021896,ERS1579905,ERR1868749,oralcavity,M3,32.43,2009-04-20,181,Tongue,43753.0
2,261104,1.0,ERP021896,ERS1579905,ERR1868749,oralcavity,M3,32.43,2009-04-20,181,Tongue,43753.0
3,47921,5.0,ERP021896,ERS1579905,ERR1868749,oralcavity,M3,32.43,2009-04-20,181,Tongue,43753.0
4,5336,471.0,ERP021896,ERS1579905,ERR1868749,oralcavity,M3,32.43,2009-04-20,181,Tongue,43753.0


**project_id**: identifies projects\
**classification**: identifies the environments considered within a project\
**samplesite**: identifies the specific site from which a sample is taken\
**sample_id**: identifies samples (communities) gathered within the same environment\
**run_id**: identifies the machine run used to identify the species in a sample. Multiple runs are possible for the same sample\
**otu_id**: identifies species\
**count**: number of reads (individuals) assigned to the species with the given otu_id in a single run\
**nreads**: total number of reads (individuals of all species) observed in a single run\
**host_id**: identifies host of the sample\
**host_age**: age of the host\
**date_collection**: day of sample collection in format YYYY-MM-DD\
**experiment_day**: number of days elapsed since the beginning of data collection

In [7]:
# List the distinct environment labels found in the 'classification' column (np.unique returns them sorted)
np.unique(raw_data['classification'])

array(['feces', 'oralcavity', 'skin'], dtype=object)

In [9]:
# Some rows have 'nreads' and 'count' inverted (I suppose: the count of a single species
# should never exceed the total of its run), so the two values are swapped back on those rows.
# NOTE: this edits raw_data in place. Re-running the cell is harmless, because after the swap
#       no row satisfies nreads < count any more.
mask = raw_data['nreads'] < raw_data['count']
raw_data.loc[mask, ['nreads', 'count']] = raw_data.loc[mask, ['count', 'nreads']].values

# Further analysis is done only for projects with a 'sufficient' number of samples and
# for runs which detected a 'sufficient' number of reads.
# The order of these operations matters: samples are counted per project BEFORE low-read runs
# are dropped, so a project is kept or discarded based on all of its samples, including those
# whose runs are removed by the second filter. Swapping the two steps could discard more projects.
# NOTE(review): the grouping key is 'project_id', not 'classification' -- confirm that the project
#               (and not the environment) is the intended unit for the sample-count threshold.
min_samples = 50   # a project is kept only if it has strictly more than this many distinct samples
min_nreads = 1e4   # a run is kept only if it has strictly more than this many reads
filtered_data = raw_data.groupby('project_id').filter(lambda x: x['sample_id'].nunique() > min_samples)
filtered_data = filtered_data[filtered_data['nreads'] > min_nreads]

In [10]:
# Build one table per environment. Inside each table the 'classification' column is
# constant, so it is dropped, and the relative frequency f = count / nreads is added.
# The list order is fixed: feces, oralcavity, skin.
dfs = [
    filtered_data[filtered_data['classification'] == env]
    .drop(columns=['classification'])
    .assign(f=lambda frame: frame['count'] / frame['nreads'])
    for env in ('feces', 'oralcavity', 'skin')
]

# Unpack the per-environment tables into their own names
# NOTE: Each environment can still have samples from different projects. Since I don't have any information about
#       the specific methodologies used in different experiments, I treat them as statistically equivalent.
FECES, ORALCAVITY, SKIN = dfs