# Methanotrophy

In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before each cell is
# executed, so edits to the local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from utils import * 
from matrix import *
from plot import *
import os

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Loading data

In [3]:
# Directory holding metadata.csv, counts.tsv and taxonomy.tsv.
# Set the METHANOTROPHY_DATA_DIR environment variable to run the notebook on another machine;
# when it is unset, the original author's local path is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get('METHANOTROPHY_DATA_DIR', '/home/prichter/Documents/data/methanotrophy')

In [4]:
# Locate the three input files inside DATA_DIR.
metadata_path = os.path.join(DATA_DIR, 'metadata.csv')
counts_path = os.path.join(DATA_DIR, 'counts.tsv')
taxonomy_path = os.path.join(DATA_DIR, 'taxonomy.tsv')

# Each file is read by its own project loader.
metadata_df = dataframe_from_metadata(metadata_path)
counts_df = dataframe_from_counts(counts_path)
taxonomy_df = dataframe_from_taxonomy(taxonomy_path)

# List the columns of every table so the available fields are visible in the notebook output.
for file_label, loaded_df in (
    ('Fields in metadata.csv:', metadata_df),
    ('Fields in counts.tsv:', counts_df),
    ('Fields in taxonomy.tsv:', taxonomy_df),
):
    print(file_label, ', '.join(loaded_df.columns))

Fields in metadata.csv: sample, serial_code, soil_depth, site, season, flux_ch4, temp_air, temp_soil, water_content, bulk_density
Fields in counts.tsv: asv, sample, count
Fields in taxonomy.tsv: domain, phylum, class, order, family, genus, species, domain_sub, phylum_sub, class_sub, order_sub, family_sub, genus_sub, species_sub, asv


In [5]:
# Find samples that have count data but no metadata row. Comparing the two 'sample' columns
# as sets ignores duplicates and ordering.
missing_samples = set(counts_df['sample'].values) - set(metadata_df['sample'].values)
# Boolean mask over the rows of counts_df, computed once and reused for the report and the filter.
is_missing = counts_df['sample'].isin(missing_samples)

if missing_samples:
    # Sort the names so the message is identical from run to run (set iteration order is arbitrary).
    print(f"Samples {', '.join(sorted(map(str, missing_samples)))} are present in the counts.tsv file, but not in metadata.")
    print(f"Dropping {is_missing.sum()} rows in the counts data which do not have associated metadata.")
else:
    # Reached, for example, when this cell is re-run after counts_df has already been filtered.
    print("All samples in the counts.tsv file have associated metadata; no rows dropped.")
counts_df = counts_df[~is_missing] # Remove rows in counts_df which don't have associated metadata.

# Combine the data across files into a single DataFrame: counts are joined to metadata on 'sample'
# (merge's default inner join), then to taxonomy on 'asv' with a left join so that count rows
# without a taxonomy entry are kept. The 'sample' column is then dropped; per the original note
# it is redundant with the serial code.
df = counts_df.merge(metadata_df, on=['sample']).merge(taxonomy_df, on=['asv'], how='left').drop(columns='sample')

Samples HDK22-KML-sand-wet, HDK22-KML-sand-dry are present in the counts.tsv file, but not in metadata.
Dropping 192136 rows in the counts data which do not have associated metadata.


## Analysis

### Are there any relationships between taxa and methane flux?

In [6]:
# Build the count matrix from the merged DataFrame.
# TODO: generating the count matrix is currently extremely slow; investigate whether it can be sped up.
m = AsvMatrix(df)

### Which environmental factors are predictive of methane flux?

In [15]:
# Plot rarefaction curves for the count matrix. This cell is slow: the recorded run shows a
# progress bar over 103 items at roughly 4.5 s each.
# NOTE(review): `path` is presumably where the figures are written -- confirm against the plot
# module. It is a hard-coded absolute path on the author's machine; consider making it configurable.
plot_rarefaction_curves(m, path='/home/prichter/Documents/methanotrophy/figures')

plot.plot_rarefaction_curves:   0%|          | 0/103 [00:00<?, ?it/s]

plot.plot_rarefaction_curves:  39%|███▉      | 40/103 [01:34<04:44,  4.51s/it]