# Multi-Output Random Forest Regression
AIM: Identify the role of the abundances of clusters in the starting samples in determining the abundances of clusters in the final samples.

STEPS:
1. Rarefy the ASV table to a common depth of 10,000 reads (samples with fewer reads are removed first).
2. Create input dataframes for multioutput random forest regression. Combine the ASV table, metadata, and cluster data so that there are two data frames: one with the abundance of each cluster in the starting community, and another with the average abundance of each cluster across the replicates of each final community. The rows should be aligned (such that row 1 in the first data frame is for the parent sample of row 1 in the second data frame).
3. Perform multi-output random forest regression using scikit-learn.

### 1 - Rarefy to minimum number of reads

Importing libraries and data.

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
from biom import Table
from biom.util import biom_open
import matplotlib.pyplot as plt

# Loading the input tables; every file is read as tab-separated text
asv_table = pd.read_csv(
    '../data/seqtable_readyforanalysis.csv',
    sep='\t',
    index_col=0,  # first column becomes the row index
)
cluster_data = pd.read_csv("../data/max_tot_ext_network_table.tsv", sep='\t')
meta_data = pd.read_csv("../data/metadata_Time0D-7D-4M_May2022_wJSDpart_ext.csv", sep='\t')
taxonomy_data = pd.read_csv("../data/taxa_wsp_readyforanalysis.csv", sep='\t')

Visualising the reads to decide the rarefaction depth.

In [None]:
# Sequencing depth per sample: each row of the ASV table is summed across its columns
sample_read_counts = asv_table.sum(axis=1)
min_reads = sample_read_counts.min()  # smallest depth observed in any sample
num_samples_below_10000 = sample_read_counts.lt(10000).sum()  # samples under 10,000 reads

print(f"Minimum number of reads across samples: {min_reads}")
print(f"Number of samples with less than 10,000 reads: {num_samples_below_10000}")

# Histogram of the per-sample depths, used to pick the rarefaction depth
plt.hist(sample_read_counts, bins=50)
plt.title('Distribution of Read Counts Across Samples')
plt.xlabel('Read Counts')
plt.ylabel('Number of Samples')
plt.show()

Filtering out samples with fewer than 10,000 reads, and samples from the 4M experiment, from the ASV table and the metadata.

In [None]:
# Filtering out samples with less than 10,000 reads and removing experiment 4M samples (different experiment)

# Read depth is kept in its own Series instead of being stored as a 'reads' column of the
# ASV table: an extra column left in the table would be rarefied as if it were an ASV.
reads_per_sample = asv_table.sum(axis=1)
low_read_ids = reads_per_sample.index[reads_per_sample < 10000]

# Sample IDs that belong to the 4M experiment, looked up in the metadata
experiment_4m_ids = meta_data.loc[meta_data['Experiment'] == '4M', 'sampleid']

# Both tables are filtered by sample ID (ASV table index / metadata 'sampleid' column),
# so the result does not depend on the two tables sharing the same row order.
excluded_ids = set(low_read_ids).union(experiment_4m_ids)
asv_table = asv_table.loc[~asv_table.index.isin(excluded_ids)].rename_axis('sampleid')
meta_data = meta_data.loc[~meta_data['sampleid'].isin(excluded_ids)]

Rarefying the asv table.

In [None]:
# Number of reads every sample is subsampled to during rarefaction
rarefaction_depth = 10_000

# Function to rarefy a sample
def rarefy_vector(v, depth, rng=None, replace=True):
    """Subsample the read counts of one sample down to ``depth`` reads.

    Parameters
    ----------
    v : array-like
        Read counts of a single sample, one entry per ASV.
    depth : int
        Number of reads to draw.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global generator is used,
        so ``np.random.seed`` still controls the outcome.
    replace : bool, default True
        ``True`` draws reads with replacement, with probabilities proportional
        to the observed counts. ``False`` draws without replacement from the
        observed reads, so no ASV can end up with more reads than it started
        with; this requires integer counts and ``depth <= v.sum()``.

    Returns
    -------
    numpy.ndarray
        Array with the shape and dtype of ``v`` whose entries sum to ``depth``.
        ASVs that had zero reads keep zero reads.

    Raises
    ------
    ValueError
        If the sample has no reads, or if ``replace=False`` and ``depth``
        exceeds the number of reads available.
    """
    counts = np.asarray(v)
    total = counts.sum()
    if total <= 0:
        raise ValueError("Cannot rarefy a sample with no reads.")

    sampler = np.random if rng is None else rng
    non_zero_indices = np.nonzero(counts)[0]  # only ASVs with reads can be drawn

    if replace:
        probabilities = counts[non_zero_indices] / total
        subsampled = sampler.choice(non_zero_indices, size=depth, replace=True, p=probabilities)
    else:
        if depth > total:
            raise ValueError(
                f"Cannot draw {depth} reads without replacement from a sample with {total} reads."
            )
        # One entry per observed read, labelled with the index of its ASV
        reads = np.repeat(non_zero_indices, counts[non_zero_indices].astype(np.int64))
        subsampled = sampler.choice(reads, size=depth, replace=False)

    rarefied = np.zeros_like(counts)
    np.add.at(rarefied, subsampled, 1)  # unbuffered add: repeated indices are all counted
    return rarefied

# Function to rarefy a whole ASV table, one sample (row) at a time
def rarefy_table(df, depth):
    """Rarefy every row of ``df`` to ``depth`` reads.

    Each row is passed to ``rarefy_vector`` as a NumPy array. The rarefied rows
    are assembled into a new DataFrame that carries the index and columns of
    ``df``.
    """
    def _rarefy_row(row):
        return rarefy_vector(row.values, depth)

    per_sample = df.apply(_rarefy_row, axis=1)  # Series holding one rarefied array per sample
    return pd.DataFrame(per_sample.tolist(), index=df.index, columns=df.columns)

# Rarefying the asv table
# The subsampling draws from NumPy's global random generator, so it is seeded here
# to make the rarefied table identical on every Restart & Run All.
np.random.seed(42)
asv_table = rarefy_table(asv_table, rarefaction_depth)

### 2 - Create input dataframes for multioutput random forest regression

In [None]:
# Flipping the table so that ASVs become the rows and samples the columns
transposed_asv_table = asv_table.T

In [None]:
# Joining the cluster table onto the ASV rows (ASV names in the row index are matched
# against the 'ASV' column), then discarding the columns that are no longer needed
merged_df = (
    transposed_asv_table
    .merge(cluster_data, left_index=True, right_on='ASV')
    .drop(columns=['ASV', 'Set.x'])
)

In [None]:
# Adding up all rows that share a 'functionInk' cluster value, then flipping the table
# back so that each cluster is a column
merged_df = merged_df.groupby('functionInk').sum().T

In [None]:
# Previewing the first rows of the metadata instead of dumping the whole table
meta_data.head()

In [None]:
# Building a lookup table from each sample ID to the group it is averaged in later on:
#   - a sample that is its own parent keeps its own sample ID;
#   - any other sample is labelled '<parent>_descendant', so that all samples derived
#     from the same parent share one label.
# The table is built on a fresh frame: a plain `d4 = meta_data` is only an alias, and
# adding the 'descendant' column through it would silently modify meta_data as well.
is_descendant = meta_data['sampleid'] != meta_data['parent']
d4 = meta_data[['sampleid']].copy()
d4['descendant'] = meta_data['sampleid'].where(~is_descendant, meta_data['parent'] + '_descendant')

# Displaying the two-column lookup table ('sampleid', 'descendant')
d4

In [None]:
# Attaching the 'descendant' label to every sample of the cluster table; the sample IDs
# in the row index are matched against 'sampleid', which is dropped once the join is done
result_df = (
    merged_df
    .merge(d4, left_index=True, right_on='sampleid')
    .drop(columns=['sampleid'])
)
result_df

In [None]:
# Averaging the cluster abundances over all samples that share a 'descendant' label
# (i.e. across the replicates)
result_df = result_df.groupby(by='descendant').mean()
result_df

In [None]:
# Flipped view of the averaged table, with one row per cluster
transposed_result_df = result_df.T
transposed_result_df

In [None]:
# Splitting the averaged table by row label: labels containing 'descendant' are the
# final samples, all remaining labels are the parent samples
descendant_rows = result_df.loc[result_df.index.str.contains('descendant')]
non_descendant_rows = result_df.loc[~result_df.index.str.contains('descendant')]


In [None]:
# Final samples: mark every cluster column with a trailing 'k' and strip the
# '_descendant' marker from the row labels so that they match the parent labels
descendant_rows.columns = [f"{cluster}k" for cluster in descendant_rows.columns]
descendant_rows.index = [label.replace('_descendant', '') for label in descendant_rows.index]
descendant_rows

In [None]:
# Parent samples: mark every cluster column with a trailing 'p'
non_descendant_rows.columns = [f"{cluster}p" for cluster in non_descendant_rows.columns]
non_descendant_rows

In [None]:
# Joining parent ('p') and final ('k') columns side by side on their shared row labels
complete_df = pd.merge(
    non_descendant_rows,
    descendant_rows,
    left_index=True,
    right_index=True,
)
complete_df

In [None]:
# Collecting the parent ('p') and final ('k') column names, each list ordered by the
# integer that precedes the suffix letter
p_columns = sorted(
    filter(lambda name: name.endswith('p'), complete_df.columns),
    key=lambda name: int(name[:-1]),
)
k_columns = sorted(
    filter(lambda name: name.endswith('k'), complete_df.columns),
    key=lambda name: int(name[:-1]),
)

# Parent-sample cluster abundances, in sorted column order
parent_df = complete_df.loc[:, p_columns]
parent_df

In [None]:
# Final-sample cluster abundances, in sorted column order
descendant_df = complete_df.loc[:, k_columns]
descendant_df

# Random Forest Regression

In [None]:
# Importing modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Predictors are the parent-sample columns, responses the final-sample columns
predictors_df = parent_df
response_df = descendant_df

# Holding out 20% of the rows for testing (fixed seed for a repeatable split)
(
    predictors_df_train,
    predictors_df_test,
    response_df_train,
    response_df_test,
) = train_test_split(predictors_df, response_df, test_size=0.2, random_state=42)

# Fitting a 100-tree random forest on the training rows; all response columns are fitted together
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(predictors_df_train, response_df_train)

# Predicting the held-out rows
response_df_pred = model.predict(predictors_df_test)

# One mean squared error per response column ('raw_values' disables averaging over targets)
mse = mean_squared_error(
    response_df_test,
    response_df_pred,
    multioutput='raw_values',
)
print("Mean Squared Error for each target:", mse)


In [9]:
# Reading the comma-separated functions table into a data frame
function_data <- read.csv("../data/20151016_Functions_remainder.csv", sep = ",")


In [10]:
# Previewing the first rows instead of printing the whole data frame
head(function_data)

Community,Replicate,Plate,mgCO2.7,mgCO2.14,CPM7,CPM14,pgRPC.7,pgRPC.14,ATP1,⋯,ATP7,ATP14,mG7,mG14,mN7,mN14,mX7,mX14,mP7,mP14
<chr>,<int>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AE01,1,H,0.20322734,0.09567905,283600,174000,0.71659853,0.5498796,9709.8,⋯,5461.0,6594.4,15.6,7.2,8.4,4.8,5.2,5.6,486.4,446.8
AE01,2,H,0.28294331,0.53008543,65000,175800,4.35297395,3.0152755,9616.4,⋯,8188.4,7673.4,22.4,7.6,14.4,3.6,5.2,4.4,284.4,293.6
AE01,3,H,0.51798687,0.34344642,77200,105400,6.70967453,3.2585049,,⋯,6610.0,3790.4,117.2,22.8,26.4,21.6,12.8,5.6,122.8,250.8
AE01,4,H,0.02231621,0.14705997,505200,570200,0.04417303,0.2579094,3734.8,⋯,5770.0,4524.8,18.4,12.4,9.2,4.8,6.8,8.4,452.0,280.0
AE02,1,H,0.19418105,0.03626446,593400,33000,0.32723466,1.0989230,7490.0,⋯,6486.8,7807.4,37.2,17.6,17.2,12.0,15.2,10.4,702.4,506.0
AE02,2,H,0.40011826,0.27907576,41600,104400,9.61822753,2.6731394,8496.0,⋯,8255.6,7921.6,50.8,30.8,16.4,16.8,11.2,6.8,326.0,338.4
AE02,3,H,0.30696148,0.33679309,109400,72600,2.80586365,4.6390232,,⋯,6834.4,5375.6,69.6,26.0,18.0,17.6,13.6,8.0,72.4,157.2
AE02,4,H,0.02231621,0.13756921,491800,743600,0.04537660,0.1850043,2557.2,⋯,7939.0,5348.2,30.8,24.8,12.0,11.6,8.4,12.4,1093.6,604.8
AE03,1,H,0.27524630,0.05803298,119000,52800,2.31299414,1.0991095,4396.0,⋯,4266.0,4260.2,17.2,20.0,6.8,15.6,6.8,6.8,844.8,708.0
AE03,2,H,0.49729326,0.21897101,37000,34000,13.44035832,6.4403239,4697.4,⋯,4786.6,4983.6,16.8,11.2,7.6,8.0,5.6,5.2,435.2,357.6
