In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from scipy.stats import pmean

In [None]:
# Zero-based position of each rank prefix inside a split lineage.
# NOTE(review): the letters presumably stand for domain, phylum, class, order,
# family, genus and species -- confirm against the classifier's output format.
taxonomic_levels = {
    prefix: depth
    for depth, prefix in enumerate(("d", "p", "c", "o", "f", "g", "s"))
}

In [None]:
def fix_index(df: pd.DataFrame, index_col: int) -> pd.DataFrame:
    """Return a new frame indexed by the ``index_col``-th column of ``df``.

    The input frame is left untouched, so the function can be re-run on the
    same object and always gives the same answer.

    Parameters
    ----------
    df : pd.DataFrame
        Table to re-index.
    index_col : int
        Position of the column that becomes the index. Columns to its left
        are discarded. When it is ``0`` the columns ``even_stag``, ``run``,
        ``in_out`` and ``v_region`` are removed from the result as well.

    Returns
    -------
    pd.DataFrame
        The re-indexed table.

    Raises
    ------
    KeyError
        If ``index_col == 0`` and any of the four columns listed above is
        missing from ``df``.
    """
    if index_col == 0:
        indexed = df.set_index(df.columns[0])
        return indexed.drop(columns=['even_stag', 'run', 'in_out', 'v_region'])

    # Work on the slice through non-inplace calls only: in-place operations
    # on an ``iloc`` slice are what pandas' chained-assignment checks guard
    # against, and they make the result depend on hidden state.
    trimmed = df.iloc[:, index_col:]
    return trimmed.set_index(trimmed.columns[0])

def determine_taxonomic_rank(split_classification: list, taxon_level: str) -> str:
    """Pick the label at ``taxon_level`` out of a split lineage.

    Parameters
    ----------
    split_classification : list
        Pieces of one lineage string, one entry per rank, in the same order
        as the depths stored in the module-level ``taxonomic_levels``.
    taxon_level : str
        Key of ``taxonomic_levels`` naming the rank to extract.

    Returns
    -------
    str
        The entry at the requested depth when the lineage reaches that
        depth, otherwise the literal ``"Assigned_Higher"``.

    Raises
    ------
    KeyError
        If ``taxon_level`` is not a key of ``taxonomic_levels``.
    """
    depth = taxonomic_levels[taxon_level]
    if len(split_classification) > depth:
        # A lineage resolved beyond the requested rank still carries that
        # rank's label at position ``depth``; it must not be reported as
        # "Assigned_Higher" just because more ranks follow it.
        return split_classification[depth]
    return "Assigned_Higher"
    
def clean_columns(df: pd.DataFrame, delimiter_pattern=";", transpose=False, taxon_level="g"):
    """Relabel one axis of ``df`` in place with the taxon name at ``taxon_level``.

    Every label on the chosen axis is split with the regular expression
    ``delimiter_pattern`` and the pieces are handed to
    ``determine_taxonomic_rank``. A result of ``'__'`` or
    ``f'{taxon_level}__'`` is replaced by ``"Assigned_Higher"``.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose labels are rewritten. The object itself is modified.
    delimiter_pattern : str
        Regular expression separating the ranks inside a label.
    transpose : bool
        ``True`` to relabel the columns, ``False`` to relabel the index.
    taxon_level : str
        Rank key forwarded to ``determine_taxonomic_rank``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after relabelling. A preview of its first
        rows is shown with ``display``.
    """
    splitter = re.compile(delimiter_pattern)  # loop-invariant: compile once
    unnamed = ('__', f'{taxon_level}__')
    labels = df.columns.to_list() if transpose else df.index.to_list()

    new_labels = []
    for label in labels:
        org_name = determine_taxonomic_rank(splitter.split(label), taxon_level)
        new_labels.append("Assigned_Higher" if org_name in unnamed else org_name)

    if transpose:
        df.columns = new_labels
    else:
        df.index = new_labels

    display(df.head())
    return df


def clean_csv(root, f, delimiter_pattern=";", transpose=False, index_col=0, taxon_level="g"):
    """Load one CSV, relabel it by taxon and sum entries sharing a label.

    Parameters
    ----------
    root : str
        Directory containing the file.
    f : str
        File name inside ``root``.
    delimiter_pattern : str
        Regular expression forwarded to ``clean_columns``.
    transpose : bool
        ``True`` when the lineage labels are on the columns of the file,
        ``False`` when they are on the index.
    index_col : int
        Forwarded to ``fix_index``.
    taxon_level : str
        Forwarded to ``clean_columns``.

    Returns
    -------
    pd.DataFrame
        One row per distinct cleaned label, holding the sum of all entries
        that received that label. With ``transpose=True`` the result is
        therefore the transpose of the file's orientation.
    """
    print(f)
    df = pd.read_csv(os.path.join(root, f))

    df = fix_index(df, index_col)
    display(df.head())

    # Use the returned frame rather than relying on the helper mutating its
    # argument as a side effect.
    df = clean_columns(df, delimiter_pattern, transpose, taxon_level)

    # ``groupby(..., axis=...)`` is deprecated since pandas 2.1. Grouping the
    # transposed frame by its index gives the same table the old
    # ``groupby(axis=1).sum().T`` produced.
    if transpose:
        return df.T.groupby(level=0).sum()
    return df.groupby(level=0).sum()
        
# Per-file cleaned tables, one entry per input file found under ``set_2_dir``.
cleaned_df = []

# NOTE(review): absolute, machine-specific mount points -- consider moving
# these to a configuration cell so the notebook runs on another machine.
silva_dir = '/Volumes/TBHD/Bioinformatics/ion_torrent_qiime2_methods_manuscript/Classified_Tax_Counts_QIIME2_CSV_Files/Cutprimers/Species'
set_2_dir = '/Volumes/TBHD_share/multi_v_regions_tables/FirstTestSet_stool/data'

for root, dirs, filenames in os.walk(set_2_dir):
    # ``os.walk`` yields names in arbitrary order; sort for reproducible runs.
    for f in sorted(filenames):
        # Hidden entries (e.g. ``.DS_Store`` or ``._*`` sidecar files that
        # macOS writes on mounted volumes) are not data tables.
        if f.startswith("."):
            continue
        cleaned_df.append(clean_csv(root, f, delimiter_pattern=",", transpose=False, index_col=1, taxon_level="g"))

In [None]:
def extra_column_cleaning(df):
    """Cut every column name of ``df`` at its first ``_V<digit>`` occurrence.

    Only the text before the first match of ``_V\\d`` is kept; names without
    a match are left as they are. The labels are rewritten on ``df`` itself
    and that same object is returned.
    """
    region_tag = re.compile(r'_V\d')
    df.columns = [region_tag.split(name)[0] for name in df.columns.to_list()]
    return df

extra_clean = [extra_column_cleaning(df) for df in cleaned_df]

# Stack the per-file tables; a label that occurs in several files shows up as
# repeated index entries, and columns missing from a file become NaN.
out = pd.concat(extra_clean, axis=0)

# Group rows sharing an index label. Rows are the default grouping axis; the
# explicit ``axis`` keyword is deprecated since pandas 2.1.
groups = out.groupby(level=0)

# One output row per label: the column-wise power mean with exponent p=2 over
# the rows of that group, leaving NaN entries out of the calculation.
rows = []
for label, members in groups:
    m = pd.DataFrame(pmean(members, axis=0, p=2, nan_policy='omit')).T
    m.columns = members.columns
    m.index = [label]
    rows.append(m)

# Concatenate once instead of growing a frame inside the loop (quadratic, and
# concatenating onto an empty frame triggers a pandas FutureWarning).
final = pd.concat(rows, axis=0) if rows else pd.DataFrame()

display(final.head())
final.to_csv("final.csv")