Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyard/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@
from .pyard import ARD

__author__ = """NMDP Bioinformatics"""
__version__ = '0.6.5'
__version__ = '0.6.6'
142 changes: 80 additions & 62 deletions scripts/pyard-reduce-csv
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -56,19 +56,6 @@ def is_P(allele: str) -> bool:
return False


def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Normalize and reduce an allele value taken from a CSV column.

    :param allele: raw allele string from the CSV cell (may be empty,
        may be a '/'-separated allele list)
    :param column_name: CSV column name of the form "<prefix>_<locus>",
        from which the locus is extracted
    :return: the reduced allele (or allele list), or the input unchanged
        when it is empty/falsy
    """
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        # Column names look like "<prefix>_<locus>", e.g. "R_DRB1"
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, apply reduce to each allele.
        # BUG FIX: the original used map(reduce, allele.split('/'), locus),
        # which treats the locus *string* as a second iterable — reduce() was
        # called with one character of the locus per allele, and iteration
        # stopped at the shorter of the two sequences. Bind locus instead.
        if '/' in allele:
            return "/".join(reduce(a, locus) for a in allele.split('/'))
        return reduce(allele, locus)
    return allele


def should_be_reduced(allele, locus_allele):
if is_serology(allele):
return ard_config["reduce_serology"]
Expand Down Expand Up @@ -96,8 +83,10 @@ def should_be_reduced(allele, locus_allele):
return False


def reduce(allele, locus):
def reduce(allele, locus, column_name):
# Does the allele name have the locus in it ?
if allele == '':
return allele
if '*' in allele:
locus_allele = allele
elif ard_config["locus_in_allele_name"]:
Expand All @@ -108,7 +97,15 @@ def reduce(allele, locus):
# Check the config if this allele should be reduced
if should_be_reduced(allele, locus_allele):
# print(f"reducing '{locus_allele}'")
reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
try:
reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
except RuntimeError as e:
if verbose:
print(e)
message = f"Failed reducing '{locus_allele}' in column {column_name}"
print(message)
failure_summary_messages.append(message)
return allele
# print(f"reduced to '{reduced_allele}'")
if reduced_allele:
if ard_config["keep_locus_in_allele_name"]:
Expand All @@ -129,6 +126,19 @@ def reduce(allele, locus):
return allele


def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Normalize and reduce an allele value taken from a CSV column.

    :param allele: raw allele string from the CSV cell (may be empty,
        may be a '/'-separated allele list)
    :param column_name: CSV column name of the form "<prefix>_<locus>",
        from which the locus is extracted; also forwarded to reduce()
        for failure reporting
    :return: the reduced allele (or allele list), or the input unchanged
        when it is empty/falsy
    """
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        # Column names look like "<prefix>_<locus>", e.g. "R_DRB1"
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, apply reduce to each allele.
        # BUG FIX: the original used
        #   map(reduce, allele.split('/'), locus, column_name)
        # which treats `locus` and `column_name` *strings* as extra iterables:
        # reduce() received one character of each per allele, and iteration
        # stopped at the shortest sequence. Bind both arguments instead so
        # every allele in the list is reduced with the full locus/column name.
        if '/' in allele:
            return "/".join(reduce(a, locus, column_name)
                            for a in allele.split('/'))
        return reduce(allele, locus, column_name)
    return allele


def create_drbx(row, locus_in_allele_name):
    """Map one DataFrame row's DRB3/DRB4/DRB5 typings to a DRBX pair.

    :param row: pandas row holding the DRB3/4/5 Type1/Type2 columns
    :param locus_in_allele_name: whether mapped alleles keep the locus prefix
    :return: the (DRBX_1, DRBX_2) mapping produced by drbx.map_drbx
    """
    typings = row.values
    return drbx.map_drbx(typings, locus_in_allele_name)

Expand Down Expand Up @@ -159,51 +169,59 @@ if __name__ == '__main__':
print(" pip install openpyxl")
sys.exit(1)

# Instantiate py-ard object with the latest
ard = pyard.ARD(remove_invalid=False)

# Read the Input File
# Read only the columns to be saved.
# Header is the first row
# Don't convert to NAs
df = pd.read_csv(ard_config["in_csv_filename"],
usecols=ard_config["columns_from_csv"],
header=0, dtype=str,
keep_default_na=False)

# Reduce each of the specified columns
for column in ard_config["columns_to_reduce_in_csv"]:
if verbose:
print(f"Column:{column} =>")
if ard_config["new_column_for_redux"]:
# insert a new column
new_column_name = f"reduced_{column}"
new_column_index = df.columns.get_loc(column) + 1
# Apply clean_locus function to the column and insert as a new column
df.insert(new_column_index, new_column_name,
df[column].apply(clean_locus, column_name=column))
# Instantiate py-ard object with the latest
ard = pyard.ARD(remove_invalid=False)

# Read the Input File
# Read only the columns to be saved.
# Header is the first row
# Don't convert to NAs
df = pd.read_csv(ard_config["in_csv_filename"],
usecols=ard_config["columns_from_csv"],
header=0, dtype=str,
keep_default_na=False)

failure_summary_messages = []
# Reduce each of the specified columns
for column in ard_config["columns_to_reduce_in_csv"]:
if verbose:
print(f"Column:{column} =>")
if ard_config["new_column_for_redux"]:
# insert a new column
new_column_name = f"reduced_{column}"
new_column_index = df.columns.get_loc(column) + 1
# Apply clean_locus function to the column and insert as a new column
df.insert(new_column_index, new_column_name,
df[column].apply(clean_locus, column_name=column))
else:
# Apply clean_locus function to the column and replace the column
df[column] = df[column].apply(clean_locus, column_name=column)

# Map DRB3,DRB4,DRB5 to DRBX if specified
# New columns DRBX_1 and DRBX_2 are created
if ard_config['map_drb345_to_drbx']:
drbx_loci = ['DRB3', 'DRB4', 'DRB5']
drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2
locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)

# Save as XLSX if specified
if ard_config["output_file_format"] == 'xlsx':
out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
df.to_excel(out_file_name, index=False)
else:
# Save as compressed CSV
out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])

if len(failure_summary_messages) == 0:
print("No Errors")
else:
# Apply clean_locus function to the column and replace the column
df[column] = df[column].apply(clean_locus, column_name=column)

# Map DRB3,DRB4,DRB5 to DRBX if specified
# New columns DRBX_1 and DRBX_2 are created
if ard_config['map_drb345_to_drbx']:
drbx_loci = ['DRB3', 'DRB4', 'DRB5']
drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2
locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)

# Save as XLSX if specified
if ard_config["output_file_format"] == 'xlsx':
out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
df.to_excel(out_file_name, index=False)
else:
# Save as compressed CSV
out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])

# Done
print(f"Saved result to file:{out_file_name}")
print("Summary")
print("-------")
for message in failure_summary_messages:
print("\t", message)
# Done
print(f"Saved result to file:{out_file_name}")
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.5
current_version = 0.6.6
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@

setup(
name='py-ard',
version='0.6.5',
version='0.6.6',
description="ARD reduction for HLA with Python",
long_description=readme + '\n\n' + history,
author="CIBMTR",
Expand Down