nmdp-bioinformatics · pbashyal-nmdp · Nov 28, 2022 · Nov 17, 2022 · Nov 17, 2022 · Nov 17, 2022
diff --git a/.gitignore b/.gitignore
@@ -116,6 +116,7 @@ venv.bak/
 .spyproject
 .idea
 .vscode
+*.swp
 
 # Rope project settings
 .ropeproject
@@ -137,3 +138,5 @@ allure_report/
 
 # cython temp files
 grim/**/*.c
+
+output/
diff --git a/Makefile b/Makefile
@@ -90,6 +90,7 @@ docker: docker-build ## build a docker image and run the service
 
 install: clean ## install the package to the active Python's site-packages
 	pip install --upgrade pip
+	python3 setup.py build_ext --inplace
 	python setup.py install
 	pip install -r requirements.txt
 	pip install -r requirements-tests.txt

diff --git a/grim/conf/README.md → conf/README.md b/grim/conf/README.md → conf/README.md
@@ -4,8 +4,8 @@
 | --- | --- |
 | populations | The populations whose frequencies are considered. |
 | priority | The coefficient values that define the priority matrix. |
-| loci_map| Loci full name Mapping for indexes. | 
-| freq_trim_threshold | The numerator in the frequency threshold. | 
+| loci_map| Loci full name Mapping for indexes. |
+| freq_trim_threshold | The numerator in the frequency threshold. |
 | factor_missing_data | factor to haplotype frequency in plan B in missing data case |
 | Plan_B_Matrix | Matrix arranged by the most probable possibilities for recombination. The first element in the matrix should be the full haplotype. The indexes correspond to loci_map. |
 | planb| True - use plan B and C. False - use only Plan A. |

diff --git a/grim/conf/minimal-configuration.json → conf/minimal-configuration.json b/grim/conf/minimal-configuration.json → conf/minimal-configuration.json
@@ -1,7 +1,6 @@
 {
   "populations": [
-    "FILII",
-      "NAMER"
+    "CAU"
   ],
   "freq_trim_threshold": 1e-5,
  "priority": {
@@ -37,12 +36,15 @@
   "number_of_pop_results": 100,
   "output_MUUG": true,
   "output_haplotypes": true,
-  "graph_files_path": "output/csv" ,
+  "freq_data_dir": "data/freqs" ,
+  "pops_count_file": "graph_generation/output/pop_ratio.txt" ,
+  "freq_file": "graph_generation/output/hpf.csv" ,
+  "graph_files_path": "graph_generation/output/csv/" ,
   "node_csv_file": "nodes.csv",
   "edges_csv_file": "edges.csv",
   "info_node_csv_file": "info_node.csv",
   "top_links_csv_file": "top_links.csv",
-  "imputation_in_file": "validation/simulation/data/simulated_donor.csv",
+  "imputation_in_file": "data/subjects/donor.csv",
   "imputation_out_umug_freq_filename": "don.umug",
   "imputation_out_umug_pops_filename": "don.umug.pops",
   "imputation_out_hap_freq_filename": "don.pmug",

diff --git a/data/freqs/CAU.freqs.gz b/data/freqs/CAU.freqs.gz
diff --git a/data/subjects/donor.csv b/data/subjects/donor.csv
@@ -0,0 +1 @@
+D1,A*01:02+A*02:01/A*03:01^B*15:01+B*15:01,CAU,CAU
diff --git a/...utation/graph_generation/LICENSE_INFO.rst → graph_generation/LICENSE_INFO.rst b/...utation/graph_generation/LICENSE_INFO.rst → graph_generation/LICENSE_INFO.rst
diff --git a/grim/imputation/graph_generation/Makefile → graph_generation/Makefile b/grim/imputation/graph_generation/Makefile → graph_generation/Makefile
diff --git a/grim/imputation/graph_generation/README.bug → graph_generation/README.bug b/grim/imputation/graph_generation/README.bug → graph_generation/README.bug
@@ -21,4 +21,3 @@ $ cut -f1,2 -d',' output/csv/nemo/edges.csv |sort |uniq -c |sort -rn |more
  539 117913,117365
  539 117884,117365
  515 117918,117370
-
diff --git a/grim/imputation/graph_generation/README.md → graph_generation/README.md b/grim/imputation/graph_generation/README.md → graph_generation/README.md
@@ -13,25 +13,25 @@
 		```
 
 - Python 3
-	- On MacOS install with 
+	- On MacOS install with
 		```
 		brew install python3
 		```
 
 - Install Neo4J
-	- On MacOS install with 
+	- On MacOS install with
 		```
 		brew install neo4j
 		```
 
 	- Setup NEO4J_HOME
-    
+
         Point NEO4J_HOME to the root of the NEO4J directory.
 		```
 		export NEO4J_HOME=/usr/local/Cellar/neo4j/3.2.2/libexec
 		```
 
-### Linux 
+### Linux
 - JDK 8
 	- Install JDK 1.8 from Oracle
 	- add JAVA_HOME to ~/.bash_profile
@@ -51,11 +51,11 @@
         ```
 
 	- Point NEO4J_HOME to the root of the uncompressed NEO4J directory and add the following line to ~/.bash_profile
-    
+
 		```
 		export NEO4J_HOME=path/to/neo4j-community-3.5.7
 		```
-        
+
 
 
 # Using Makefile
@@ -99,12 +99,12 @@ make nemo
 
 To use a different set of frequencies use the following procedure:
 
-- Starting in the graph generator directory, convert the data from frequency format to hpf (haplotype, population, frequency).  
-``` 
-   python nemo_to_hpf_csv.py 
+- Starting in the graph generator directory, convert the data from frequency format to hpf (haplotype, population, frequency).
+```
+   python nemo_to_hpf_csv.py
 ```
 
-- This program looks for a data/NEMO2011 directory and reads the individual frequency files and generates this csv: 
+- This program looks for a data/NEMO2011 directory and reads the individual frequency files and generates this csv:
 ```
     output/hpf.csv
 ```
@@ -120,4 +120,3 @@ To use a different set of frequencies use the following procedure:
 	└── top_links.csv
 ```
     Note: there is an option to trim the frequency set below a frequency threshold.  If the trimming threshold is 1e-6 it will take 9m35s to generate the graph csv files on a mid-2015 MacBook Pro (2.5 GHz Intel Core i7) and will result in 1,088,817 nodes (159MB), 14,868,976 edges (2.0GB) and 5,947,591 top links (108MB).
-
diff --git a/grim/imputation/graph_generation/README.old → graph_generation/README.old b/grim/imputation/graph_generation/README.old → graph_generation/README.old
@@ -14,4 +14,3 @@ I used this one (**) as the single method to rule them all.
 It generates multiple CP links between nodes (with different values).
 This makes the graph traversal ambiguous.
 Not good.
-
diff --git a/grim/imputation/graph_generation/__init__.py → graph_generation/__init__.py b/grim/imputation/graph_generation/__init__.py → graph_generation/__init__.py
diff --git a/graph_generation/generate_hpf.py b/graph_generation/generate_hpf.py
@@ -0,0 +1,92 @@
+"""Build the HaplotypePopulationFrequency (hpf) csv and the population ratio file.
+
+For every population listed under ``populations`` in the configuration, the
+gzipped file ``<freq_data_dir>/<pop>.freqs.gz`` is read.  Each data line is
+expected to be ``haplotype,count,freq``; the header line (first field
+``Haplo``), blank lines and lines whose frequency is 0 are skipped.
+
+Outputs:
+  * ``pops_count_file``: one ``pop,count,count/total`` line per population,
+    where ``count`` is the sum of the counts of the kept lines.
+  * ``freq_file``: csv with header ``hap,pop,freq``.
+"""
+import csv
+import gzip
+import json
+import pathlib
+import argparse
+
+# Prefix joined onto every path taken from the configuration file.
+project_dir = "./"
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-c",
+    "--config",
+    required=False,
+    # NOTE(review): this default points two directories up, while the paths in
+    # the configuration are resolved against project_dir ("./") -- confirm the
+    # intended working directory, or always pass --config explicitly.
+    default="../../conf/minimal-configuration.json",
+    help="Configuration JSON file",
+    type=str,
+)
+
+args = parser.parse_args()
+configuration_file = args.config
+
+# Read configuration file and load properties
+with open(configuration_file) as f:
+    conf = json.load(f)
+
+pops = conf.get("populations")
+freq_data_dir = project_dir + conf.get("freq_data_dir")
+output_dir = project_dir + conf.get("graph_files_path")
+pop_ratio_dir = project_dir + conf.get("pops_count_file")
+# Output in HaplotypePopulationFrequency (hpf) csv file
+freq_file = project_dir + conf.get("freq_file")
+
+
+# Create every directory this script writes into.  parents=True is required
+# because intermediate directories (e.g. a git-ignored "output/") may not
+# exist on a fresh checkout.
+for out_dir in (
+    pathlib.Path(output_dir),
+    pathlib.Path(pop_ratio_dir).parent,
+    pathlib.Path(freq_file).parent,
+):
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+# Display the configurations we are using
+print(
+    "****************************************************************************************************"
+)
+print("Conversion to HPF file based on following configuration:")
+print("\tPopulation: {}".format(pops))
+print("\tFrequency File Directory: {}".format(freq_data_dir))
+print("\tOutput File: {}".format(freq_file))
+print(
+    "****************************************************************************************************"
+)
+
+haplist_overall = {}  # haplotypes seen in any population (dict used as a set)
+# (pop, haplotype) -> freq.  A tuple key avoids having to split a joined
+# "pop-haplotype" string again, which breaks if either part contains "-".
+pop_hap_combos = {}
+
+list_pop_count = []  # summed counts, in the same order as pops
+#### Load initial frequency files
+for pop in pops:
+    in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
+    print("Reading Frequency File:\t {}".format(in_freq_file))
+    count_pop = 0
+    # Text mode lets gzip decode and stream the file line by line instead of
+    # loading and decoding the whole file up front.
+    with gzip.open(in_freq_file, "rt", encoding="utf8") as zf:
+        for hap_line in zf:
+            hap_line = hap_line.strip()
+            # Tolerate blank lines (e.g. a trailing empty line)
+            if not hap_line:
+                continue
+            haplotype, count, freq = hap_line.split(",")
+            if haplotype == "Haplo":
+                continue
+            freq = float(freq)
+            # Ignore lines with 0 freq
+            if freq == 0.0:
+                continue
+
+            haplist_overall[haplotype] = 1
+            pop_hap_combos[(pop, haplotype)] = freq
+
+            count_pop += float(count)
+    list_pop_count.append(count_pop)
+
+sum_pops = sum(list_pop_count)
+# Context manager guarantees the ratio file is flushed and closed.
+with open(pop_ratio_dir, "w") as pop_ratio_file:
+    for pop, ratio in zip(pops, list_pop_count):
+        pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))
+
+
+header = ["hap", "pop", "freq"]
+
+
+print("Writing hpf File:\t {}".format(freq_file))
+# newline="" is what the csv module expects so it controls line endings itself.
+with open(freq_file, mode="w", newline="") as csv_file:
+    csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
+    csv_writer.writerow(header)
+    for (pop, haplotype), freq in pop_hap_combos.items():
+        csv_writer.writerow([haplotype, pop, freq])
diff --git a/...ph_generation/generate_neo4j_multi_hpf.py → graph_generation/generate_neo4j_multi_hpf.py b/...ph_generation/generate_neo4j_multi_hpf.py → graph_generation/generate_neo4j_multi_hpf.py
@@ -243,16 +243,10 @@ def generate_graph(
         pops = em_pop
     freq_trim = conf.get("freq_trim_threshold")
 
-    freq_file = conf.get("freq_file", "default")
-    if freq_file == "default":
-        freq_file = os.path.dirname(os.path.realpath(__file__)) + "/output/hpf.csv"
+    freq_file = conf.get("freq_file")
     dict_count_of_pop = {}
 
-    pop_ratio_dir = conf.get(
-        "pops_count_file",
-        os.path.dirname(os.path.realpath(__file__))
-        + "/imputation/graph_generation/output/pop_ratio.txt",
-    )
+    pop_ratio_dir = conf.get("pops_count_file")
     path = pathlib.Path(pop_ratio_dir)
 
     if em or not path.is_file():

diff --git a/...h_generation/generate_neo4j_single_hpf.py → ...h_generation/generate_neo4j_single_hpf.py b/...h_generation/generate_neo4j_single_hpf.py → ...h_generation/generate_neo4j_single_hpf.py
diff --git a/...ation/graph_generation/nemo_to_hpf_csv.py → graph_generation/nemo_to_hpf_csv.py b/...ation/graph_generation/nemo_to_hpf_csv.py → graph_generation/nemo_to_hpf_csv.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		D1,A*01:02+A*02:01/A*03:01^B*15:01+B*15:01,CAU,CAU
Original file line number	Diff line number	Diff line change
Expand Up		@@ -21,4 +21,3 @@ $ cut -f1,2 -d',' output/csv/nemo/edges.csv |sort |uniq -c |sort -rn |more
		539 117913,117365
		539 117884,117365
		515 117918,117370
Original file line number	Diff line number	Diff line change
Expand Up		@@ -14,4 +14,3 @@ I used this one (**) as the single method to rule them all.
		It generates multiple CP links between nodes (with different values).
		This makes the graph traversal ambiguous.
		Not good.