Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ venv.bak/
.spyproject
.idea
.vscode
*.swp

# Rope project settings
.ropeproject
Expand All @@ -137,3 +138,5 @@ allure_report/

# cython temp files
grim/**/*.c

output/
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ docker: docker-build ## build a docker image and run the service

install: clean ## install the package to the active Python's site-packages
pip install --upgrade pip
python3 setup.py build_ext --inplace
python setup.py install
pip install -r requirements.txt
pip install -r requirements-tests.txt
Expand Down
4 changes: 2 additions & 2 deletions grim/conf/README.md → conf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
| --- | --- |
| populations | The populations whose frequencies are considered. |
| priority | The coefficient values that define the priority matrix. |
| loci_map| Loci full name Mapping for indexes. |
| freq_trim_threshold | The numerator in the frequency threshold. |
| loci_map| Loci full name Mapping for indexes. |
| freq_trim_threshold | The numerator in the frequency threshold. |
| factor_missing_data | factor to haplotype frequency in plan B in missing data case |
| Plan_B_Matrix | Matrix arranged by the most probable possibilities for recombination. The first element in the matrix should be the full haplotype. The indexes correspond to loci_map. |
| planb| True - use plan B and C. False - use only Plan A. |
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"populations": [
"FILII",
"NAMER"
"CAU"
],
"freq_trim_threshold": 1e-5,
"priority": {
Expand Down Expand Up @@ -37,12 +36,15 @@
"number_of_pop_results": 100,
"output_MUUG": true,
"output_haplotypes": true,
"graph_files_path": "output/csv" ,
"freq_data_dir": "data/freqs" ,
"pops_count_file": "graph_generation/output/pop_ratio.txt" ,
"freq_file": "graph_generation/output/hpf.csv" ,
"graph_files_path": "graph_generation/output/csv/" ,
"node_csv_file": "nodes.csv",
"edges_csv_file": "edges.csv",
"info_node_csv_file": "info_node.csv",
"top_links_csv_file": "top_links.csv",
"imputation_in_file": "validation/simulation/data/simulated_donor.csv",
"imputation_in_file": "data/subjects/donor.csv",
"imputation_out_umug_freq_filename": "don.umug",
"imputation_out_umug_pops_filename": "don.umug.pops",
"imputation_out_hap_freq_filename": "don.pmug",
Expand Down
Binary file added data/freqs/CAU.freqs.gz
Binary file not shown.
1 change: 1 addition & 0 deletions data/subjects/donor.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
D1,A*01:02+A*02:01/A*03:01^B*15:01+B*15:01,CAU,CAU
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,3 @@ $ cut -f1,2 -d',' output/csv/nemo/edges.csv |sort |uniq -c |sort -rn |more
539 117913,117365
539 117884,117365
515 117918,117370

Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,25 @@
```

- Python 3
- On MacOS install with
- On MacOS install with
```
brew install python3
```

- Install Neo4J
- On MacOS install with
- On MacOS install with
```
brew install neo4j
```

- Setup NEO4J_HOME

Point NEO4J_HOME to the root of the NEO4J directory.
```
export NEO4J_HOME=/usr/local/Cellar/neo4j/3.2.2/libexec
```

### Linux
### Linux
- JDK 8
- Install JDK 1.8 from Oracle
- add JAVA_HOME to ~/.bash_profile
Expand All @@ -51,11 +51,11 @@
```

- Point NEO4J_HOME to the root of the uncompressed NEO4J directory and add the following line to ~/.bash_profile

```
export NEO4J_HOME=path/to/neo4j-community-3.5.7
```



# Using Makefile
Expand Down Expand Up @@ -99,12 +99,12 @@ make nemo

To use a different set of frequencies use the following procedure:

- Starting in the graph generator directory, convert the data from frequency format to hpf (haplotype, population, frequency).
```
python nemo_to_hpf_csv.py
- Starting in the graph generator directory, convert the data from frequency format to hpf (haplotype, population, frequency).
```
python nemo_to_hpf_csv.py
```

- This program looks for a data/NEMO2011 directory and reads the individual frequency files and generates this csv:
- This program looks for a data/NEMO2011 directory and reads the individual frequency files and generates this csv:
```
output/hpf.csv
```
Expand All @@ -120,4 +120,3 @@ To use a different set of frequencies use the following procedure:
└── top_links.csv
```
Note: there is an option to trim the frequency set below a frequency threshold. If the trimming threshold is 1e-6 it will take 9m35s to generate the graph csv files on a mid-2015 MacBook Pro (2.5 GHz Intel Core i7) and will result in 1,088,817 nodes (159MB), 14,868,976 edges (2.0GB) and 5,947,591 top links (108MB).

Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,3 @@ I used this one (**) as the single method to rule them all.
It generates multiple CP links between nodes (with different values).
This makes the graph traversal ambiguous.
Not good.

92 changes: 92 additions & 0 deletions graph_generation/generate_hpf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import csv
import gzip
import json
import pathlib
import argparse

# Script: convert per-population gzipped frequency files into a single
# HaplotypePopulationFrequency (hpf) csv file, and write a population
# count/ratio file alongside it.
#
# Inputs (all read from the JSON configuration file):
#   populations      - list of population codes; one "<pop>.freqs.gz" is read per code
#   freq_data_dir    - directory holding the "<pop>.freqs.gz" files
#   graph_files_path - output directory that is created if missing
#   pops_count_file  - path of the "pop,count,ratio" text file to write
#   freq_file        - path of the hpf csv file to write

# Every path taken from the configuration is prefixed with this directory.
project_dir = "./"

parser = argparse.ArgumentParser()
parser.add_argument(
    "-c",
    "--config",
    required=False,
    default="../../conf/minimal-configuration.json",
    help="Configuration JSON file",
    type=str,
)

args = parser.parse_args()
configuration_file = args.config

# Read configuration file and load properties
with open(configuration_file) as f:
    conf = json.load(f)

pops = conf.get("populations")
freq_data_dir = project_dir + conf.get("freq_data_dir")
output_dir = project_dir + conf.get("graph_files_path")
# NOTE: despite its name this is the path of a file, not of a directory.
pop_ratio_dir = project_dir + conf.get("pops_count_file")
# Output in HaplotypePopulationFrequency (hpf) csv file
freq_file = project_dir + conf.get("freq_file")


# Create output directory if it doesn't exist. parents=True so that a nested
# path such as "graph_generation/output/csv/" works on a fresh checkout.
pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
# The two output files may live outside output_dir; make sure their
# directories exist too so the open(..., "w") calls below cannot fail on a
# missing directory.
for out_path in (pop_ratio_dir, freq_file):
    pathlib.Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Display the configurations we are using
print(
    "****************************************************************************************************"
)
print("Conversion to HPF file based on following configuration:")
print("\tPopulation: {}".format(pops))
print("\tFrequency File Directory: {}".format(freq_data_dir))
print("\tOutput File: {}".format(freq_file))
print(
    "****************************************************************************************************"
)

haplist_overall = {}  # list of haplotypes across all populations
# Maps (population, haplotype) -> frequency. A tuple key is used instead of a
# "pop-haplotype" string so that names containing "-" cannot be mis-split
# when the csv is written.
pop_hap_combos = {}

# One accumulated count per population, in the same order as `pops`.
list_pop_count = []
#### Load initial frequency files
for pop in pops:
    in_freq_file = freq_data_dir + "/" + pop + ".freqs.gz"
    print("Reading Frequency File:\t {}".format(in_freq_file))
    count_pop = 0
    # Text mode lets us stream the file line by line instead of loading and
    # decoding all of it up front.
    with gzip.open(in_freq_file, "rt", encoding="utf8") as zf:
        for raw_line in zf:
            hap_line = raw_line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file);
            # they cannot be unpacked into three fields.
            if not hap_line:
                continue
            haplotype, count, freq = hap_line.split(",")
            # Header row
            if haplotype == "Haplo":
                continue
            freq = float(freq)
            # Ignore lines with 0 freq
            if freq == 0.0:
                continue

            haplist_overall[haplotype] = 1
            pop_hap_combos[(pop, haplotype)] = freq

            # Only haplotypes with a non-zero frequency contribute to the count.
            count_pop += float(count)
    list_pop_count.append(count_pop)

# Write "pop,count,ratio" lines, where ratio is the population's share of the
# total count over all populations.
sum_pops = sum(list_pop_count)
with open(pop_ratio_dir, "w") as pop_ratio_file:
    for pop, ratio in zip(pops, list_pop_count):
        pop_ratio_file.write("{},{},{}\n".format(pop, ratio, (ratio / sum_pops)))


header = ["hap", "pop", "freq"]


print("Writing hpf File:\t {}".format(freq_file))
# newline="" is what the csv module expects so that it controls line endings
# itself on every platform.
with open(freq_file, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=",", quoting=csv.QUOTE_NONE)
    csv_writer.writerow(header)
    for (pop, haplotype), freq in pop_hap_combos.items():
        csv_writer.writerow([haplotype, pop, freq])
Original file line number Diff line number Diff line change
Expand Up @@ -243,16 +243,10 @@ def generate_graph(
pops = em_pop
freq_trim = conf.get("freq_trim_threshold")

freq_file = conf.get("freq_file", "default")
if freq_file == "default":
freq_file = os.path.dirname(os.path.realpath(__file__)) + "/output/hpf.csv"
freq_file = conf.get("freq_file")
dict_count_of_pop = {}

pop_ratio_dir = conf.get(
"pops_count_file",
os.path.dirname(os.path.realpath(__file__))
+ "/imputation/graph_generation/output/pop_ratio.txt",
)
pop_ratio_dir = conf.get("pops_count_file")
path = pathlib.Path(pop_ratio_dir)

if em or not path.is_file():
Expand Down
Loading