# Plot informative positions in a FASTA file

### This notebook accompanies the paper "Illuminating Genetic Mysteries of the Dead Sea Scrolls"
#### Author: Moran Neuhof

This notebook produces the plot shown in Figure S5D of the paper.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from Bio import SeqIO
from matplotlib.colors import LinearSegmentedColormap
%matplotlib inline

# Global plotting / printing configuration for the notebook.
sns.set_context("paper")
# NOTE(review): sns.set() accepts its own `context` argument; if its default is applied
# here it would replace the "paper" context selected on the line above — confirm against
# the seaborn documentation before relying on the "paper" scaling.
sns.set(style="white", palette="muted", color_codes=True)
# NumPy printing: 5-digit precision, scientific notation suppressed
np.set_printoptions(precision=5, suppress=True) 

In [None]:
def is_value(letter):
    """Flag whether an alignment character carries information.

    Returns 0 when `letter` is the gap character '-', and 1 for any other
    character (typically one of the nucleotides A, G, C, T).
    """
    return 0 if letter == '-' else 1

In [None]:
# Folder that holds the input files. It is left as None here and must be replaced with a
# path string before running: the loading cell passes it to os.path.join().
working_folder = None  # folder containing files

Load fasta file and specimen names file:

In [None]:
# load fasta records
fasta_fname = 'flex_collapsed_noTransitions_allGenome_16.fasta'  # input fasta file name
input_name = os.path.join(working_folder, fasta_fname)
records = list(SeqIO.parse(input_name, "fasta"))  # read every record into memory

# load specimen names: the display name is the second comma-separated field of each line
specimen_filename = os.path.join(working_folder, 'titles_w_outgroup_sheep_only.csv')  # attached to this notebook
with open(specimen_filename, 'r') as infile:
    # skip blank lines, which would otherwise raise an IndexError
    # when the second field is accessed
    real_columns = [line.strip().split(',')[1] for line in infile if line.strip()]

# The names are used as row labels for the records, so there must be exactly one per record.
# NOTE(review): this assumes the CSV rows are in the same order as the fasta records — confirm.
if len(real_columns) != len(records):
    raise ValueError(
        f"{specimen_filename} lists {len(real_columns)} specimen names, "
        f"but {input_name} contains {len(records)} records"
    )

# record ids as written in the fasta file (kept for reference;
# real_columns holds the names used as row labels instead)
index_list = [record.id for record in records]

In [None]:
# Encode each fasta record as one row of 0/1 flags, as returned by is_value() per character.
list_of_binary_lists = [list(map(is_value, record.seq)) for record in records]
# Stack the rows into a 2-D array and label each row with its specimen name.
binary_df = pd.DataFrame(data=np.array(list_of_binary_lists), index=real_columns)

In [None]:
# Draw the 0/1 matrix as an image: one row per specimen, one column per position.
fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
ax.imshow(binary_df, aspect='auto')

# one y tick per row, labelled with the specimen name from the DataFrame index
ax.set_yticks(range(len(binary_df)))
ax.set_yticklabels(binary_df.index, fontsize=10)

ax.set_xlabel('position', fontsize=15)
ax.set_ylabel('specimen', fontsize=15)
fig.tight_layout();

# Uncomment to save the figure next to the input files:
# fig.savefig(os.path.join(working_folder, "binary_positional_information_in_fasta.png"))