# Turn Similarity/Positions Matrix into a circos plot

### This notebook accompanies the paper "Illuminating Genetic Mysteries of the Dead Sea Scrolls"
#### Author: Moran Neuhof

This notebook prepares the input files (a "genome file" and the connections files) from which Figure 7 of the paper is drawn with Circa.

In [1]:
# importing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

folder_join = os.path.join  # short alias, used below to build the input/output file paths

In [2]:
# Locations of the input workbooks; every path is built relative to `working_dir`.
working_dir = r"circos"  # folder holding the input and output files
pos_matrix, sim_matrix = "pos_matrix.xlsx", "sim_matrix.xlsx"  # shared-positions matrix, similarity matrix

sim_matrix_filename = folder_join(working_dir, sim_matrix)
pos_matrix_filename = folder_join(working_dir, pos_matrix)

In [3]:
def fix_columns_and_index(df):
    """Strip every single-quote character from the column and index labels.

    Labels are converted to ``str`` before the quotes are removed.
    ``df`` is relabelled in place and the same object is returned.
    """
    def _without_quotes(labels):
        # Cast first so that non-string labels (e.g. integers) can be processed too.
        return [label.replace("'", "") for label in labels.astype(str)]

    df.columns = _without_quotes(df.columns)
    df.index = _without_quotes(df.index)
    return df

In [4]:
# Read both matrices from their Excel workbooks (positions first, then similarity)
# and strip the single quotes that flank the row/column labels.
pos_df, sim_df = (
    fix_columns_and_index(pd.read_excel(workbook_path))
    for workbook_path in (pos_matrix_filename, sim_matrix_filename)
)

In [5]:
# Load the display names of the specimens: the second comma-separated field of
# every line in the titles file (file available with the notebook).
specimen_filename = os.path.join(working_dir, 'titles_w_outgroup_sheep_only_for_circos.csv')
with open(specimen_filename, 'r') as infile:
    # Blank lines (e.g. a trailing empty line) carry no name and would make
    # split(',')[1] raise IndexError, so they are skipped.
    real_columns = [line.strip().split(',')[1] for line in infile if line.strip()]

Calculate number of similar positions:

In [9]:
# Element-wise product: (number of positions) x (portion of shared positions)
# gives the number of similar positions for every pair of specimens.
similar_positions_df = fix_columns_and_index(pos_df * sim_df)

### Creating a "genome file"

For the "genome file", we will want a reference that holds the diagonal of the positions matrix:

In [10]:
# Size of each specimen's "chromosome": the diagonal of the positions matrix
# multiplied by 10, labelled with the specimen display names.
# NOTE(review): the rationale for the factor 10 is not documented in this notebook.
# (A previous, immediately overwritten assignment that used the raw matrix
# labels as index was dead code and has been removed.)
max_pos_per_sample_df = pd.Series(
    np.diag(pos_df) * 10,
    index=pd.Index(real_columns, name='Specimen'),
    name="Positions",
)

In [15]:
# Specimen names in a hand-written order (nothing is sorted programmatically);
# used below to reorder max_pos_per_sample_df before the genome file is written.
real_columns_sorted = ['Tibetan Sheep',
                       'dss001-unknown',
                       'dss002-unknown',
                       'dss003-unknown',
                       '4Q57 frg. 7V',
                       '4Q57 frg. 9V',
                       '11Q17',
                       'Mas1k',
                       '4Q404 frg. 5V',
                       'WS(4or7)',
                       '4Q404 frg. 8V',
                       '4Q405-36', 
                       '4Q57 frg. 6V',
                       '4Q59 frg. 25',
                       '4Q37 frg. 25',
                       '4Q72a',
                       '4Q37 frg. 28',
                       '4Q344',
                       'Hev/Se6',
                       'garment-99-9035',
                       '4Q57 frg. 25',
                       'WS4',
                       'WS7',
                       '4Q71-t',
                       '4Q404 frg. 9V',
                       '4Q57 frg. 12',
                       '4Q405-518',
                       '4Q71-nt',
                       'NewScroll-38',
                       'NewScroll-39',
                       'NewScroll-40']

# Modern/unknown specimens that are left out of the display: these labels are
# dropped from the reordered series before it is written to the genome file.
scrolls_to_leave_out_of_index = ['dss001-unknown',
                                 'dss002-unknown',
                                 'dss003-unknown',
                                 'Tibetan Sheep',
                                 'NewScroll-38',
                                 'NewScroll-39',
                                 'NewScroll-40']

Preparing the genome file in the right format:

In [17]:
# Reorder the chromosome sizes, drop the specimens that are not displayed and
# write the result as the tab-separated genome file.
max_pos_per_sample_df_sorted = max_pos_per_sample_df.loc[real_columns_sorted]
max_pos_per_sample_df_sorted_short = max_pos_per_sample_df_sorted.drop(scrolls_to_leave_out_of_index)
max_pos_per_sample_df_sorted_short.to_csv(
    folder_join(working_dir, "circa_chr_file_sorted_short.tsv"),
    sep='\t',
    header=True,  # boolean flag; the former string 'True' was only accepted because it is truthy
)

### Creating connections file

Plotting connections:
The output tab-separated (`.tsv`) file will look like this:  
```
sample1    shared_pos1     sample2     shared_pos2
```
This file contains the information about the connections between each pair of samples.

In [21]:
# Keep only the strict upper triangle of the similar-positions matrix (k=1 also
# zeroes the diagonal) and label rows and columns with the specimen display names.
upper_tri_df = pd.DataFrame(
    data=np.triu(similar_positions_df, k=1),
    index=real_columns,
    columns=real_columns,
)
upper_tri_df = fix_columns_and_index(upper_tri_df)

# Remember the specimen columns before the helper column is added.
sample_names = list(upper_tri_df.columns)
upper_tri_df['sample_name'] = upper_tri_df.index

In [22]:
# Reshape the matrix to long format: one row per (sample_name, sample2) pair
# with its number of shared positions, then discard the pairs whose value is zero.
melted_df = upper_tri_df.melt(
    id_vars='sample_name',
    value_vars=sample_names,
    var_name='sample2',
    value_name='shared_pos',
)
melted_df = melted_df.loc[melted_df['shared_pos'].ne(0)]

In [28]:
# Hand-ordered list of the displayed specimens (ordered for aesthetics);
# the connections are generated by iterating over the specimens in this order.
sample_line_ordered_list = ['4Q57 frg. 7V',
                            '4Q57 frg. 9V',
                            '11Q17',
                            'Mas1k',
                            '4Q404 frg. 5V',
                            '4Q57 frg. 6V',
                            '4Q59 frg. 25',
                            '4Q37 frg. 25',
                            '4Q72a',
                            '4Q37 frg. 28',
                            '4Q344',
                            'Hev/Se6',
                            'garment-99-9035',
                            '4Q57 frg. 25',
                            '4Q57 frg. 12',
                            'WS4',
                            'WS7',
                            'WS(4or7)',
                            '4Q404 frg. 9V',
                            '4Q404 frg. 8V',
                            '4Q405-518',
                            '4Q405-36', 
                            '4Q71-t',
                            '4Q71-nt']

In [29]:
# Build the connections files: for every pair of displayed specimens, draw random
# positions on both specimens so that the number of links is proportional to the
# number of similar positions the pair shares. One file is written per portion.
# NOTE: no random seed is set in this cell, so the drawn positions differ between runs.
sample_list = list(sample_line_ordered_list)  # display order of the fragments

cols = ['sample1', 'shared_pos1', 'sample2', 'shared_pos2']

portions = [0.01, 0.1, 0.05, 0.005]  # different portions of the connections to display

# melted_df only holds the non-zero upper-triangle entries, so every pair of
# specimens is stored in at most one orientation. Index the values once instead
# of filtering the whole frame inside the nested loops.
shared_pos_by_pair = {
    (first, second): shared
    for first, second, shared in zip(
        melted_df['sample_name'], melted_df['sample2'], melted_df['shared_pos']
    )
}

for portion_of_pos_to_keep in portions:  # one output file per portion
    # Reset for every portion: otherwise each file would also contain the
    # connections generated for all previously processed portions.
    frames = []

    for sample in sample_list:
        for sample2 in sample_list:
            if sample == sample2:
                continue  # no self-connections

            shared_pos = shared_pos_by_pair.get((sample, sample2))
            if shared_pos is None:
                # The pair is stored in the opposite orientation only (or shares
                # no positions); converting the empty selection to int would raise.
                continue

            # keeping only a portion of the shared positions
            pos_number = int(int(shared_pos) * portion_of_pos_to_keep)
            if pos_number <= 0:
                continue  # nothing to draw for this pair at this portion

            # random positions on each specimen, for visual representation only
            random_pos_sample = np.random.randint(0, high=max_pos_per_sample_df[sample], size=pos_number)
            random_pos_sample2 = np.random.randint(0, high=max_pos_per_sample_df[sample2], size=pos_number)

            frames.append(pd.DataFrame(
                {'sample1': [sample] * pos_number,
                 'shared_pos1': random_pos_sample,
                 'sample2': [sample2] * pos_number,
                 'shared_pos2': random_pos_sample2,
                },
                columns=cols))

    # pd.concat raises on an empty list, so fall back to a header-only table.
    rand_pos_df = pd.concat(frames) if frames else pd.DataFrame(columns=cols)
    rand_pos_df.to_csv(
        folder_join(working_dir, "circa_relations_file_ind_positions_no_outgroup.{}.tsv".format(portion_of_pos_to_keep)),
        sep='\t', header=True, index=False)

The files are now ready to be used with [Circa](http://omgenomics.com/circa/).