In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load Non-Structural Variants
The "data" field is a flattened 8xN matrix, where each row corresponds to the count of one of (ATCGatgc) in the pileup at each position.

*TODO* Before I run this on the nosv dataset, I should sort it with bedtools, since I eventually want to train by holding out one chromosome.

In [2]:
# Read the non-SV pileups as a tab-separated table. Column names are supplied
# here, so the first line of the file is treated as data rather than a header.
non_sv = pd.read_csv(
    'data/pileups/all_nosv.bed',
    sep='\t',
    names=['chrom', 'start', 'end', 'data']
)

In [3]:
# Drop every row that has a missing value in any column. The frame is
# modified in place, so `non_sv` keeps referring to the same object.
non_sv.dropna(axis='index', how='any', inplace=True)

In [4]:
# Preview the first five rows of the cleaned frame.
non_sv.head(n=5)

Unnamed: 0,chrom,start,end,data
0,8,145625119,145625308,"0,1,0,0,0,5,0,0,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,..."
1,12,102108232,102108473,"0,0,54,0,0,0,22,0,55,0,0,0,21,0,0,0,0,55,0,0,0..."
2,19,55451706,55451884,"0,0,0,148,0,0,0,55,145,0,0,0,56,0,0,0,146,0,0,..."
3,8,87464738,87464958,"0,71,0,0,0,2,0,0,0,71,0,0,0,2,0,0,71,0,0,0,2,0..."
4,8,141407669,141407894,"0,0,86,0,0,0,13,0,0,0,87,0,0,0,13,0,0,0,0,91,0..."


### Convert to matrices

In [5]:
# Parse each comma-separated string into a float array, group the values in
# runs of 8, and transpose so that every matrix has 8 rows (one column per
# group of 8 values).
non_sv['data'] = non_sv['data'].apply(
    lambda flat: np.fromstring(flat, sep=',').reshape(-1, 8).transpose()
)

In [11]:
# Column index 2 of the first region's matrix: its 8 counts for that column.
non_sv['data'].iloc[0][:, 2]

array([0., 0., 0., 0., 6., 0., 0., 0.])

In [12]:
# Width (number of columns) of every region's matrix. Computed once and
# reused below instead of rebuilding the same list for each statistic.
non_sv_widths = [X.shape[1] for X in non_sv['data'].values]

# Summary of region widths, in order: (mean, std, median, max, min).
x = (np.mean(non_sv_widths),
     np.std(non_sv_widths),
     np.median(non_sv_widths),
     np.max(non_sv_widths),
     np.min(non_sv_widths))
print(x)
# These are mostly exon regions a few hundred positions wide.
# A few hundred are really long, and a few are really short.
# NOTE(review): an earlier version of this note said "around 265"; the output
# recorded with this notebook shows mean ~255 and median 228 - confirm.

(255.2103133903134, 101.57453648201017, 228.0, 500, 1)


## Load Deletions

In [14]:
# Read the deletion pileups as a tab-separated table. Column names are
# supplied here, so the first line of the file is treated as data rather
# than a header.
deletions = pd.read_csv(
    'data/pileups/all_del.bed',
    sep='\t',
    names=['chrom', 'start', 'end', 'genotype', 'data']
)

In [15]:
# Drop every row that has a missing value in any column. The frame is
# modified in place, so `deletions` keeps referring to the same object.
deletions.dropna(axis='index', how='any', inplace=True)

In [16]:
# Number of deletion rows left after dropping incomplete ones.
len(deletions.index)

54001

In [18]:
# Parse each comma-separated string into a float array, group the values in
# runs of 8, and transpose so that every matrix has 8 rows (one column per
# group of 8 values).
deletions['data'] = deletions['data'].apply(
    lambda flat: np.fromstring(flat, sep=',').reshape(-1, 8).transpose()
)

In [19]:
# Shape of the first deletion's matrix, as a quick sanity check of the parse.
deletions['data'].iloc[0].shape

(8, 272)

In [20]:
# Width (number of columns) of every deletion's matrix. Computed once and
# reused below instead of rebuilding the same list for each statistic.
deletion_widths = [X.shape[1] for X in deletions['data'].values]

# Summary of region widths, in order: (mean, std, median, max, min).
x = (np.mean(deletion_widths),
     np.std(deletion_widths),
     np.median(deletion_widths),
     np.max(deletion_widths),
     np.min(deletion_widths))
print(x)

(255.28358734097517, 124.94022845632237, 234.0, 500, 1)


## Load Duplications

In [22]:
# Read the duplication pileups as a tab-separated table. Column names are
# supplied here, so the first line of the file is treated as data rather
# than a header.
duplications = pd.read_csv(
    'data/pileups/all_dup.bed',
    sep='\t',
    names=['chrom', 'start', 'end', 'genotype', 'data']
)

In [23]:
# Drop every row that has a missing value in any column. The frame is
# modified in place, so `duplications` keeps referring to the same object.
duplications.dropna(axis='index', how='any', inplace=True)

In [24]:
# Number of duplication rows left after dropping incomplete ones.
len(duplications.index)

21362

In [25]:
# Parse each comma-separated string into a float array, group the values in
# runs of 8, and transpose so that every matrix has 8 rows (one column per
# group of 8 values).
duplications['data'] = duplications['data'].apply(
    lambda flat: np.fromstring(flat, sep=',').reshape(-1, 8).transpose()
)

In [27]:
# Shape of the first duplication's matrix, as a quick sanity check of the parse.
duplications['data'].iloc[0].shape

(8, 309)

In [28]:
# Width (number of columns) of every duplication's matrix. Computed once and
# reused below instead of rebuilding the same list for each statistic.
duplication_widths = [X.shape[1] for X in duplications['data'].values]

# Summary of region widths, in order: (mean, std, median, max, min).
x = (np.mean(duplication_widths),
     np.std(duplication_widths),
     np.median(duplication_widths),
     np.max(duplication_widths),
     np.min(duplication_widths))
print(x)

(263.62732890178825, 110.96169185164786, 235.0, 500, 2)


In [29]:
# Persist the three frames (with their parsed matrices) as pickles.
non_sv.to_pickle('data/non_sv.pkl')
deletions.to_pickle('data/deletions.pkl')
duplications.to_pickle('data/duplications.pkl')