In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.gridspec as gridspec
from matplotlib import patches
import seaborn as sn
from scipy.spatial.distance import pdist
from sklearn.manifold import TSNE
from scipy.stats import friedmanchisquare, kruskal
from sklearn.cluster import k_means
from sklearn.preprocessing import normalize 
#from fancyimpute import KNN
import itertools
#from adjustText import adjust_text
import multiprocessing as mp
from numpy import savetxt
from numpy import loadtxt

## Load in data


In [2]:
%%time
# Read the raw barcode count table from CSV (slow: see the %%time output below).
# genfromtxt parses the file; astype(np.float64) then returns a float64 copy.
barcodematrix = np.genfromtxt(
    r'/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/C2_raw.csv',
    delimiter=',',
).astype(np.float64)
print(barcodematrix.shape)

(242575, 91)
CPU times: user 19.8 s, sys: 1.08 s, total: 20.9 s
Wall time: 25.2 s


In [3]:
# Working copy of the loaded table, so the slow load cell does not have to be
# re-run if this copy is modified. NOTE: this cell still needs `barcodematrix`
# from the load cell to exist in the kernel.
rawbarcodematrix = np.array(barcodematrix, dtype=np.float64, copy=True)
print(rawbarcodematrix.shape)

(242575, 91)


## Preprocessing

## 1) Organize into separate animals for processing

In [4]:
## Column indices for each animal, following the sample key.
# Within every animal the columns are ordered:
# PFC, NAc, LS, DLS, BNST, LH, BA, CeA, vCA1
# Animal k (1-10) takes the eight consecutive columns starting at 8*(k-1),
# followed by the single column at index 79+k (the last, vCA1, slot).
(i1, i2, i3, i4, i5,
 i6, i7, i8, i9, i10) = (
    list(range(8 * k, 8 * k + 8)) + [80 + k] for k in range(10)
)

## One count matrix per animal. np.array(..., dtype=np.float64) makes sure
## every per-animal matrix is a freshly created float64 array.
(a1, a2, a3, a4, a5,
 a6, a7, a8, a9, a10) = (
    np.array(rawbarcodematrix[:, cols], dtype=np.float64)
    for cols in (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10)
)

# Show the shape of each per-animal matrix, animal 1 first.
for _counts in (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10):
    print(_counts.shape)

(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)
(242575, 9)


## 2) Cleaning 

In [5]:
#remove empty barcoded cells
def clean_up_zeros():
    """Drop all-zero rows from the notebook-level arrays ``a1`` ... ``a10``.

    Reads the ten global arrays, and rebinds each global name to a new array
    that holds only the rows containing at least one non-zero value.
    Takes no arguments and returns ``None``; it affects no other variables.
    """
    global a1, a2, a3, a4, a5, a6, a7, a8, a9, a10

    def _nonzero_rows(counts):
        # np.any(..., axis=1) is True for rows with at least one non-zero entry.
        return counts[np.any(counts, axis=1)]

    a1, a2, a3, a4, a5, a6, a7, a8, a9, a10 = map(
        _nonzero_rows, (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)
    )
clean_up_zeros()


# Drop barcodes whose first eight columns (indices 0-7, the target sites in the
# column order defined above) are all zero: they carry no projection signal.
(a1, a2, a3, a4, a5,
 a6, a7, a8, a9, a10) = (
    counts[np.any(counts[:, 0:8], axis=1)]
    for counts in (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)
)


# Per-animal shape after cleaning.
for _animal_id, _counts in enumerate(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)

# Total # of Cells
# NOTE(review): animal 3 (a3) is not part of this sum; the later cell that
# concatenates the animals also leaves it out - confirm the exclusion is intended.
print(sum(counts.shape[0] for counts in (a1, a2, a4, a5, a6, a7, a8, a9, a10)))

Animal 1 (47, 9)
Animal 2 (1023, 9)
Animal 3 (4150, 9)
Animal 4 (3104, 9)
Animal 5 (626, 9)
Animal 6 (1640, 9)
Animal 7 (3153, 9)
Animal 8 (3301, 9)
Animal 9 (3210, 9)
Animal 10 (3148, 9)
19252


In [6]:
# Drop barcodes that have no count at the source site (column 8, vCA1).
# rorphN is a boolean mask over animal N's rows: True where the source count
# is below 1. Those rows are discarded, the rest are kept.
(rorph1, rorph2, rorph3, rorph4, rorph5,
 rorph6, rorph7, rorph8, rorph9, rorph10) = (
    counts[:, 8] < 1
    for counts in (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)
)
(a1, a2, a3, a4, a5,
 a6, a7, a8, a9, a10) = (
    counts[~orphan, :]
    for counts, orphan in zip(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10),
        (rorph1, rorph2, rorph3, rorph4, rorph5,
         rorph6, rorph7, rorph8, rorph9, rorph10),
    )
)


# Per-animal shape after the source filter.
for _animal_id, _counts in enumerate(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)

# Total # of Cells
# NOTE(review): a3 is not included in this sum - confirm that is intended.
print(sum(counts.shape[0] for counts in (a1, a2, a4, a5, a6, a7, a8, a9, a10)))

Animal 1 (0, 9)
Animal 2 (150, 9)
Animal 3 (2272, 9)
Animal 4 (237, 9)
Animal 5 (23, 9)
Animal 6 (652, 9)
Animal 7 (1756, 9)
Animal 8 (1090, 9)
Animal 9 (30, 9)
Animal 10 (1869, 9)
5807


In [7]:
# Drop every barcode with a count in the negative-control target (column 3, DLS).
# dN is a boolean mask over animal N's rows: True where the DLS count is above 0.
(d1, d2, d3, d4, d5,
 d6, d7, d8, d9, d10) = (
    counts[:, 3] > 0
    for counts in (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)
)
(a1, a2, a3, a4, a5,
 a6, a7, a8, a9, a10) = (
    counts[~in_dls]
    for counts, in_dls in zip(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10),
        (d1, d2, d3, d4, d5, d6, d7, d8, d9, d10),
    )
)


# Per-animal shape after the DLS filter.
for _animal_id, _counts in enumerate(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)

# Total # of Cells
# NOTE(review): a3 is not included in this sum - confirm that is intended.
print(sum(counts.shape[0] for counts in (a1, a2, a4, a5, a6, a7, a8, a9, a10)))

Animal 1 (0, 9)
Animal 2 (149, 9)
Animal 3 (2259, 9)
Animal 4 (235, 9)
Animal 5 (22, 9)
Animal 6 (649, 9)
Animal 7 (1748, 9)
Animal 8 (1082, 9)
Animal 9 (30, 9)
Animal 10 (1865, 9)
5780


## Filter Thresholds

In [8]:
## Magnitude threshold: compare each barcode's largest target count
## (columns 0-7) with its source count (column 8).
mag = 1/10

# filterN is True where max(target counts) / source count is greater than `mag`.
# caN keeps only the rows where filterN is False.
(filter1, filter2, filter3, filter4, filter5,
 filter6, filter7, filter8, filter9, filter10) = (
    (np.amax(counts[:, 0:8], axis=1) / counts[:, 8]) > mag
    for counts in (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)
)
(ca1, ca2, ca3, ca4, ca5,
 ca6, ca7, ca8, ca9, ca10) = (
    counts[~above_ratio]
    for counts, above_ratio in zip(
        (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10),
        (filter1, filter2, filter3, filter4, filter5,
         filter6, filter7, filter8, filter9, filter10),
    )
)



# Per-animal shape after the magnitude filter.
for _animal_id, _counts in enumerate(
        (ca1, ca2, ca3, ca4, ca5, ca6, ca7, ca8, ca9, ca10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)


# Total # of Cells
# NOTE(review): ca3 is not included in this sum - confirm that is intended.
print(sum(counts.shape[0]
          for counts in (ca1, ca2, ca4, ca5, ca6, ca7, ca8, ca9, ca10)))

Animal 1 (0, 9)
Animal 2 (45, 9)
Animal 3 (649, 9)
Animal 4 (3, 9)
Animal 5 (2, 9)
Animal 6 (67, 9)
Animal 7 (941, 9)
Animal 8 (547, 9)
Animal 9 (4, 9)
Animal 10 (411, 9)
2020


In [9]:
## Minimum-count threshold on the targets (columns 0-7).
minim2 = 1

# filter0N is True where the largest target count of a barcode is below `minim2`.
# caaN keeps only the rows where filter0N is False.
(filter01, filter02, filter03, filter04, filter05,
 filter06, filter07, filter08, filter09, filter010) = (
    np.amax(counts[:, 0:8], axis=1) < minim2
    for counts in (ca1, ca2, ca3, ca4, ca5, ca6, ca7, ca8, ca9, ca10)
)
(caa1, caa2, caa3, caa4, caa5,
 caa6, caa7, caa8, caa9, caa10) = (
    counts[~too_weak]
    for counts, too_weak in zip(
        (ca1, ca2, ca3, ca4, ca5, ca6, ca7, ca8, ca9, ca10),
        (filter01, filter02, filter03, filter04, filter05,
         filter06, filter07, filter08, filter09, filter010),
    )
)


# Per-animal shape after the minimum-count filter.
for _animal_id, _counts in enumerate(
        (caa1, caa2, caa3, caa4, caa5, caa6, caa7, caa8, caa9, caa10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)

# Total number of barcodes kept.
# NOTE(review): caa3 is not included in this sum - confirm that is intended.
print(sum(counts.shape[0]
          for counts in (caa1, caa2, caa4, caa5, caa6, caa7, caa8, caa9, caa10)))

Animal 1 (0, 9)
Animal 2 (45, 9)
Animal 3 (649, 9)
Animal 4 (3, 9)
Animal 5 (2, 9)
Animal 6 (67, 9)
Animal 7 (941, 9)
Animal 8 (547, 9)
Animal 9 (4, 9)
Animal 10 (411, 9)
2020


## Load in spike-in CSV for normalization

In [10]:
# Load the spike-in values used for normalization. The CSV has no header row;
# the table is converted straight to a float64 NumPy array.
spikes = pd.read_csv(
    r'/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/C2_spikes.csv',
    delimiter=',',
    header=None,
).to_numpy(dtype=np.float64, copy=True)
print(spikes.shape)

(91, 1)


In [11]:
## Spike-in normalization
# Select each animal's spike-in values with the same column indices (i1..i10)
# that were used to build its count matrix, so the column order matches.
(spikes1, spikes2, spikes3, spikes4, spikes5,
 spikes6, spikes7, spikes8, spikes9, spikes10) = (
    np.array(spikes[cols, :])
    for cols in (i1, i2, i3, i4, i5, i6, i7, i8, i9, i10)
)

# Divide every column of the filtered counts by that column's spike-in value.
# spikesN is a column vector (one row per selected sample); transposing it
# gives a single row that broadcasts across all barcodes of the animal.
(na1, na2, na3, na4, na5,
 na6, na7, na8, na9, na10) = (
    counts / spike.T
    for counts, spike in zip(
        (caa1, caa2, caa3, caa4, caa5, caa6, caa7, caa8, caa9, caa10),
        (spikes1, spikes2, spikes3, spikes4, spikes5,
         spikes6, spikes7, spikes8, spikes9, spikes10),
    )
)

#keep floating
(f_a1, f_a2, f_a3, f_a4, f_a5,
 f_a6, f_a7, f_a8, f_a9, f_a10) = (
    np.array(normalized, dtype=np.float64)
    for normalized in (na1, na2, na3, na4, na5, na6, na7, na8, na9, na10)
)


# The original cell called clean_up_zeros() at this point. That helper only
# rebinds the globals a1..a10, so it never touched the f_a* arrays reported
# below; the call was a hidden side effect with no bearing on this cell and
# has been removed.
for _animal_id, _counts in enumerate(
        (f_a1, f_a2, f_a3, f_a4, f_a5, f_a6, f_a7, f_a8, f_a9, f_a10), start=1):
    print("Animal %d" % _animal_id, _counts.shape)

## Total # of Cells
# NOTE(review): f_a3 is not included in this sum - confirm that is intended.
print(sum(counts.shape[0]
          for counts in (f_a1, f_a2, f_a4, f_a5, f_a6, f_a7, f_a8, f_a9, f_a10)))

Animal 1 (0, 9)
Animal 2 (45, 9)
Animal 3 (649, 9)
Animal 4 (3, 9)
Animal 5 (2, 9)
Animal 6 (67, 9)
Animal 7 (941, 9)
Animal 8 (547, 9)
Animal 9 (4, 9)
Animal 10 (411, 9)
2020


In [12]:
# Spot check of the spike-in division for one entry (animal 8, row 459, column 4):
# count, spike-in value, manual quotient, then the stored normalized values.
# NOTE(review): the count is read from ca8, while na8/f_a8 were derived from
# caa8; the rows only line up because the minimum-count filter removed no rows
# from animal 8 (both shapes print as 547 rows above).
print(
    ca8[459, 4],
    spikes8[4],
    ca8[459, 4] / spikes8[4],
    na8[459, 4],
    f_a8[459, 4],
    sep="\n",
)

1.0
[8.]
[0.125]
0.125
0.125


In [13]:
# Stack the normalized per-animal matrices row-wise into one matrix.
# NOTE(review): f_a3 is not in this list, so animal 3 does not contribute to
# `matrix` - confirm the exclusion is intended.
# astype(np.float64) returns a float64 copy of the stacked result.
matrix = np.concatenate(
    [f_a1, f_a2, f_a4, f_a5, f_a6, f_a7, f_a8, f_a9, f_a10],
    axis=0,
).astype(np.float64)
print(matrix.shape)

(2020, 9)


In [14]:
## Keep only the barcodes (rows) whose DLS value (column 3) is exactly zero
nondlsers = np.nonzero(matrix[:, 3] == 0)
datas = matrix[nondlsers]
# Remove the DLS column itself (index 3).
ndata = np.array(np.delete(datas, 3, axis=1))
# With DLS gone, the last column (originally index 8, vCA1 in the column order
# defined earlier) now sits at index 7; remove it so only target columns remain.
fdata = np.array(np.delete(ndata, 7, axis=1))
print(fdata.shape)

(2020, 7)


In [16]:
## Load the first cohort's matrix and report both shapes before merging
cohort1 = loadtxt(
    r'/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/C1_fdata.csv',
    delimiter=',',
)
print(cohort1.shape, fdata.shape, sep="\n")

# Stack row-wise: cohort 1 rows first, then the rows of the current data.
ffdata = np.concatenate([cohort1, fdata], axis=0)
# float64 copy of the combined matrix.
finaldata = ffdata.astype(np.float64)

print(finaldata.shape)

(474, 7)
(2020, 7)
(2494, 7)


In [17]:
## Scale every barcode (row) by its own maximum, so the largest value in each
## row becomes 1. This is max-normalization, NOT "sum to 1"; the L1 alternative
## would be normalize(finaldata, axis=1, norm='l1').
# keepdims=True keeps the per-row maxima as a column, so the division
# broadcasts across each row. The result is a new array; finaldata is untouched.
# A row whose maximum is 0 yields NaN, exactly as the per-row loop it replaces.
# NOTE: this rebinds `d1`, which an earlier cell used for a boolean DLS mask.
d1 = finaldata / np.amax(finaldata, axis=1, keepdims=True)


print(d1.shape)
print(np.amax(d1))

(2494, 7)
1.0


In [20]:
## Wrap d1 in a DataFrame with one named column per target
df = pd.DataFrame(d1, columns=["PFC", "NAc", "LS","BNST", "LH", "BA", "CeA"])
# Write d1 as CSV to the working directory and to the shared data folder.
# NOTE(review): this was labelled "raw barcode counts post spikein
# normalization", but d1 is assigned by the row-max scaling cell - confirm
# which matrix is meant to be exported.
for _csv_path in (
        'finalmatrix.csv',
        '/Users/markgergues/Box Sync/MAKheirbekLab/MAPseq/data/finalmatrix.csv'):
    savetxt(_csv_path, d1, delimiter=',')
print(df.shape)
print(df)

(2494, 7)
           PFC       NAc   LS      BNST        LH        BA       CeA
0     0.000000  0.000000  0.0  0.000000  1.000000  0.000000  0.000000
1     1.000000  0.544036  0.0  0.172723  0.291750  0.128556  0.327370
2     1.000000  0.000000  0.0  0.000000  0.000000  0.000000  0.000000
3     0.000000  0.000000  0.0  0.000000  1.000000  0.881279  0.748062
4     0.565574  1.000000  0.0  0.000000  0.000000  0.945205  0.000000
...        ...       ...  ...       ...       ...       ...       ...
2489  0.000000  0.000000  1.0  0.000000  0.000000  0.000000  0.000000
2490  0.000000  0.000000  1.0  0.000000  0.000000  0.000000  0.000000
2491  0.000000  0.000000  1.0  0.000000  0.056818  0.000000  0.000000
2492  0.000000  0.000000  1.0  0.000000  0.000000  0.000000  0.000000
2493  0.000000  0.000000  1.0  0.000000  0.000000  0.000000  0.000000

[2494 rows x 7 columns]


In [19]:
# Largest value anywhere in d1.
print(d1.max())

1.0


## END OF PREPROCESSING DATA