# Processing the RPSBlastP results

In [None]:
# Run the shared setup script and load the names it defines into this session.
# NOTE(review): later cells use os, gzip, pandas, SeqIO, plt, GridSpec, DATA,
# RESULTS and DATASET without defining them; presumably init.py provides them. Confirm.
%run ../config/init.py

In [None]:
# Downloading CDD database
if not os.path.exists(os.path.join(DATA, 'family_superfamily_links')):
    os.chdir(DATA)
    !wget https://ftp.ncbi.nih.gov/pub/mmdb/cdd/family_superfamily_links
families = pandas.read_csv(os.path.join(DATA, 'family_superfamily_links'), sep='\t', header=None)
families = families.rename(columns={0:'family', 1:'CDDID', 2:'superfamily', 3:'superCDDID'})
families.head()

In [None]:
# Collect the id and sequence length of every record in the Trinity assembly.
TRANSCRIPTOME_FILE = os.path.join(RESULTS, DATASET, 'trinity_assembly', 'Trinity.fasta.gz')
with gzip.open(TRANSCRIPTOME_FILE, "rt") as handle:
    data = [[rec.id, len(rec.seq)] for rec in SeqIO.parse(handle, "fasta")]
# Columns keep pandas' default integer labels: 0 = record id, 1 = length.
trans_len = pandas.DataFrame(data)

### Loading sample list from GCP operations

In [None]:
# Sample names are the 'sample' column of the tab-separated GCP operations table.
operations_file = os.path.join(RESULTS, DATASET, 'annotation', 'gcp', 'operations.tsv')
samples = pandas.read_csv(operations_file, sep='\t')['sample']
# One progress-bar slot per sample.
bar_length = len(samples)

### Loading RPS-BLAST results (rpsblast and rpstblastn)

In [None]:
%%time

rps = pandas.DataFrame()
data = []
total = 0
j = 1
for s in samples:
    fasta = {}
    with gzip.open(os.path.join(RESULTS, DATASET, 'annotation', 'blasts', s, s + '_nocont_transdecoder.fsa.gz'), "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            f = record.id.split('.')
            c = fasta.setdefault(f[0], {})
            c[f[1]] = { 'strand':record.description[-2:-1]}
    d = pandas.read_csv(os.path.join(RESULTS, DATASET, 'annotation', 'blasts', s, s + '_nocont_rpsblast.tsv.gz'), sep='\t', header=None)
    new = d[0].str.split(".", n = 1, expand = True)
    d[0] = new[0]
    d[6] = new[1]
    new = d[2].str.split(":", n = 1, expand = True)
    d[2] = new[1]
    d = d.rename(columns={2:'C'})
    d = d.assign(C=d.C.astype(int))
    d = d.rename(columns={'C':2})
    d = d.drop_duplicates()
    total += len(d)
    for k, v in fasta.items():
        if len(v) > 1:
            minus = 0
            minus_l = []
            plus = 0
            plus_l = []
            for i, r in d[d[0] == k][[0,6]].groupby(6).count().reset_index().iterrows():
                fasta[k][r[6]]['hits'] = r[0]
                if fasta[k][r[6]]['strand'] == '+':
                    plus += r[0]
                    plus_l.append(r[6])
                else:
                    minus += r[0]
                    minus_l.append(r[6])
            if plus > minus and minus > 0:
                for p in minus_l:
                    d = d.drop(d[(d[0] == k) & (d[6] == p)].index)
            elif minus > plus and plus > 0:
                for p in plus_l:
                    d = d.drop(d[(d[0] == k) & (d[6] == p)].index)
    
    dt = pandas.read_csv(os.path.join(RESULTS, DATASET, 'annotation', 'blasts', s, s + '_nocont_rpstblastn.tsv.gz'), sep='\t', header=None)
    new = dt[2].str.split(":", n = 1, expand = True)
    dt[2] = new[1]
    dt = dt.rename(columns={2:'C'})
    dt = dt.assign(C=dt.C.astype(int))
    dt = dt.rename(columns={'C':2})
    dt[6] = ''
    rps = pandas.concat([rps, d, dt])   
    text = "{0:8} [{1}] {2}/{3}".format(s, "#" * j + "-" * (bar_length - j), len(rps), total)
    print(text, end='\r')
    j += 1
    del d
print('\n')
rps = rps.merge(families, left_on=2, right_on='CDDID')[[0,'family', 'CDDID', 'superfamily', 'superCDDID', 3,4,5, 6]]
rps.to_csv(os.path.join(RESULTS, DATASET, 'cdd-rps.tsv.gz'), header=None, sep='\t', index=None, compression='gzip')
rps.head()




In [None]:
# Reduce the merged table to one row per (transcript, CDDID, superCDDID) triple.
cdd_columns = [0, 'CDDID', 'superCDDID', 4, 6]
df = (
    rps[cdd_columns]
    .sort_values(by=[0, 'superCDDID', 6])
    # rows are ordered by column 6 within each triple, so 'last' keeps the greatest one
    .drop_duplicates(subset=[0, 'CDDID', 'superCDDID'], keep='last')
)
df.to_csv(os.path.join(RESULTS, DATASET, 'cdd.tsv.gz'), header=None, sep='\t', index=None, compression='gzip')


In [None]:
# Unique transcript/family pairs, limited to family names starting with 'cd'.
df1 = rps[[0, 'family']].drop_duplicates()
df1 = df1.loc[df1['family'].str.startswith('cd')]

n_pairs = len(df1)
print('{}/{}'.format(n_pairs, len(df)))
print('\n{} pair transcript CDD ID'.format(n_pairs))
print('{} transcripts CDD ID'.format(len(df1[0].unique())))
display(df1.head())

# Families per transcript, joined with the transcript lengths and restricted
# to transcripts of at most 4500 bases.
df4 = df1.groupby(0).count().reset_index()
df5 = (
    df4.merge(trans_len, on=0)
    .rename(columns={0: 'Transcript', 1: 'Length'})
    .sort_values('Length')
)
df5 = df5[df5['Length'] <= 4500]
display(df5.head())

# Layout: one panel across the whole top row, two histograms underneath.
fig = plt.figure(figsize=(16, 10), constrained_layout=True)
gs = GridSpec(2, 2, figure=fig)

ax0 = fig.add_subplot(gs[0, :])
ax0.bar(df5['Length'], height=df5['family'])
ax0.set(
    title='Barplot transcript length vs no. CDD Families',
    ylabel="No of CDD ID",
    xlabel="Transcript Length",
)

ax1 = fig.add_subplot(gs[1, 0])
n, bins, patches = ax1.hist(df5['family'], bins=100, facecolor='blue', alpha=0.5)
ax1.set(
    xlabel='No of CDD Families',
    ylabel='No of Transcripts',
    title='Histogram of CDD ID',
)

ax2 = fig.add_subplot(gs[1, 1])
n, bins, patches = ax2.hist(df5['Length'], bins=100, facecolor='blue', alpha=0.5)
ax2.set(xlabel='Transcript Length', ylabel='')
ax2.set_title('Histogram of Transcript Length')

In [None]:
# Unique transcript/superfamily pairs, limited to superfamily names starting with 'cl'.
df1 = rps[[0, 'superfamily']].drop_duplicates()
df1 = df1.loc[df1['superfamily'].str.startswith('cl')]

n_pairs = len(df1)
print('{}/{}'.format(n_pairs, len(df)))
print('\n{} pair transcript CDD ID'.format(n_pairs))
print('{} transcripts CDD ID'.format(len(df1[0].unique())))
display(df1.head())

# Superfamilies per transcript, joined with the transcript lengths and
# restricted to transcripts of at most 4500 bases.
df4 = df1.groupby(0).count().reset_index()
df5 = (
    df4.merge(trans_len, on=0)
    .rename(columns={0: 'Transcript', 1: 'Length'})
    .sort_values('Length')
)
df5 = df5[df5['Length'] <= 4500]
display(df5.head())

# Layout: one panel across the whole top row, two histograms underneath.
fig = plt.figure(figsize=(16, 10), constrained_layout=True)
gs = GridSpec(2, 2, figure=fig)

ax0 = fig.add_subplot(gs[0, :])
ax0.bar(df5['Length'], height=df5['superfamily'])
ax0.set(
    title='Barplot transcript length vs no. CDD Superfamilies',
    ylabel="No of CDD ID",
    xlabel="Transcript Length",
)

ax1 = fig.add_subplot(gs[1, 0])
n, bins, patches = ax1.hist(df5['superfamily'], bins=100, facecolor='blue', alpha=0.5)
ax1.set(
    xlabel='No of CDD Superfamilies',
    ylabel='No of Transcripts',
    title='Histogram of CDD ID',
)

ax2 = fig.add_subplot(gs[1, 1])
n, bins, patches = ax2.hist(df5['Length'], bins=100, facecolor='blue', alpha=0.5)
ax2.set(xlabel='Transcript Length', ylabel='')
ax2.set_title('Histogram of Transcript Length')
