In [1]:
import requests as rq
import pandas as pd 
import numpy as np 

## Creating the Psicube dataframe

In [2]:
# Download the psiDR text file from pseudogene.org and load it into a DataFrame.
url = 'http://pseudogene.org/psicube/data/psidr/psiDR-HUMAN-April2014.txt'
r = rq.get(url, timeout=60)  # bounded wait so a stalled server cannot hang the notebook
r.raise_for_status()  # fail loudly on an HTTP error instead of saving an error page as data
# Save the payload as "pseudogene.txt" in the current directory. The context
# manager guarantees the file is flushed and closed before pandas reads it back.
with open('pseudogene.txt', 'wb') as fh:
    fh.write(r.content)
# Whitespace-delimited table whose column names are taken from row 16 (header=[16]).
# The separator is a raw string so that `\s` is not parsed as an (invalid) string escape.
df = pd.read_csv('pseudogene.txt', sep=r'\s+', engine='python', header=[16])

## Parsing Psicube for the resurrected pseudogenes

In [3]:
# Target rows are the ones whose activity feature string is exactly
# 'Tnx_1:Pol2_1:AC_1:TF_1'. .copy() gives an independent frame, so the column
# rewrite below does not touch df.
target = df.loc[df['Activity_features'] == 'Tnx_1:Pol2_1:AC_1:TF_1'].copy()
# Written before the version suffix is stripped, so this file keeps the IDs as downloaded.
target.to_csv('targetdata.txt', index=None, sep='\t')
# Remove a trailing ".<version>" from the parent transcript ID. Slicing off the
# last two characters only works for single-digit versions ("….12" would leave a
# dangling "." and an unversioned ID would lose two real characters), so strip the
# suffix by pattern instead. A bare "." placeholder still becomes an empty string.
target['Parent_transcript'] = (
    target['Parent_transcript']
    .astype(str)
    .str.replace(r'\.\d*$', '', regex=True)
)

## Mapping the PseudoPipe IDs and Protein Families

In [4]:
# Lookup tables read from local files:
#   PPipe.txt                     - whitespace-delimited table carrying the PseudoPipe ID
#   martquery_0618213934_745.txt  - comma-separated table carrying the Pfam IDs
#                                   (presumably a BioMart export - confirm)
# The separator is a raw string so that `\s` is not parsed as an (invalid) string escape.
ppipe = pd.read_csv('PPipe.txt', sep=r'\s+', engine='python', header=[0])
pfam = pd.read_csv('martquery_0618213934_745.txt', header=[0])

# Rename the join columns so they match the column names used in `target`.
ppipe = ppipe.rename(columns={"EnsemblID.version": "#Pseudogene_id"})
pfam = pfam.rename(columns={'Transcript stable ID': 'Parent_transcript'})

# Left joins keep every target row, including those with no match in a lookup table.
combined = pd.merge(target, ppipe, how='left', on="#Pseudogene_id")
combined = pd.merge(combined, pfam, on='Parent_transcript', how='left')

# Unmatched rows carry NaN after the left joins; label them 'N/A'.
# (regex=True was dropped: NaN is not a pattern, so the flag had no effect.)
combined = combined.replace(np.nan, 'N/A')
combined = combined.drop_duplicates()
combined.to_csv('combined_data.txt', index=None, sep='\t')
combined


Unnamed: 0,#Pseudogene_id,Biotype,Parent_gene,Parent_transcript,Activity_features,PseudoPipeID,EnsemblID,Gene stable ID,Gene stable ID version,Transcript stable ID version,Pfam ID,Protein stable ID,Protein stable ID version
0,ENST00000359901.3,DUP,.,,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,
1,ENST00000406724.1,DUP,.,,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,
2,ENST00000404404.1,DUP,.,,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,
3,ENST00000453917.1,PSSD,.,,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,
4,ENST00000448968.2,PSSD,.,,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000291220,ENST00000448968,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,ENST00000492994.1,PSSD,ENSG00000058262.5,ENST00000243253,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000302420,ENST00000492994,ENSG00000058262,ENSG00000058262.10,ENST00000243253.8,PF10559,ENSP00000243253,ENSP00000243253.3
137,ENST00000492994.1,PSSD,ENSG00000058262.5,ENST00000243253,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000302420,ENST00000492994,ENSG00000058262,ENSG00000058262.10,ENST00000243253.8,PF00344,ENSP00000243253,ENSP00000243253.3
138,ENST00000431295.1,PSSD,ENSG00000109475.12,ENST00000394668,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000296581,ENST00000431295,ENSG00000109475,ENSG00000109475.16,ENST00000394668.2,PF01199,ENSP00000378163,ENSP00000378163.2
139,ENST00000376334.3,DUP,ENSG00000109536.6,ENST00000226798,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000303876,ENST00000376334,ENSG00000109536,ENSG00000109536.12,ENST00000226798.9,PF06229,ENSP00000226798,ENSP00000226798.4


## Gathering the Protein Families

In [5]:
# Tally the rows of `combined` for every distinct 'Pfam ID' value and save the
# counts as a comma-separated file. Rows whose Pfam ID was filled with 'N/A'
# form their own group in this tally.
family_key = ['Pfam ID']
dups_family = combined.pivot_table(
    aggfunc='size',
    index=family_key,
)
dups_family.to_csv(path_or_buf='dups_family.txt', sep=',')

## Finding rows with the same protein family and parent

In [6]:
# Flag every row whose ('Protein stable ID', 'Pfam ID') pair occurs more than
# once in `combined`; keep=False marks all members of each repeated pair.
shares_protein_and_family = combined.duplicated(
    ['Protein stable ID', 'Pfam ID'],
    keep=False,
)
# Keep only the flagged rows, ordered by Pfam ID from highest to lowest.
dup_list = (
    combined[shares_protein_and_family]
    .sort_values(['Pfam ID'], ascending=False)
)
# Tab-separated export; the row index is written as the first column.
dup_list.to_csv('dup_list.txt', sep='\t')
dup_list

Unnamed: 0,#Pseudogene_id,Biotype,Parent_gene,Parent_transcript,Activity_features,PseudoPipeID,EnsemblID,Gene stable ID,Gene stable ID version,Transcript stable ID version,Pfam ID,Protein stable ID,Protein stable ID version
18,ENST00000465689.1,DUP,ENSG00000171551.7,ENST00000304546,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000298351,ENST00000465689,ENSG00000171551,ENSG00000171551.12,ENST00000304546.6,PF05649,ENSP00000302051,ENSP00000302051.1
116,ENST00000373592.2,DUP,ENSG00000171551.7,ENST00000304546,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000298352,ENST00000373592,ENSG00000171551,ENSG00000171551.12,ENST00000304546.6,PF05649,ENSP00000302051,ENSP00000302051.1
129,ENST00000439199.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000302552,ENST00000439199,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF03719,ENSP00000341885,ENSP00000341885.4
49,ENST00000456516.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000289968,ENST00000456516,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF03719,ENSP00000341885,ENSP00000341885.4
36,ENST00000438210.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000296292,ENST00000438210,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF03719,ENSP00000341885,ENSP00000341885.4
115,ENST00000373592.2,DUP,ENSG00000171551.7,ENST00000304546,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000298352,ENST00000373592,ENSG00000171551,ENSG00000171551.12,ENST00000304546.6,PF01431,ENSP00000302051,ENSP00000302051.1
17,ENST00000465689.1,DUP,ENSG00000171551.7,ENST00000304546,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000298351,ENST00000465689,ENSG00000171551,ENSG00000171551.12,ENST00000304546.6,PF01431,ENSP00000302051,ENSP00000302051.1
128,ENST00000439199.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000302552,ENST00000439199,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF00333,ENSP00000341885,ENSP00000341885.4
35,ENST00000438210.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000296292,ENST00000438210,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF00333,ENSP00000341885,ENSP00000341885.4
48,ENST00000456516.1,PSSD,ENSG00000140988.8,ENST00000343262,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000289968,ENST00000456516,ENSG00000140988,ENSG00000140988.16,ENST00000343262.9,PF00333,ENSP00000341885,ENSP00000341885.4
