In [10]:
import requests as rq
import pandas as pd 
import numpy as np 
from functools import reduce

## Creating the Psicube dataframe

In [11]:
# Download the psiDR table named in `url`, keep a local copy, and load it as `df`.
url = 'http://pseudogene.org/psicube/data/psidr/psiDR-HUMAN-April2014.txt'
r = rq.get(url, timeout=60)  # bound the wait so a stalled server cannot hang the notebook
r.raise_for_status()  # fail here on an HTTP error instead of saving an error page as data
# Save the raw bytes as "pseudogene.txt" in the current directory. The context
# manager closes (and therefore flushes) the file before pandas reads it back.
with open('pseudogene.txt', 'wb') as fh:
    fh.write(r.content)
# Columns are separated by runs of whitespace (a regex separator, hence the raw
# string and the python engine). header=[16] takes the column names from row
# index 16 and discards the lines above it.
df = pd.read_csv('pseudogene.txt', sep=r'\s+', engine='python', header=[16])

## Parsing Psicube for the resurrected pseudogenes

In [12]:
# Select the rows whose Activity_features string is exactly
# 'Tnx_1:Pol2_1:AC_1:TF_1'; .copy() detaches the selection from `df`.
target = df[df['Activity_features'].eq('Tnx_1:Pol2_1:AC_1:TF_1')].copy()
# Export the selection as tab-separated text, without the row index.
target.to_csv('targetdata.txt', sep='\t', index=None)

## Adding PseudoPipe ID and protein family columns

In [14]:
# Table carrying the PseudoPipe IDs. Its "EnsemblID.version" column is renamed
# to "#Pseudogene_id" so it shares a key name with `target`.
ppipe = pd.read_csv('PPipe.txt', sep=r'\s+', engine='python', header=[0])
ppipe = ppipe.rename(columns={"EnsemblID.version": "#Pseudogene_id"})

# Table of IDs and their families. Its "id" column is renamed to "PseudoPipeID"
# so it can be joined onto the PseudoPipe IDs brought in from `ppipe`.
pfamily = pd.read_csv('9606_pseudogenes.txt', sep=r'\s+', engine='python', header=[0])
pfamily = pfamily.rename(columns={"id": "PseudoPipeID"})

# Left joins keep every row of `target`, even when no match is found.
combined = pd.merge(target, ppipe, how='left', on="#Pseudogene_id")
combined = pd.merge(combined, pfamily, on='PseudoPipeID', how='left')
combined = combined.drop(['EnsemblID'], axis=1)

# The file is written BEFORE the missing-value substitution below, so
# unmatched fields are saved as empty strings rather than the text 'N/A'.
combined.to_csv('combined_data.txt', index=False, sep='\t')

# Show 'N/A' for missing values. fillna targets NaN directly; the previous
# replace(np.nan, ..., regex=True) passed a non-string pattern with regex=True.
combined = combined.fillna('N/A')
combined.head()

Unnamed: 0,#Pseudogene_id,Biotype,Parent_gene,Parent_transcript,Activity_features,PseudoPipeID,chr,start,end,strand,parent,fraction,identity,e-value,class,families
0,ENST00000359901.3,DUP,.,.,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,,,,
1,ENST00000406724.1,DUP,.,.,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,,,,
2,ENST00000404404.1,DUP,.,.,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,,,,
3,ENST00000453917.1,PSSD,.,.,Tnx_1:Pol2_1:AC_1:TF_1,,,,,,,,,,,
4,ENST00000448968.2,PSSD,.,.,Tnx_1:Pol2_1:AC_1:TF_1,PGOHUM00000291220,,,,,,,,,,
