<a href="https://colab.research.google.com/github/pilatova/prot-viz-data/blob/test/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Raw TSV of pairwise evolutionary distances, served from the "test" branch of the GitHub repo.
url = "https://raw.githubusercontent.com/pilatova/prot-viz-data/test/dist2.tsv"

# Tab-delimited table: row 0 supplies the column names and
# column 0 ("protein_id") becomes the row index.
dist = pd.read_csv(url, index_col=0, header=0, sep="\t")

# Preview the first rows to confirm the table parsed as expected.
dist.head()

Unnamed: 0_level_0,human-baboon,human-cow,human-dog,human-mouse,human-opossum,baboon-cow,baboon-dog,baboon-mouse,baboon-opossum,cow-dog,cow-mouse,cow-opossum,dog-mouse,dog-opossum,mouse-opossum
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A0A087WTH1,0.029031,0.154998,0.110481,0.196955,0.344118,0.144013,0.109375,0.232052,0.385447,0.138486,0.217262,0.389459,0.192162,0.368851,0.437878
A0A087WZ39,0.10367,0.377518,0.391924,0.578767,0.941801,0.453725,0.498146,0.613333,1.076725,0.40662,0.47447,0.844741,0.525629,0.926316,1.071503
A0A096LP49,0.160867,0.640212,0.633581,0.685461,2.314668,0.66786,0.642229,0.718148,2.22659,0.552318,0.822833,2.328829,0.77594,2.277628,2.473843
A0A0U1RRA0,0.016778,0.143482,0.109374,0.167911,0.5445,0.162434,0.128345,0.148527,0.536199,0.088828,0.183459,0.661912,0.089641,0.556196,0.551611
A0A126GWI2,0.061848,0.20957,0.176613,0.199606,0.276475,0.206727,0.171825,0.195499,0.269331,0.16194,0.212899,0.259762,0.185132,0.232008,0.252956


In [3]:
# ======================
# Basic data checks
# ======================

# 1. Missing values: tally the NaN cells in each column
print("Missing values per column:")
missing_per_column = dist.isna().sum()
display(missing_per_column)

Missing values per column:


Unnamed: 0,0
human-baboon,0
human-cow,0
human-dog,0
human-mouse,0
human-opossum,0
baboon-cow,0
baboon-dog,0
baboon-mouse,0
baboon-opossum,0
cow-dog,0


In [5]:
# 2. Column dtypes: list the dtype pandas inferred for each column
print("\nData types:")
column_dtypes = dist.dtypes
display(column_dtypes)


Data types:


Unnamed: 0,0
human-baboon,float64
human-cow,float64
human-dog,float64
human-mouse,float64
human-opossum,float64
baboon-cow,float64
baboon-dog,float64
baboon-mouse,float64
baboon-opossum,float64
cow-dog,float64


In [7]:
# 3. Summary statistics per column (distances are expected to be >= 0,
#    so a negative `min` in this table signals a problem)
print("\nValue ranges:")
display(dist.describe())

# 4. Negative values are invalid for evolutionary distances.
#    Build the boolean mask once and reuse it for both the yes/no flag
#    and the listing of the offending rows.
negative_mask = dist < 0
rows_with_negatives = negative_mask.any(axis=1)
negative_values = rows_with_negatives.any()
print(f"\nNegative values present? {negative_values}")
print("\nRows containing negative values:")
display(dist[rows_with_negatives])

# 5. Protein IDs form the index and should not repeat
print(f"\nUnique protein IDs? {dist.index.is_unique}")

# 6. Table dimensions
print(f"\nData shape: {dist.shape} (rows x columns)")


Value ranges:


Unnamed: 0,human-baboon,human-cow,human-dog,human-mouse,human-opossum,baboon-cow,baboon-dog,baboon-mouse,baboon-opossum,cow-dog,cow-mouse,cow-opossum,dog-mouse,dog-opossum,mouse-opossum
count,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0,13021.0
mean,0.038382,0.129694,0.123894,0.157894,0.285598,0.14157,0.136912,0.169103,0.298921,0.128231,0.181213,0.299111,0.177729,0.298082,0.317772
std,0.069838,0.129373,0.126826,0.151289,0.255533,0.147768,0.145471,0.165949,0.270132,0.137583,0.17514,0.270242,0.171626,0.269467,0.283351
min,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,-1.0,1e-05
25%,0.007703,0.039579,0.035248,0.050342,0.106073,0.042359,0.03808,0.053805,0.110401,0.036623,0.060014,0.112332,0.057303,0.109874,0.119185
50%,0.021102,0.093793,0.086754,0.115105,0.217207,0.100132,0.094019,0.123021,0.226567,0.089095,0.132659,0.226829,0.128338,0.225141,0.240641
75%,0.044141,0.178643,0.170895,0.216196,0.38858,0.191181,0.18544,0.230195,0.40465,0.173883,0.246104,0.404839,0.24341,0.403087,0.428674
max,1.797991,2.155932,1.328868,1.683248,2.782719,2.441345,2.040214,3.030621,3.452648,2.477352,2.73527,3.005446,1.830683,3.079467,3.075986



Negative values present? True

Rows containing negative values:


Unnamed: 0_level_0,human-baboon,human-cow,human-dog,human-mouse,human-opossum,baboon-cow,baboon-dog,baboon-mouse,baboon-opossum,cow-dog,cow-mouse,cow-opossum,dog-mouse,dog-opossum,mouse-opossum
protein_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Q8IYW2,0.189938,0.426894,0.205668,0.475243,0.675513,0.743782,0.190153,0.451264,1.776299,0.451167,0.529828,0.739962,0.263684,-1.0,0.727156



Unique protein IDs? True

Data shape: (13021, 15) (rows x columns)


In [8]:
# Largest distance in the whole table, printed so that implausibly large
# values (e.g., >10) can be spotted by eye
column_maxima = dist.max()
max_value = column_maxima.max()
print(f"Maximum distance value: {max_value}")

# Exact zeros anywhere in the table (if they should not exist between species)
zero_values = dist.eq(0).any().any()
print(f"Zero values present? {zero_values}")

Maximum distance value: 3.452648
Zero values present? False
