In [None]:
from sqlalchemy import create_engine
import pandas as pd
# Database handle for the 2018 version of the data (schema `tcrd540` on
# tcrd.kmc.io); the queries that follow are issued through `engine`.
engine = create_engine(
    "mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd540"
)

In [None]:
# Read protein identifiers (id, uniprot, name, sym) together with the `tdl`
# of the target each protein is linked to through the `t2tc` table, sorted by
# protein id. The query runs against whatever database `engine` is bound to
# when this cell executes.
dfprotein = pd.read_sql_query(
    sql="""
  select 
  protein.id,
  protein.uniprot,
  protein.name,
  protein.sym,
  target.tdl
from
  protein
  join t2tc on protein.id=t2tc.protein_id
  join target on t2tc.target_id=target.id
order by
  protein.id
 """,
    con=engine,
)

In [None]:
# NOTE: this rebinds the notebook-wide name `engine`. Any cell that uses
# `engine` and is (re-)run after this one will query `tcrd6134pharos2`
# instead of the database the name was bound to earlier in the notebook.
engine = create_engine(
    "mysql+pymysql://tcrd@tcrd.kmc.io:3306/tcrd6134pharos2"
)  # latest version

# Rows of `ncats_ppi` whose `ppitypes` is 'STRINGDB', restricted by an inner
# join to proteins that have an entry in `t2tc`, sorted by protein id.
# Columns returned: protein_id, ppitypes, score, other_id.
dfppi = pd.read_sql_query(
    sql="""
select
  ncats_ppi.protein_id,
  ncats_ppi.ppitypes,
  ncats_ppi.score,
  ncats_ppi.other_id
from
ncats_ppi
  join t2tc on ncats_ppi.protein_id=t2tc.protein_id
WHERE ppitypes = 'STRINGDB'
order by
  protein_id

""",
    con=engine,
)
# Protein identifiers plus the linked target's `tdl`, read through the
# current `engine` (rebound to `tcrd6134pharos2` at the top of this cell).
# The query text is the same protein/t2tc/target join used for `dfprotein`.
dfprotein2020 = pd.read_sql_query(
    sql="""
  select 
  protein.id,
  protein.uniprot,
  protein.name,
  protein.sym,
  target.tdl
from
  protein
  join t2tc on protein.id=t2tc.protein_id
  join target on t2tc.target_id=target.id
order by
  protein.id
 """,
    con=engine,
)

# Bare expression: the notebook renders the frame as this cell's output.
dfprotein2020

In [None]:
# Data for the _ALL models.
test = pd.read_feather('v13data.feather')

# Every column after the first five gets its missing values replaced by 0;
# the first five columns are left exactly as loaded.
columns_to_process = test.columns[5:]
test[columns_to_process] = test.loc[:, columns_to_process].fillna(0)

# Keep only the rows whose `uniprot` value occurs in the "Intersection"
# column of the 1SVM result file.
svmdf = pd.read_csv("1svm_nontclins.csv")
nontclins = test[test["uniprot"].isin(svmdf["Intersection"])]

In [None]:
df = nontclins  # alias of the same frame object; not read again in this cell


def _cols_with_prefix(frame, prefix):
    """Return the column names of `frame` that begin with `prefix`, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


# Feature columns are grouped by the prefix of their column name.
ach_cols, lincs_cols, gtex_cols, mim_cols = (
    _cols_with_prefix(nontclins, prefix)
    for prefix in ('ACH', 'LINCS', 'GTEX', 'MIM')
)

# How many columns fall into each group.
ach_col_count, lincs_col_count, gtex_col_count, mim_col_count = map(
    len, (ach_cols, lincs_cols, gtex_cols, mim_cols)
)

# NOTE(review): the totals 1479 and 1290 below are literal text in the
# messages, not values computed from the data in this notebook -- confirm
# they still match before relying on the printed report.
print(
    "Total no. of proteins predicted to be non-Tclins by all the 1SVM models = 1479",
    "\nTotal no. of proteins predicted to be non-Tclins by all the 1SVM models and tracked back to 2018 Pharos= 1290",
    "\nNumber of features in each feature category for the 1290:",
    sep="\n",
)
print(
    f"ACH: {ach_col_count}",
    f"LINCS: {lincs_col_count}",
    f"GTEX: {gtex_col_count}",
    f"MIM: {mim_col_count}",
    sep="\n",
)

# Per-column totals over the rows of `nontclins`, one Series per group.
ach_sum, lincs_sum, gtex_sum, mim_sum = (
    nontclins[group].sum()
    for group in (ach_cols, lincs_cols, gtex_cols, mim_cols)
)

# Within each group, count the columns whose total is exactly zero.
ach_zero_sum_count, lincs_zero_sum_count, gtex_zero_sum_count, mim_zero_sum_count = (
    totals.eq(0).sum()
    for totals in (ach_sum, lincs_sum, gtex_sum, mim_sum)
)

# NOTE(review): this report prints the ACH group under the label "CCLE" and
# the MIM group under "DISEASES", whereas the count report above uses the raw
# prefixes -- presumably these are the source dataset names; confirm.
print("\nNumber of features with sum equal to zero in each feature category for the 1290:")
print(
    f"CCLE: {ach_zero_sum_count}",
    f"LINCS: {lincs_zero_sum_count}",
    f"GTEX: {gtex_zero_sum_count}",
    f"DISEASES: {mim_zero_sum_count}",
    sep="\n",
)

In [None]:
# Write the identifying columns of `nontclins` to CSV, without the row index.
nontclins.loc[:, ["sym", "uniprot", "tdl", "protein_id"]].to_csv(
    "nontclin1290.csv",
    index=False,
)