In [1]:
import os
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from sklearn.linear_model import LogisticRegression

In [2]:
# returns the row/cell of column 'target_col' where for that row in column 'search_col' the value is 'row_val' 
def fval(self, search_col, row_val, target_col = None):
    if target_col == None:
        target_col = slice(None)
    if row_val == None:
        return self.loc[self.loc[:, search_col].isnull(), target_col]
    return self.loc[self.loc[:, search_col] == row_val, target_col]

# filters the rows that have na values at column 'col_name'
def filtna(self, col_name):
    return self[~self.loc[:,col_name].isnull()]

# for a specific column with 'col_name' returns the number of matched items between two dataframes + reports it
def fmatch(self, other_df, col_name):
    matched = self.loc[:, col_name].isin(other_df.loc[:,col_name])
    print('The two dataframes have', matched.sum(), 'common elements for column', col_name)
    return matched.sum()

pd.DataFrame.fval = fval
pd.DataFrame.filtna = filtna
pd.DataFrame.fmatch = fmatch

In [3]:
# Change working directory
os.chdir('/sc/arion/projects/va-biobank/PROJECTS/ma_cdr_psychAD/')
os.chdir('/sc/arion/projects/va-biobank/PROJECTS/ma_cdr_psychAD/')

In [4]:
# Find all paths for dataframes that contain 'all_compound' in their name
#path = Path('/sc/arion/projects/va-biobank/PROJECTS/marios_temp/parallel_antagonism/working.directories/psychAD_V3/results/GTP_CDR/wilcoxRank/results')
path = Path('/sc/arion/projects/va-biobank/PROJECTS/marios_temp/parallel_antagonism/working.directories/psychAD_V4/results/GTP_CDR/wilcoxRank/results')

file_paths = list(path.rglob('*all_compound*'))

In [5]:
## MERGE CDR RESULTS IN A SINGLE DATAFRAME

# Load columns 'pert_iname' and 'Compound.pseudo.zscore' of the first dataframe (we will use this as template)
df_main = pd.read_csv(file_paths[0], usecols=['pert_iname', 'Compound.pseudo.zscore'])

# Rename the column to retain individual ID info
df_main = df_main.rename(columns={'Compound.pseudo.zscore':file_paths[0].parent.parent.name})

# Iterate over list of paths
for f in file_paths[1:]:
    # Load the two columns of the new df
    df_next = pd.read_csv(f, usecols=['pert_iname', 'Compound.pseudo.zscore'])
    # Rename the pseudo.zscore column to retain the ind ID
    df_next = df_next.rename(columns={'Compound.pseudo.zscore':f.parent.parent.name})
    # left_join
    df_main = df_main.merge(df_next, on = 'pert_iname', how = 'left')

# pert_iname will be the temporary index column
df_main = df_main.rename(columns={'pert_iname':'index'})

#  .set_index() converts the rows of a column to "rownames"
df_main = df_main.set_index('index').T



In [15]:
df_main = df_backup

In [16]:
# Identify individuals with NA values 
rows_with_na = df_main.isna().any(axis=1)
individuals_with_na = df_main[rows_with_na].index
[print(f'Ind. {thisind} has {df_main.loc[thisind,:].isna().sum()} NAs in total') for thisind in individuals_with_na]

print(f'In total {df_main.shape[0] - df_main.dropna(axis=0).shape[0]} individuals have NA values and will be removed.')

Ind. 121638 has 18 NAs in total
Ind. 7_200876190005_R03C01 has 6 NAs in total
Ind. 35987 has 39 NAs in total
Ind. 175364 has 1 NAs in total
Ind. 137045 has 2 NAs in total
Ind. 34454 has 19 NAs in total
Ind. 34724 has 4 NAs in total
Ind. 19468 has 26 NAs in total
Ind. 35515 has 7 NAs in total
Ind. 173400 has 1 NAs in total
Ind. 115838 has 11 NAs in total
Ind. 207 has 13 NAs in total
Ind. 4_202429290026_R04C01 has 14 NAs in total
In total 13 individuals have NA values and will be removed.


In [17]:
# Remove NA values
df_main = df_main.dropna(axis=0)

# Reset index and rename columns
df_main = df_main.reset_index()
df_main = df_main.rename(columns={'index':'ind_ID'})

In [18]:
## PREPARE psychAD DATA
df = pd.read_csv('./Resources/psychAD/clinical_metadata.csv')
df = df[(df['Ethnicity'] == 'EUR')]
df = df.rename(columns={'SubID':'ind_ID_clin'})
df_psychAD = df
df_psychAD.iloc[:5,:5]


Unnamed: 0,ind_ID_clin,Brain_bank,Age,Sex,Sex_chr_aneuploidy
0,M3284,MSSM,93.0,Female,
1,M12846,MSSM,83.0,Female,
2,M16747,MSSM,74.0,Male,
3,M28611,MSSM,79.0,Male,
4,M28710,MSSM,82.0,Female,


In [22]:
# Map the patient IDs of CDR results to the psychAD clinical metadata

cdr_results = df_main #for clarity

# Load the ID mappings dataframe
mypath = '/sc/arion/projects/roussp01a/deepika/merging_psychAD_SNParray_WGS/common_variants_psychAD/ancestry_pca_psychAD_1429_samples/psychAD_20PC_3_methods_ancestry.tsv'

# idmap contains mappings of IDs
idmap = pd.read_csv(mypath, sep='\t')

print(cdr_results.iloc[:5,:3]) # ind_ID has patient ID in 

print(idmap.iloc[:5,:7]) # column IID matches cdr results, column SubID matches psychAD

#cdr_results.loc[:,'ind_ID']


index                         ind_ID  BRD-A56332531  stemregenin-1
0      G-MSBB-MB000257-BR-MSBB-71912       1.840652       1.796896
1                8_3999338075_R04C01       1.268373       1.293705
2                             112738       1.124536       1.335197
3                             214639       0.892648       1.099458
4                             140650       0.944899       0.997724
          IID  mod_IIDs   SubID       PC1       PC2       PC3       PC4
0  0_MSSM_109  MSSM_109  M87879 -0.014733  0.037377  0.007169 -0.000278
1  0_MSSM_112  MSSM_112  M79245 -0.016455 -0.025708 -0.016513 -0.011320
2  0_MSSM_115  MSSM_115  M99132 -0.014894  0.039530  0.007221  0.001165
3   0_MSSM_12   MSSM_12  M36086 -0.014633  0.040327  0.006924 -0.000166
4  0_MSSM_129  MSSM_129  M12876 -0.000705 -0.013395  0.013946  0.026244


In [23]:
cdr_id = {}
for key in cdr_results.loc[:, 'ind_ID']:
    # extract subid from idmap (contains the mappings across various ID formats)
    subid = idmap.loc[idmap.iloc[:,0] == key,'SubID'].values[0]
    # create new column in cdr_results that will match the subid
    cdr_results.loc[cdr_results.loc[:,'ind_ID']==key, 'ind_ID_clin'] = subid

# df.insert(index, 'colname', series.obj) ; adds in the indexed position, a column with 'colname' that is populated with 'series.obj'
# df.pop('colname') return a series obj from column with name 'colname'
cdr_results.insert(1, 'ind_ID_clin', cdr_results.pop('ind_ID_clin'))
cdr_results.iloc[:5,:5]

index,ind_ID,ind_ID_clin,BRD-A56332531,stemregenin-1,moxaverine
0,G-MSBB-MB000257-BR-MSBB-71912,M97594,1.840652,1.796896,1.720447
1,8_3999338075_R04C01,H2573,1.268373,1.293705,1.20297
2,112738,M54382,1.124536,1.335197,1.803811
3,214639,M11716,0.892648,1.099458,1.607863
4,140650,M64012,0.944899,0.997724,1.255344


In [56]:
cdr_results.shape[0]

981

In [24]:
## Merge dataframes
df_merged = cdr_results.merge(df_psychAD, on = 'ind_ID_clin', how = 'left')
print('Final df_merged.shape[0] is', df_merged.shape[0])

## Ensure successuful merging
print(cdr_results.shape, df_psychAD.shape, df_merged.shape)

print('Null values for df_psychAD.loc[:, "Dementia"] :' , df_psychAD.loc[:, 'Dementia'].isnull().sum())
print('Null values for df_merged.loc[:, "Dementia"] :' , df_merged.loc[:,'Dementia'].isnull().sum())
print('Null values for df_merged.loc[:, "roxithromycin"] :' , df_main.loc[:,'roxithromycin'].isnull().sum())

# returns for a specific column with 'col_name', the number of matched items between two dataframes + reports it
#df_merged.fmatch(df_psychAD, 'ind_ID_clin')

# filters out rows which have na values at column dementia
#df_merged = df_merged.filtna('Dementia')


print('After filtering Null values for df_psychAD.loc[:, "Dementia"] :' , df_psychAD.loc[:, 'Dementia'].isnull().sum())
print('After filtering Null values for df_merged.loc[:, "Dementia"] :' , df_merged.loc[:,'Dementia'].isnull().sum())
print('After filtering Null values for df_merged.loc[:, "roxithromycin"] :' , df_main.loc[:,'roxithromycin'].isnull().sum())

print('Final df_merged.shape[0] is', df_merged.shape[0])


Final df_merged.shape[0] is 981
(981, 4129) (1193, 384) (981, 4512)
Null values for df_psychAD.loc[:, "Dementia"] : 211
Null values for df_merged.loc[:, "Dementia"] : 183
Null values for df_merged.loc[:, "roxithromycin"] : 0
After filtering Null values for df_psychAD.loc[:, "Dementia"] : 211
After filtering Null values for df_merged.loc[:, "Dementia"] : 183
After filtering Null values for df_merged.loc[:, "roxithromycin"] : 0
Final df_merged.shape[0] is 981


In [58]:
path_to_export = '/sc/arion/projects/va-biobank/PROJECTS/ma_cdr_psychAD/Resources/result_analysis'
file_name = 'cdr_psychAD_IC_microglia_v4.csv'
df_merged.to_csv(os.path.join(path_to_export, file_name), index=False)

In [38]:
df_psychAD[df_psychAD.loc[:,'ind_ID'].isin(df_main.loc[:,'ind_ID'])]

Unnamed: 0,ind_ID,Brain_bank,Age,Sex,Sex_chr_aneuploidy,Ethnicity,Dx,pH,PMI,Source_Location,...,LewyDorsalVValue,CDR_Memory,CDR_Orientation,CDR_Judgement,CDR_Community,CDR_HomeHobbies,CDR_PersonalCare,CDR_SumBoxes,Cognitive_Resilience,Cognitive_and_Tau_Resilience


In [37]:
import pandas as pd

# Example Series
series1 = pd.Series(['apple', 'banana', 'cherry', 'date'])
series2 = pd.Series(['banana', 'cherry', 'fig', 'grape'])

# Check for overlapping strings
overlap = series1[series1.isin(series2)]

print("Overlapping strings:")
print(overlap)


Overlapping strings:
1    banana
2    cherry
dtype: object


In [4]:

from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)
clf.predict(X[:2, :])

clf.predict_proba(X[:2, :])
clf.score(X, y)a

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9733333333333334

In [33]:
## Inspect variables
df = pd.read_csv('./Resources/psychAD/clinical_metadata_full.csv')
df.shape

  df = pd.read_csv('./Resources/psychAD/clinical_metadata_full.csv')


(1494, 889)