In [1]:
import pyreadr
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Define paths:

In [23]:
# this is the dataframe resulting from the analysis of individual classification trends:
path_classif_df = 'C:/Users/SG/Sex-bias-cell-type-classification-scRNAseq/3. HLCA core - analysis of classification results/Analysis_results/df_classification_trend.pickle'

# this is the dictionary of cell type info resulting from the exploration + DA analysis
path_cell_type_info = 'C:/Users/SG/Sex-bias-cell-type-classification-scRNAseq/1. HLCA core - exploration/DA_results/cell_type_info_with_DA.pickle'

#### Import data

Import the DS results:

In [19]:
result = pyreadr.read_r("final_DS_results.rds")
df_DS = result[None]
df_DS

Unnamed: 0_level_0,#DS,%DS
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1
Basal resting,142.0,0.71
Multiciliated (non-nasal),42.0,0.208
Alveolar macrophages,33.0,0.164
EC arterial,29.0,0.154
Suprabasal,19.0,0.094
...,...,...
Lymphatic EC proliferating,,
Lymphatic EC differentiating,,
Subpleural fibroblasts,,
Mesothelium,,


Import the misclassification results:

In [13]:
with open(path_classif_df, 'rb') as file:
    df_classification_trend = pickle.load(file)
df_classification_trend.sort_values('Classification trend')

Unnamed: 0,Cell type,Slope test,Flip test,Maximum performance difference,Slope on female set,Slope on male set,Classification trend
0,AT0,True,True,0.311030,0.154401,-0.107692,Distinct
44,Non-classical monocytes,True,True,0.181524,0.020268,-0.150805,Distinct
41,Myofibroblasts,True,True,0.223347,0.244085,-0.077075,Distinct
39,Multiciliated (nasal),True,True,0.361397,0.250057,-0.161455,Distinct
38,Monocyte-derived Mph,True,True,0.258688,0.149452,-0.169069,Distinct
...,...,...,...,...,...,...,...
8,Alveolar fibroblasts,False,True,0.151723,0.145536,0.012718,Non-distinct
6,Alveolar Mph MT-positive,False,False,0.420814,0.596874,0.867133,Non-distinct
4,Adventitial fibroblasts,False,True,0.084243,-0.034711,-0.100552,Non-distinct
37,Migratory DCs,False,False,0.145963,-0.058442,-0.023715,Non-distinct


#### Combine results

In [20]:
for i in df_DS.index:
    if i in list(df_classification_trend["Cell type"]):
        df_DS.loc[i, "Classification trend"] = df_classification_trend.loc[df_classification_trend['Cell type'] == i,'Classification trend'].iloc[0]
    else:
        print(f'{i} not in misclassification analysis')

# round %DS for readability
def round_numeric(val, decimals=4):
    try:
        return round(float(val), decimals)
    except (ValueError, TypeError):
        return val
df_DS['%DS'] = df_DS['%DS'].apply(round_numeric)

# replace NaNs
df_DS.replace('NaN', '-', inplace=True)
df_DS.replace(float('nan'), '-', inplace=True)

# reformat table
df_DS.reset_index(inplace = True)
df_DS.rename(columns={'rownames': 'Cell type'}, inplace=True)
df_DS = df_DS[['Cell type', '#DS', '%DS', 'Classification trend']]

# make #DS an int for readability
def toint_numeric(val):
    try:
        return int(val)
    except (ValueError, TypeError):
        return val
df_DS['#DS'] = df_DS['#DS'].apply(toint_numeric)

# show table
df_DS.head(60)

Unnamed: 0,Cell type,#DS,%DS,Classification trend
0,Basal resting,142,0.71,Inconclusive
1,Multiciliated (non-nasal),42,0.208,Inconclusive
2,Alveolar macrophages,33,0.164,Distinct
3,EC arterial,29,0.154,Distinct
4,Suprabasal,19,0.094,Distinct
5,AT2,15,0.0744,Inconclusive
6,Goblet (nasal),12,0.0601,Distinct
7,Club (nasal),11,0.0557,Distinct
8,EC venous pulmonary,10,0.0536,Distinct
9,Goblet (bronchial),9,0.0482,Distinct


#### Rank sum test to check if distinct cell types are more DE

In [22]:
from scipy.stats import mannwhitneyu

df_totest = df_DS[(df_DS["#DS"] != '-') & (df_DS["Classification trend"] != 'Inconclusive')]
df_totest['%DS'] = pd.to_numeric(df_totest['%DS'])

distinct = df_totest.loc[df_totest["Classification trend"] == 'Distinct','%DS']
nondistinct = df_totest.loc[df_totest["Classification trend"] == 'Non-distinct','%DS']

# perform the Mann-Whitney U test
stat, p_value = mannwhitneyu(distinct, nondistinct, alternative='greater')
stat, p_value

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_totest['%DS'] = pd.to_numeric(df_totest['%DS'])


(259.0, 0.007355214506687283)

p-value of 0.00735 --> significant correlation

#### Add data on sex ratio & DA

Import additional information:

In [24]:
# import data on sample counts, donor counts etc
with open(path_cell_type_info, 'rb') as file:
    cell_type_info = pickle.load(file)
cell_type_info

{'Basal resting': {'total_samples': 38955,
  'female_prop': 0.3647028622769863,
  'num_donors': 77,
  'num_female_donors': 33,
  'num_DA_nhoods': 1546,
  'prop_DA_nhoods': 0.58},
 'Suprabasal': {'total_samples': 41158,
  'female_prop': 0.4482482141989407,
  'num_donors': 77,
  'num_female_donors': 32,
  'num_DA_nhoods': 1906,
  'prop_DA_nhoods': 0.67},
 'Hillock-like': {'total_samples': 4600,
  'female_prop': 0.538695652173913,
  'num_donors': 38,
  'num_female_donors': 15,
  'num_DA_nhoods': 202,
  'prop_DA_nhoods': 0.65},
 'Deuterosomal': {'total_samples': 1004,
  'female_prop': 0.4312749003984064,
  'num_donors': 70,
  'num_female_donors': 28,
  'num_DA_nhoods': 1,
  'prop_DA_nhoods': 0.01},
 'Multiciliated (nasal)': {'total_samples': 4869,
  'female_prop': 0.4113781063873485,
  'num_donors': 37,
  'num_female_donors': 16,
  'num_DA_nhoods': 212,
  'prop_DA_nhoods': 0.66},
 'Multiciliated (non-nasal)': {'total_samples': 35225,
  'female_prop': 0.352278211497516,
  'num_donors': 102,

In [25]:
total_samples_dict = {cell_type: info['total_samples'] for cell_type, info in cell_type_info.items()}
df_DS['Sample counts'] = df_DS['Cell type'].map(total_samples_dict)

female_prop_dict = {cell_type: round(info['female_prop'],2) for cell_type, info in cell_type_info.items()}
df_DS['% female samples'] = df_DS['Cell type'].map(female_prop_dict)

num_donors_dict = {cell_type: info['num_donors'] for cell_type, info in cell_type_info.items()}
df_DS['# donors'] = df_DS['Cell type'].map(num_donors_dict)

num_female_donors_dict = {cell_type: info['num_female_donors'] for cell_type, info in cell_type_info.items()}
df_DS['# female donors'] = df_DS['Cell type'].map(num_female_donors_dict)

prop_DA_dict = {cell_type: info['prop_DA_nhoods'] for cell_type, info in cell_type_info.items()}
df_DS['% DA neighborhoods'] = df_DS['Cell type'].map(prop_DA_dict)

df_DS

Unnamed: 0,Cell type,#DS,%DS,Classification trend,Sample counts,% female samples,# donors,# female donors,% DA neighborhoods
0,Basal resting,142,0.71,Inconclusive,38955,0.36,77,33,0.58
1,Multiciliated (non-nasal),42,0.208,Inconclusive,35225,0.35,102,37,0.38
2,Alveolar macrophages,33,0.164,Distinct,68487,0.29,84,27,0.71
3,EC arterial,29,0.154,Distinct,7391,0.21,77,25,0.40
4,Suprabasal,19,0.094,Distinct,41158,0.45,77,32,0.67
...,...,...,...,...,...,...,...,...,...
56,Lymphatic EC proliferating,-,-,Inconclusive,28,0.29,10,4,0.00
57,Lymphatic EC differentiating,-,-,Non-distinct,566,0.06,36,10,0.52
58,Subpleural fibroblasts,-,-,Distinct,276,0.32,17,6,0.68
59,Mesothelium,-,-,Non-distinct,230,0.03,21,4,0.93


In [26]:
round(df_DS.describe(),1)

Unnamed: 0,Sample counts,% female samples,# donors,# female donors,% DA neighborhoods
count,61.0,61.0,61.0,61.0,61.0
mean,9589.2,0.4,56.8,19.9,0.4
std,14802.4,0.2,23.6,8.2,0.2
min,28.0,0.0,9.0,4.0,0.0
25%,716.0,0.3,38.0,15.0,0.3
50%,4001.0,0.4,59.0,20.0,0.5
75%,9133.0,0.5,75.0,26.0,0.6
max,68487.0,0.9,102.0,37.0,0.9


Final table:

In [29]:
df_DS.head(61)

Unnamed: 0,Cell type,#DS,%DS,Classification trend,Sample counts,% female samples,# donors,# female donors,% DA neighborhoods
0,Basal resting,142,0.71,Inconclusive,38955,0.36,77,33,0.58
1,Multiciliated (non-nasal),42,0.208,Inconclusive,35225,0.35,102,37,0.38
2,Alveolar macrophages,33,0.164,Distinct,68487,0.29,84,27,0.71
3,EC arterial,29,0.154,Distinct,7391,0.21,77,25,0.40
4,Suprabasal,19,0.094,Distinct,41158,0.45,77,32,0.67
...,...,...,...,...,...,...,...,...,...
56,Lymphatic EC proliferating,-,-,Inconclusive,28,0.29,10,4,0.00
57,Lymphatic EC differentiating,-,-,Non-distinct,566,0.06,36,10,0.52
58,Subpleural fibroblasts,-,-,Distinct,276,0.32,17,6,0.68
59,Mesothelium,-,-,Non-distinct,230,0.03,21,4,0.93
