In [None]:
import pandas as pd

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV, GroupKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Load the annotated pool-1 events and discard the 'batch' column in one step.
data = pd.read_pickle('../cytofdata/pool1_with_annotated_clusters').drop(columns='batch')

# Sample identifier: patient number and test time joined by an underscore.
# It is used further down as the grouping key for cross-validation and for
# selecting the held-out samples.
patient_ids = data['patient_number'].astype(str)
test_times = data['test_time'].astype(str)
data['sample'] = patient_ids + "_" + test_times
data

In [None]:
# List the distinct values found in the 'cluster' column.
data['cluster'].unique()

In [None]:
# Integer code assigned to every annotated cluster label. The codes are the
# class ids the classifiers below are trained on and predict.
cluster_as_num = {
    'PreNeu1': 18,
    'pDC': 17,
    'PreNeu3': 16,
    'NeutrophilsCD25': 15,
    'PreNeu2': 14,
    'NeutrophilsHLA': 13,
    'LinNeg': 12,
    'Doublets': 11,
    'Bcells': 10,
    'HSPCs': 9,
    'NKcells': 8,
    'Tc': 7,
    'Th': 6,
    'Monocytes': 5,
    'Basophils': 4,
    'Eosinophils': 3,
    'NeutrophilCD16neg': 2,
    'PreNeu': 1,
    'Neutrophils': 0,
}

# Series.map() silently turns every value that is not a key of the mapping
# into NaN. That hides unknown labels and, if this cell is executed a second
# time (the column then already holds integer codes), wipes the whole column.
# Fail loudly instead of corrupting the targets.
unmapped_labels = set(data['cluster'].dropna().unique()) - set(cluster_as_num)
if unmapped_labels:
    raise ValueError(
        "data['cluster'] contains values that are not keys of cluster_as_num "
        f"(has this cell already been run?): {sorted(map(str, unmapped_labels))}"
    )

data['cluster'] = data['cluster'].map(cluster_as_num)

In [None]:
# Inspect the frame; this cell follows the one that recodes the 'cluster' column.
data

In [None]:

# Column names of every measured marker channel (metal isotope + target).
# NOTE(review): "internal"/"external" presumably distinguish intracellular
# from surface markers - confirm with the panel description.
internal_and_external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '142Nd_cCaspase3',
    '143Nd_pCRKL Y207',
    '144Nd_pTyr',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '150Nd_pSTAT5 Y694',
    '151Eu_pSTAT3 S727',
    '152Sm_CD13',
    '153Eu_pSTAT1 Y701',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '156Gd_pp38 T180Y182',
    '157Gd_CD8a',
    '158Gd_pSTAT3 Y705',
    '159Tb_pMAPKAPK T334',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '165Ho_pCREB S133',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '170Er_pSRC Y418',
    '171Yb_pERK T202Y204',
    '172Yb_pS6 S235S236',
    '173Yb_STAT3tot',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '176Yb_pS6 S240244',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Subset of the channels above that is used as the feature set of the
# classifiers in this notebook.
external = [
    '89Y_CD45',
    '111Cd_CD3',
    '112Cd_CD34',
    '113Cd_CD123',
    '114Cd_CD66b',
    '116Cd_HLA-DR',
    '141Pr_CD38',
    '145Nd_CD4',
    '146Nd_CD49d',
    '147Sm_CD20',
    '148Nd_CD16',
    '149Sm_CD25',
    '152Sm_CD13',
    '154Sm_CD45RA',
    '155Gd_CD27',
    '157Gd_CD8a',
    '160Gd_CD14',
    '161Dy_CD26',
    '162Dy_FoxP3',
    '163Dy_CD56',
    '164Dy_CD15',
    '166Er_MPO',
    '167Er_IL1-RAP',
    '168Er_CD117',
    '169Tm_CD33',
    '174Yb_CD11c',
    '175Lu_CXCR4',
    '195Pt_mBC2',
    '209Bi_CD11b',
]

# Every channel that is not in `external`. Built with an order-preserving
# comprehension rather than a set difference: set iteration order of strings
# changes between interpreter sessions, which would make the column order of
# anything indexed with `internal` differ from run to run.
internal = [marker for marker in internal_and_external if marker not in external]

In [None]:
# Model inputs taken from the full annotated frame:
#   X      - the `external` marker columns (features)
#   y      - the integer cluster codes (targets)
#   groups - the sample identifier of every row, used for group-aware CV splits
X = data.loc[:, external]
y = data.loc[:, 'cluster']
groups = data.loc[:, 'sample']

In [None]:
# Nested, group-aware cross-validation of Linear Discriminant Analysis.
# The outer loop (5 folds) estimates generalisation to unseen samples; the
# inner loop (3 folds, run by GridSearchCV) selects solver/shrinkage on the
# outer-training rows only. GroupKFold keeps all rows of one sample in the
# same fold at both levels.
model = LinearDiscriminantAnalysis()

# Shrinkage values are only combined with the 'lsqr' and 'eigen' solvers;
# 'svd' is evaluated without shrinkage.
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto', 0.1, 0.5, 0.9]}
]

outer_cv = GroupKFold(n_splits=5)
inner_cv = GroupKFold(n_splits=3)

# Number of histogram bins needed to cover every cluster code (codes start
# at 0). Both frequency vectors below must use the same length: without
# `minlength`, np.bincount stops at the largest code present in its input, so
# a test fold lacking the highest codes would yield a shorter vector and the
# subtraction would fail with a shape mismatch.
n_classes = int(y.max()) + 1

results = []

for train_idx, test_idx in outer_cv.split(X, y, groups):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_train = groups.iloc[train_idx]

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=inner_cv,
        scoring='accuracy'
    )

    # The groups are forwarded so the inner GroupKFold can split by sample.
    grid_search.fit(X_train, y_train, groups=groups_train)
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    # average=None returns one F1 value per class; the median summarises them.
    f1_scores = f1_score(y_test, predictions, average=None)
    median_f1 = np.median(f1_scores)

    # Compare the true and predicted cluster proportions of the test fold.
    true_frequencies = np.bincount(y_test, minlength=n_classes) / len(y_test)
    predicted_frequencies = np.bincount(predictions, minlength=n_classes) / len(predictions)
    max_diff = np.max(np.abs(true_frequencies - predicted_frequencies))
    rsse = np.sqrt(np.sum((true_frequencies - predicted_frequencies) ** 2))

    results.append({
        'accuracy': accuracy,
        'median_f1': median_f1,
        'max_diff': max_diff,
        'rsse': rsse,
        'est': grid_search.best_score_,   # inner-CV score of the selected configuration
        'cfg': grid_search.best_params_
    })

    print('One round completed')

results_df = pd.DataFrame(results)
print(results_df)


In [None]:


# Group-aware 5-fold cross-validation of a random forest with fixed
# hyper-parameters (no inner tuning loop).
# random_state pins the forest's randomness so the fold metrics are
# reproducible across kernel restarts; the value itself is arbitrary.
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)

outer_cv = GroupKFold(n_splits=5)

# Number of histogram bins needed to cover every cluster code (codes start
# at 0). Both frequency vectors below must use the same length: without
# `minlength`, np.bincount stops at the largest code present in its input, so
# a test fold lacking the highest codes would yield a shorter vector and the
# subtraction would fail with a shape mismatch.
n_classes = int(y.max()) + 1

results_1 = []

for train_idx, test_idx in outer_cv.split(X, y, groups):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model_1.fit(X_train, y_train)
    predictions = model_1.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    # average=None returns one F1 value per class; the median summarises them.
    f1_scores = f1_score(y_test, predictions, average=None)
    median_f1 = np.median(f1_scores)

    # Compare the true and predicted cluster proportions of the test fold.
    true_frequencies = np.bincount(y_test, minlength=n_classes) / len(y_test)
    predicted_frequencies = np.bincount(predictions, minlength=n_classes) / len(predictions)
    max_diff = np.max(np.abs(true_frequencies - predicted_frequencies))
    rsse = np.sqrt(np.sum((true_frequencies - predicted_frequencies) ** 2))

    results_1.append({
        'accuracy': accuracy,
        'median_f1': median_f1,
        'max_diff': max_diff,
        'rsse': rsse
    })

    print('One round completed')

# Print results
results_df_rfc = pd.DataFrame(results_1)
print(results_df_rfc)

In [None]:
# Inspect the annotated frame again before it is split into train and test subsets.
data

In [None]:
# Distinct sample identifiers (patient number + "_" + test time) present in the data.
data['sample'].unique()

In [None]:
# Samples held out for the final evaluation. The list is defined once and
# reused for both subsets so that train and test can never drift apart.
held_out_samples = [
    '01_0', '01_1', '01_2',
    '12_0', '12_1', '12_2',
    '08_0', '08_1', '08_2',
    '15_0', '15_1', '15_2',
    '20_0', '20_1', '20_2',
    '28_0', '28_1', '28_2',
    '35_0', '35_1', '35_2',
    'HDBM_663_None', 'HDPB_597_None', 'HDBM_578_None',
]

# True for every row that belongs to one of the held-out samples.
is_held_out = data['sample'].isin(held_out_samples)

data_test = data[is_held_out]
data_train = data[~is_held_out]



In [None]:
# Inspect the training subset (all rows whose sample is not held out).
data_train


In [None]:
# Number of training rows per cluster code.
data_train['cluster'].value_counts()

In [None]:
# Inspect the held-out test subset.
data_test

In [None]:
# Number of held-out test rows per cluster code.
data_test['cluster'].value_counts()

In [None]:
# Final random forest, trained on every non-held-out sample. This model's
# predictions are later written to disk, so its randomness is pinned with
# random_state to make the saved annotations reproducible (the value itself
# is arbitrary).
model_1 = RandomForestClassifier(n_estimators=100, random_state=42)

# NOTE: X and y overwrite the full-data variables used by the
# cross-validation cells above, while `groups` still refers to the full frame.
# Re-run the cell that defines X, y and groups before re-running those cells.
X = data_train[external]
y = data_train['cluster']

model_1.fit(X, y)

In [None]:
# Features and true cluster codes of the held-out samples, used to evaluate
# the fitted forest.
X_test, y_test = data_test[external], data_test['cluster']

In [None]:
# Score the fitted forest on the held-out samples (for scikit-learn classifiers
# `score` reports mean accuracy).
model_1.score(X_test, y_test)

In [None]:
import plotly.express as px
from sklearn.metrics import confusion_matrix

# Predicted cluster codes for the held-out samples; reused by the
# confusion-matrix cells below.
y_pred = model_1.predict(X_test)

In [None]:
# One row/column per cluster code, in ascending order. Passing the label set
# explicitly keeps the matrix shape in sync with `class_labels` even when a
# cluster is absent from both y_test and y_pred; otherwise confusion_matrix
# only covers the labels it actually sees and the axis labels would no longer
# line up with the matrix.
class_ids = sorted(cluster_as_num.values())
class_labels = [str(i) for i in class_ids]

cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Row-normalise (each row = one true class). Rows without any true samples
# are left at 0 instead of dividing by zero and producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

In [None]:
import plotly.express as px

# Heatmap of the row-normalised confusion matrix with the cell values printed
# on top of each tile.
axis_names = {
    "x": "Predicted Class",
    "y": "Actual Class",
    "color": "Normalized Count",
}

fig = px.imshow(
    cm_normalized,
    x=class_labels,
    y=class_labels,
    labels=axis_names,
    color_continuous_scale='Greens',
    text_auto=True,
)

layout_options = {
    "title": "Normalized Confusion  for the annotated model",
    "xaxis_title": "Predicted Class",
    "yaxis_title": "Actual Class",
}
fig.update_layout(**layout_options)

fig.show()


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from sklearn.metrics import confusion_matrix


# Recompute the confusion matrix with the label set pinned to the codes in
# `class_labels`, so the matrix always has exactly one row/column per axis
# label even if a cluster is absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[int(label) for label in class_labels])

# Row-normalise (each row = one true class). Rows without any true samples
# are left at 0 instead of dividing by zero and producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Two panels side by side: raw counts on the left, row-normalised on the right.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Original Confusion Matrix", "Normalized Confusion Matrix"),
    horizontal_spacing=0.15
)

fig.add_trace(
    go.Heatmap(
        z=cm,
        x=class_labels,
        y=class_labels,
        colorscale='Greens',
        text=cm,
        texttemplate="%{text}",
        showscale=True
    ),
    row=1, col=1
)

fig.add_trace(
    go.Heatmap(
        z=cm_normalized,
        x=class_labels,
        y=class_labels,
        colorscale='Greens',
        text=np.round(cm_normalized, 2),   # two decimals keep the tile text readable
        texttemplate="%{text}",
        showscale=True
    ),
    row=1, col=2
)

# xaxis/yaxis address the first subplot, xaxis2/yaxis2 the second.
fig.update_layout(
    title_text="Confusion for the annotated model",
    xaxis_title="Predicted Class",
    yaxis_title="Actual Class",
    xaxis2_title="Predicted Class",
    yaxis2_title="Actual Class"
)

fig.show()


# Use the trained model to predict the cell types in pool 2

In [None]:
# Load the pool-2 events to be annotated with the trained model.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
data_pool_2 = pd.read_pickle('../cytofdata/uncorrected_batch_5_6.pkl')

In [None]:
# Drop every row whose file_id is 'RT' or 'RJ' before predicting.
excluded_rows = data_pool_2['file_id'].isin(['RT', 'RJ'])
data_pool_2 = data_pool_2.loc[~excluded_rows]

In [None]:
# Predict a cluster code for every pool-2 row from the same `external`
# marker columns the forest was trained on.
X_pool_2 = data_pool_2[external]
y_pool_2 = model_1.predict(X_pool_2)

# data_pool_2 is a filtered slice of the loaded frame, so assigning a column
# to it directly triggers pandas' SettingWithCopyWarning. assign() returns a
# new frame with the 'cluster' column added (or replaced on re-run) instead.
data_pool_2 = data_pool_2.assign(cluster=y_pool_2)

In [None]:
# Number of pool-2 rows assigned to each predicted cluster code.
data_pool_2['cluster'].value_counts()

In [None]:
# Persist the pool-2 frame, including the predicted 'cluster' column.
# NOTE(review): "annotated_mode" in the file name may be a typo for
# "annotated_model" - confirm before renaming, other code may read this path.
data_pool_2.to_pickle('../cytofdata/data_clusters_batch5_6_annotated_mode.pkl')