In [2]:
import pandas as pd
import numpy as np

### Length features learning
First, we'll load our dataframe and convert it into the rigth format

In [3]:
labels = pd.read_csv("Results/Machine_Learning/sample_map_ML.csv")
features  = pd.read_csv("Results/Machine_Learning/fragment_features_CNA_ML.csv")
# convert to long
features = pd.pivot(data = features, index = "sample", columns = "length", values = "count")

In [4]:
cancer_list = ["A", "B", "C", "Lung", "Breast"]

def binary(df):
    df["cancer"] = np.where(df["diagnosis"].isin(cancer_list), 1, 0)
    
    return df

# apply function and reshape df
labels = binary(labels)
labels.drop(columns = ["diagnosis"], inplace = True)
labels.set_index("sample", inplace = True)

In [5]:
print("0 in features: \n", features.isnull().sum().sum())
print("NANs in features: \n", features.isna().sum().sum())
print("0 in labels: \n", labels.isnull().sum().sum())
print("NANs in labels: \n", labels.isna().sum().sum())

0 in features: 
 0
NANs in features: 
 0
0 in labels: 
 0
NANs in labels: 
 0


In [6]:
print("Are the sample arrays equal?", np.array_equal(np.array(features.index), np.array(labels.index)))

Are the sample arrays equal? True


In [7]:
# convert the pd dataframes into numpy vectors
print(features.shape)
print(labels.shape)
features = features.to_numpy(copy=True) 
labels = labels.to_numpy(copy=True)
print(features.shape)
print(labels.shape)


(481, 120)
(481, 1)
(481, 120)
(481, 1)


### train test split

In [8]:
# splitting into test and train
from sklearn.model_selection import train_test_split

X, y = features, labels.ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33, shuffle = True)

In [9]:
# normalization of testst
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(X_train) # compute mean, std on train
X_test = transformer.transform(X_test)  # use same transformer for both, mimics that we do not know train data
X_train = transformer.transform(X_train)

In [10]:
X_test.sum()

-1616.8714329383492

### KNN model

In [86]:
# KNN Classifiers
from sklearn.neighbors import KNeighborsClassifier
# define parameters
n_neighbors = 3
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [87]:
y_predicted = knn.predict(X_test)
knn_result = np.equal(y_predicted, y_test)
accuracy = (knn_result.sum() / len(knn_result)) * 100
print("Accuracy: " + str(round(accuracy,2)) + " %")

Accuracy: 80.41 %


In [88]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
# how good we are to predicting classes, important to check both classes accuracy
# support: no of samples

              precision    recall  f1-score   support

           0       0.78      0.83      0.80        46
           1       0.83      0.78      0.81        51

    accuracy                           0.80        97
   macro avg       0.80      0.81      0.80        97
weighted avg       0.81      0.80      0.80        97



### Logitsic regression model

In [11]:
from sklearn.linear_model import LogisticRegression

logReg = LogisticRegression(penalty = "l1", solver = "liblinear", C = 2.) # bigger C lower pen -> 1 / C
logReg.fit(X_train, y_train)

LogisticRegression(C=2.0, penalty='l1', solver='liblinear')

In [12]:
y_predicted = logReg.predict(X_test)
logReg_result = np.equal(y_predicted, y_test)
accuracy = (logReg_result.sum() / len(logReg_result)) * 100
print("Accuracy: " + str(round(accuracy,2)) + " %")

Accuracy: 89.69 %


In [13]:
from sklearn.metrics import classification_report, f1_score
print(classification_report(y_test, y_predicted))
print(f1_score(y_test, y_predicted, average = "macro"))
# how good we are to predicting classes, important to check both classes accuracy
# support: no of samples

              precision    recall  f1-score   support

           0       0.85      0.96      0.90        46
           1       0.96      0.84      0.90        51

    accuracy                           0.90        97
   macro avg       0.90      0.90      0.90        97
weighted avg       0.90      0.90      0.90        97

0.8968962585034014


In [21]:
len(logReg.coef_[0].nonzero()[0])

12

### random Forrest

In [92]:
from sklearn.ensemble import RandomForestClassifier

ranFor = RandomForestClassifier(n_estimators = int(X_train.shape[1]), max_depth = 6) # hyper: choose 1
ranFor.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, n_estimators=120)

In [93]:
y_predicted = ranFor.predict(X_test)
ranFor_result = np.equal(y_predicted, y_test)
accuracy = (ranFor_result.sum() / len(ranFor_result)) * 100
print("Accuracy: " + str(round(accuracy,2)) + " %")

Accuracy: 82.47 %


In [94]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
# how good we are to predicting classes, important to check both classes accuracy
# support: no of samples

              precision    recall  f1-score   support

           0       0.75      0.93      0.83        46
           1       0.93      0.73      0.81        51

    accuracy                           0.82        97
   macro avg       0.84      0.83      0.82        97
weighted avg       0.84      0.82      0.82        97



# ichorCNA

In [95]:
labels = pd.read_csv("Results/Machine_Learning/sample_map_ML.csv")
features  = pd.read_csv("Results/Machine_Learning/tumour_fractions_ML.csv")

# convert gender 
def binary(df, samples = True):
    
    cancer_list = ["A", "B", "C", "Lung", "Breast"]
    if samples == True:
        df["cancer"] = np.where(df["diagnosis"].isin(cancer_list), 1, 0)
        df.drop(columns = ["diagnosis"], inplace = True)
        df.set_index("sample", inplace = True)
        df.sort_index(axis = 0, inplace = True)
    else:
        df["gender_binary"] = np.where(df["gender"].isin(["male"]), 1, 0)
        df.drop(columns = ["gender"], inplace = True)
        df.set_index("sample", inplace = True)
        df.sort_index(axis = 0, inplace = True)
    
    return df

features = binary(features, samples = False)
labels = binary(labels, samples = True)
features.drop(axis = 0, labels = "EE87952", inplace = True)
labels.drop(axis = 0, labels = "EE88290", inplace = True)

In [96]:
np.array_equal(labels.index, features.index)

True

In [99]:
def splitter(labels, features):
    # convert to numpy
    features = features.to_numpy(copy=True) 
    labels = labels.to_numpy(copy=True)
    # split into train test
    X, y = features, labels.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33, shuffle = True)
    # normalize
    transformer = StandardScaler().fit(X_train) # compute mean, std on train
    X_test = transformer.transform(X_test)  # use same transformer for both, mimics that we do not know train data
    X_train = transformer.transform(X_train)
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = splitter(labels, features)

In [101]:
logReg = LogisticRegression(penalty = "l1", solver = "liblinear", C = 1.) # bigger C lower pen -> 1 / C
logReg.fit(X_train, y_train)
y_predicted = logReg.predict(X_test)
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.62      0.98      0.76        46
           1       0.96      0.44      0.60        50

    accuracy                           0.70        96
   macro avg       0.79      0.71      0.68        96
weighted avg       0.79      0.70      0.68        96



In [118]:
logReg.coef_ # proposes dropping gender

array([[ 1.90666497,  0.38157269, -0.04271875]])

### LIQUORICE


In [4]:
labels = pd.read_csv("Results/Machine_Learning/sample_map_ML.csv")
features  = pd.read_csv("Results/Machine_Learning/cell_type_signatures_LIQU_ML.csv")
features = features.pivot(index = "sample", columns = "region_set", values = ["Dip_area", "Dip_depth"]) # first 13 dip_area

In [5]:
features = features.merge(labels, on = "samples", how = "inner")

  features = features.merge(labels, on = "samples", how = "inner")


KeyError: 'samples'

In [6]:
features


Unnamed: 0_level_0,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,Dip_area,...,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth,Dip_depth
region_set,A549_hg38,HPF_lung_fibroblast_cluster1737_hg38,HeLa_cluster1777_hg38,SAEC_hg38,colon_cluster507_hg38,hematopoietic_specific_liquorice_hg38,hepatocyte_all_hg38,mammary_epithel_cluster2438_hg38,melano_SOX_clusters1863_2205_hg38,panc_adenoca_cluster1261_hg38,...,SAEC_hg38,colon_cluster507_hg38,hematopoietic_specific_liquorice_hg38,hepatocyte_all_hg38,mammary_epithel_cluster2438_hg38,melano_SOX_clusters1863_2205_hg38,panc_adenoca_cluster1261_hg38,panc_epithel_cluster1974_hg38,prostate_cluster2483_hg38,skeletal_muscle_cluster1518_hg38
sample,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
EE87786,0.76,-0.10,-0.22,-0.78,0.76,-0.50,-0.55,0.89,-0.77,-0.61,...,2.21,1.84,0.38,0.78,0.82,2.29,1.03,0.72,0.47,-0.71
EE87787,-2.16,1.22,-2.49,-3.51,0.84,1.25,1.52,-1.04,1.00,-2.03,...,3.28,0.36,0.01,0.54,0.58,0.96,0.27,-0.88,-1.49,1.81
EE87788,-0.52,-1.56,-0.13,-0.22,0.47,0.94,0.09,-0.45,0.22,0.31,...,1.14,-2.10,0.09,-0.71,-0.08,-0.41,0.13,-1.29,0.14,-1.44
EE87789,-2.78,-1.38,0.19,0.14,1.24,2.05,-2.01,-0.98,0.22,0.67,...,0.59,-1.55,-1.89,1.93,-0.49,-0.72,0.05,0.00,-0.60,-0.51
EE87790,-6.92,-0.92,-2.07,-2.13,-0.05,3.22,-5.01,-0.43,-0.33,1.09,...,0.26,1.56,-1.70,5.06,-0.87,-0.90,-0.96,2.25,2.37,-0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
EE88321,-5.41,-0.51,-2.30,-1.66,-2.87,3.07,-4.02,0.56,-1.48,-1.74,...,1.96,1.81,-2.49,3.68,-1.34,0.54,0.29,2.79,-0.81,-0.35
EE88322,-0.75,-0.43,-1.14,-1.56,-1.06,0.49,-0.49,-0.61,-1.41,-0.18,...,1.22,1.35,-0.44,0.86,-0.97,1.43,-0.64,-0.84,0.04,-0.19
EE88323,-3.63,0.02,-1.29,1.37,-0.69,1.11,-1.74,1.32,0.43,0.05,...,-1.28,-0.47,-0.56,2.38,-2.16,-2.12,-0.06,0.44,1.35,-1.72
EE88324,-0.32,1.91,-0.15,-1.45,-0.44,-0.41,0.30,1.20,1.02,-0.44,...,0.93,-0.78,1.00,0.33,-1.04,-0.65,0.79,-2.03,1.10,-1.51


In [None]:
# drop row that is to much and index lalala