In [None]:
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Label sheet: the join below uses its 'File ID' and 'Case_ID_Final' columns.
label_df = pd.read_csv("../../data/final_labels.csv")

# Attach the case identifier to each CNV feature row by matching the row's
# `file_name` against the label sheet's 'File ID'. The left join keeps every
# feature row; the two file-name columns are dropped once the case id is attached.
features_cnv_df = (
    pd.read_csv("../../data/cnv_df_128_tw.csv")
    .merge(
        label_df[['Case_ID_Final', 'File ID']],
        left_on='file_name',
        right_on='File ID',
        how='left',
    )
    .drop(columns=['File ID', 'file_name'])
)

# Same treatment for the gene feature table.
features_gene_df = (
    pd.read_csv("../../data/gene_df_128_tw.csv")
    .merge(
        label_df[['Case_ID_Final', 'File ID']],
        left_on='file_name',
        right_on='File ID',
        how='left',
    )
    .drop(columns=['File ID', 'file_name'])
)


In [None]:
# Preview the first five rows of the gene feature table after the label join.
features_gene_df.head(n=5)


In [None]:
# Preview the first five rows of the CNV feature table after the label join.
features_cnv_df.head(n=5)

In [None]:
# Combine CNV and gene features per case (inner join, pandas' default `how`).
merged_features = features_cnv_df.merge(features_gene_df, on='Case_ID_Final')

# Move the case identifier to the first column under the name `case_id`,
# so that every remaining column is a feature.
cases = merged_features['Case_ID_Final']
features_df = merged_features.drop(columns='Case_ID_Final')
features_df.insert(loc=0, column='case_id', value=cases)
features_df

In [None]:
# One row per (case, LABEL, final_label, project) combination, with the number
# of distinct files in 'File ID'. The case id is kept as a column and also used
# as the index, so rows can be looked up by case with `.loc`.
label_df_2 = (
    label_df
    .groupby(['Case_ID_Final', 'LABEL', 'final_label', 'Project ID'])
    .agg({'File ID': 'nunique'})
    .reset_index()
    .set_index('Case_ID_Final', drop=False)
)
label_df_2

In [None]:
# Look up the label rows once, in the same order as the rows of `features_df`,
# then pull out the individual label columns from that selection.
case_rows = label_df_2.loc[features_df['case_id']]
labels = case_rows['LABEL']
labels_more = case_rows['final_label']
projects = case_rows['Project ID']

Using t-SNE

In [None]:
# Project the combined feature table to 2-D with t-SNE and colour by LABEL.
X = features_df.iloc[:, 1:]  # every column except the leading `case_id`
Y = labels
tsne = TSNE(n_components=2, verbose=1, random_state=13)  # fixed seed for a repeatable embedding
z = tsne.fit_transform(X)

# Build the plotting frame. The first assignment gives `df` the index of `Y`;
# the embedding columns are plain arrays and are attached positionally.
df = pd.DataFrame()
df["y"] = Y
df["y_more"] = labels_more
df["projects"] = projects
df["comp-1"] = z[:, 0]
df["comp-2"] = z[:, 1]

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x="comp-1",
    y="comp-2",
    hue="y",
    # One colour per distinct label instead of a hard-coded count of 2, so the
    # palette length always matches the number of hue levels.
    palette=sns.color_palette("hls", df["y"].nunique()),
    data=df,
    ax=ax,
)
# The previous title described this as "Iris data"; the plotted data is the
# merged CNV + gene feature table.
ax.set(title="t-SNE projection of combined CNV and gene features")
plt.show()

Using PCA


In [None]:
# Project the combined feature table to 2-D with PCA and colour by LABEL.
X = features_df.iloc[:, 1:]  # every column except the leading `case_id`
Y = labels
# random_state only matters when PCA uses a randomized/arpack solver; setting it
# keeps the projection repeatable in that case.
pca = PCA(n_components=2, random_state=13)
z = pca.fit_transform(X)

# The first assignment gives `df` the index of `Y`; the component columns are
# plain arrays and are attached positionally.
df = pd.DataFrame()
df["y"] = Y
df["comp-1"] = z[:, 0]
df["comp-2"] = z[:, 1]

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x="comp-1",
    y="comp-2",
    hue="y",
    # One colour per distinct label instead of a hard-coded count of 2, so the
    # palette length always matches the number of hue levels.
    palette=sns.color_palette("hls", df["y"].nunique()),
    data=df,
    ax=ax,
)
# The previous title described this as "Iris data"; the plotted data is the
# merged CNV + gene feature table.
ax.set(title="PCA projection of combined CNV and gene features")
plt.show()

10-fold cross-validation

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from mlmodels import cross_validate_custom

# Define the inputs here rather than relying on whatever a plotting cell last
# left in X / Y.
X = features_df.iloc[:, 1:]  # every column except the leading `case_id`
Y = labels

# Seed every stochastic estimator so repeated runs give the same scores.
# SVC uses random_state for its probability estimates (probability=True);
# RandomForestClassifier uses it when building its trees.
# NOTE(review): how cross_validate_custom splits the data is not visible here —
# confirm its folds are deterministic as well.
VC = VotingClassifier(
    estimators=[
        ('SVM', SVC(kernel='rbf', probability=True, random_state=13)),
        ('RF', RandomForestClassifier(n_estimators=100, random_state=13)),
    ],
    voting='soft',
)

# Keep each display name next to its estimator so the result rows cannot
# drift out of sync with the list being evaluated.
named_estimators = {
    'SVM': SVC(kernel='rbf', probability=True, random_state=13),
    'RF': RandomForestClassifier(n_estimators=100, random_state=13),
    'Voting': VC,
}
estimators = list(named_estimators.values())

# One row of results per estimator, labelled by the estimator's name.
result = pd.DataFrame(
    [cross_validate_custom(X, Y, estimator) for estimator in estimators],
    index=list(named_estimators),
)
result