In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import mygene

from scipy import stats

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler

from boruta import BorutaPy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.feature_selection import VarianceThreshold

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the libraries imported above — consider scoping it with
# warnings.catch_warnings() around the specific noisy call instead.
warnings.filterwarnings('ignore')

# Render floats with 8 digits of precision in pandas output.
pd.set_option("display.precision", 8)

In [22]:
# Versioned Ensembl gene IDs ("<stable id>.<version>") to be mapped to symbols.
genes = [
    "ENSG00000235734.4",
    "ENSG00000229237.2",
    "ENSG00000224126.2",
    "ENSG00000234743.1",
    "ENSG00000262533.1",
    "ENSG00000123500.9",
    "ENSG00000279473.1",
    "ENSG00000241431.1",
    "ENSG00000230838.1",
    "ENSG00000219928.2",
    "ENSG00000244538.1",
    "ENSG00000230734.1",
    "ENSG00000204429.4",
]

In [53]:
# One-column frame holding the gene IDs under the "ensembls" column.
df = pd.DataFrame({"ensembls": genes})
df

Unnamed: 0,ensembls
0,ENSG00000235734.4
1,ENSG00000229237.2
2,ENSG00000224126.2
3,ENSG00000234743.1
4,ENSG00000262533.1
5,ENSG00000123500.9
6,ENSG00000279473.1
7,ENSG00000241431.1
8,ENSG00000230838.1
9,ENSG00000219928.2


In [54]:
# Drop the ".<digits>" version suffix from each Ensembl ID so that only the
# stable ID remains (e.g. "ENSG00000235734.4" -> "ENSG00000235734").
# The pattern is a raw string: "\." and "\d" are not valid Python string
# escapes, and a non-raw literal triggers an invalid-escape warning on newer
# interpreters. The vectorized .str.replace also lets missing values pass
# through as NaN instead of raising TypeError inside re.sub.
df["ensembls"] = df["ensembls"].str.replace(r"\.\d+", "", regex=True)

Unnamed: 0,ensembls
0,ENSG00000235734
1,ENSG00000229237
2,ENSG00000224126
3,ENSG00000234743
4,ENSG00000262533
5,ENSG00000123500
6,ENSG00000279473
7,ENSG00000241431
8,ENSG00000230838
9,ENSG00000219928


In [59]:
# Query the MyGene client for the gene symbol of every ID in df["ensembls"];
# the result comes back as a DataFrame.
mg = mygene.MyGeneInfo()
df_mg = mg.getgenes(
    df["ensembls"],
    fields="symbol",  # alternative: fields="symbol,name,summary"
    as_dataframe=True,
)

querying 1-13...done.


In [61]:
# Display the lookup result with its index moved into a regular column.
# The result is not assigned, so df_mg itself keeps its original index.
df_mg.reset_index()

Unnamed: 0,query,_id,_score,notfound,symbol
0,ENSG00000235734,ENSG00000235734,6.740937,,HMGN1P36
1,ENSG00000229237,ENSG00000229237,6.74915,,HMGN1P37
2,ENSG00000224126,ENSG00000224126,6.756941,,UBE2SP2
3,ENSG00000234743,ENSG00000234743,6.746852,,EIF5AP4
4,ENSG00000262533,ENSG00000262533,20.981884,,AC090617.3
5,ENSG00000123500,1300,19.670546,,COL10A1
6,ENSG00000279473,,,True,
7,ENSG00000241431,ENSG00000241431,6.768179,,RPL37P6
8,ENSG00000230838,ENSG00000230838,20.818214,,LINC01614
9,ENSG00000219928,ENSG00000219928,20.915241,,AL161787.1


In [28]:
# Load the feature-selection results table from the working directory.
# No index_col is given, so the file's unnamed first column (the gene IDs in
# the preview below) is kept as a regular column named "Unnamed: 0".
df1 = pd.read_csv("Results_['Breast']_['Primary Tumor', 'Normal Tissue'].csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,Total Count,Intogen,Gradient Boost Classifier,Recursive Feature Elimination,Elastic Net,Boruta Tree,Importances: Gradient Boost Classifier,Importances: Recursive Feature Elimination,Importances: Elastic Net,Importances: Boruta Tree,Importances: Intogen
0,ENSG00000240036.4,4,0,1,1,1,1,0.0,0.0,0.0,0.0,0.0
1,ENSG00000259357.2,4,0,1,1,1,1,0.0,0.0,0.0,0.0,0.0
2,ENSG00000268938.2,3,0,0,1,1,1,0.0,0.0,0.0,0.0,0.0
3,ENSG00000254398.1,3,0,0,1,1,1,0.0,0.0,0.0,0.0,0.0
4,ENSG00000279473.1,3,0,0,1,1,1,0.0,0.0,0.0,0.0,0.0


In [29]:
# Check which distinct values the "Intogen" column takes.
df1["Intogen"].unique()

array([0, 1])

In [30]:
# Column-wise totals over the rows whose "Intogen" flag equals 1.
# Every column is summed, including the string gene-ID column, whose values
# end up concatenated into one long string.
# NOTE(review): sum(numeric_only=True) would drop that entry, but it would also
# shift any later positional indexing into the result.
df2 = df1.loc[df1["Intogen"].eq(1)].sum()

In [31]:
# Wrap the totals in a single-column frame labelled "results" and display it.
df2 = pd.DataFrame(df2).set_axis(["results"], axis=1)
df2

Unnamed: 0,results
Unnamed: 0,ENSG00000163513ENSG00000133703ENSG00000100284E...
Total Count,129
Intogen,129
Gradient Boost Classifier,0
Recursive Feature Elimination,0
Elastic Net,0
Boruta Tree,0
Importances: Gradient Boost Classifier,0
Importances: Recursive Feature Elimination,0
Importances: Elastic Net,0


In [32]:
# Show the totals for the four feature-selection methods.
# Rows are picked by label instead of by position (previously iloc[3:7]) so
# the selection does not silently move to other rows if the column order of
# the source table changes.
df2.loc[
    [
        "Gradient Boost Classifier",
        "Recursive Feature Elimination",
        "Elastic Net",
        "Boruta Tree",
    ]
]

Unnamed: 0,results
Gradient Boost Classifier,0
Recursive Feature Elimination,0
Elastic Net,0
Boruta Tree,0


In [39]:
# Print the totals with floats formatted to five decimal places. The format
# override applies only inside this `with` block; the global display options
# are restored afterwards.
with pd.option_context('display.float_format', '{:0.5f}'.format):
    print(df2)

                                                                                      results
Unnamed: 0                                  ENSG00000163513ENSG00000133703ENSG00000100284E...
Total Count                                                                               129
Intogen                                                                                   129
Gradient Boost Classifier                                                                   0
Recursive Feature Elimination                                                               0
Elastic Net                                                                                 0
Boruta Tree                                                                                 0
Importances: Gradient Boost Classifier                                                0.00000
Importances: Recursive Feature Elimination                                            0.00000
Importances: Elastic Net                                    