# Initialization

In [22]:
!pip install -r requirements.txt

Collecting halo
  Downloading halo-0.0.31.tar.gz (11 kB)
Collecting numpy~=1.22.0
  Downloading numpy-1.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[K     |████████████████████████████████| 16.8 MB 476 kB/s eta 0:00:01       | 2.7 MB 984 kB/s eta 0:00:15     |███████████████████████▎        | 12.2 MB 2.1 MB/s eta 0:00:03
Collecting log_symbols>=0.0.14
  Downloading log_symbols-0.0.14-py3-none-any.whl (3.1 kB)
Collecting spinners>=0.0.24
  Downloading spinners-0.0.24-py3-none-any.whl (5.5 kB)
Collecting termcolor>=1.1.0
  Downloading termcolor-1.1.0.tar.gz (3.9 kB)
Building wheels for collected packages: halo, termcolor
  Building wheel for halo (setup.py) ... [?25ldone
[?25h  Created wheel for halo: filename=halo-0.0.31-py3-none-any.whl size=11251 sha256=6db0c2ef5268e558c3672a9d3d7327686f370b09c56ee608d1a33650dee99997
  Stored in directory: /home/wasp97/.cache/pip/wheels/bb/85/47/b7c7338ab52808105f937bd8c04aec5d98a543311ac2c8bed2
  Building wheel for term

In [1]:
from sklearn import preprocessing
from ipywidgets import interact, widgets
from encoder import Encoder
from tqdm.notebook import tqdm
from halo import HaloNotebook as Halo
import os
import threading
import math
from IPython.display import display
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

## Threaded execution
Using Modin and Dask slightly improves performance, but requires more RAM.

In [2]:
# Checkbox whose value decides, in the next cell, whether `pd` is bound to
# modin.pandas (checked) or to plain pandas (unchecked).
threaded = widgets.Checkbox(description='THREADED', value=True)
display(threaded)

Checkbox(value=True, description='THREADED')

In [3]:
# Bind `pd` to the DataFrame implementation selected with the checkbox.
if not threaded.value:
    # Plain pandas, no extra worker processes.
    import pandas as pd

else:
    # Modin exposes the pandas API under the same `pd` alias.
    import modin.pandas as pd
    from dask.distributed import Client

    # NOTE(review): presumably Modin picks this client up as its execution
    # engine — confirm. Re-running this cell creates another client.
    client = Client(n_workers=4, threads_per_worker=2)  # More workers = more RAM needed

## Paths 

In [4]:
# Directory whose files are listed in the dataset dropdown and read with read_pickle.
input_root = "../Datasets/3_features_as_columns"
# Directory the encoded frame is written to, under the same file name as the input.
output_root = "../Datasets/4_encoded"

## Functions

In [5]:
def encode_column(column: pd.Series) -> pd.Series:
    """Fit a fresh ``Encoder`` on ``column`` and return the transformed column.

    A new encoder is created on every call, so nothing learned from one
    column is reused for another.

    Args:
        column: The column to encode.

    Returns:
        Whatever ``Encoder.transform`` produces for ``column``.
    """
    column_encoder = Encoder()
    column_encoder.fit(column)
    return column_encoder.transform(column)

In [6]:
def parse_column(df: pd.DataFrame, col: str, encoded):
    """Make ``df[col]`` numeric, modifying ``df`` in place.

    A column whose dtype kind is already boolean, signed/unsigned integer,
    float or complex (``'biufc'``) is left untouched. Any other column is
    first handed to ``pd.to_numeric``; if that conversion fails, the column
    is replaced by the result of ``encode_column``.

    Args:
        df: Frame holding the column; it is modified in place.
        col: Label of the column to convert.
        encoded: Not used by this function; kept so that existing callers
            continue to work.

    Returns:
        None.
    """
    column = df[col]
    if column.dtype.kind in 'biufc':
        # Already numeric: nothing to do.
        return

    try:
        df[col] = pd.to_numeric(column)
    except (ValueError, TypeError):
        # to_numeric raises ValueError for strings it cannot parse and
        # TypeError for objects it cannot interpret at all (e.g. tuples);
        # both mean the column has to be encoded instead.
        df[col] = encode_column(column)

# Encoding

## File selection

In [7]:
# Dataset picker. os.listdir returns entries in arbitrary order, so the names
# are sorted to keep the option order (and therefore the default selection)
# the same on every machine and every run.
file = widgets.Dropdown(options=sorted(os.listdir(input_root)), description='Dataset')
# When checked, the casting cell transposes the frame before and after encoding.
encode_rows = widgets.Checkbox(value=False, description='Encode Rows')

display(file)
display(encode_rows)

Dropdown(description='Dataset', options=('clinical_data.xz', 'cnv.score.xz', 'met_Mval.xz', 'miRNA_mor.xz', 'm…

Checkbox(value=False, description='Encode Rows')

## Dataset loading

In [8]:
# Full path of the dataset currently selected in the `file` dropdown.
path = os.path.join(input_root, file.value)

# Show a spinner while the pickled frame is being read.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# come from a trusted source.
with Halo(text=f'Reading {path}...', spinner='dots'):
    df = pd.read_pickle(path)
# Bare last expression so the notebook renders the loaded frame.
df

Output()

rownames,cg00000029,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,cg00000658,cg00000714,...,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873
TCGA-3C-AALI-01A-11D-A41Q-05,-2.603869,1.538224,2.873113,0.053155,-1.501188,-1.241061,-0.406432,-5.906057,3.094420,-2.750271,...,-0.141159,4.339062,0.794119,0.394742,0.010601,-3.195735,0.285399,4.656665,2.455302,-3.132444
TCGA-3C-AALJ-01A-31D-A41Q-05,-2.539703,1.226348,1.707185,-0.194401,2.509615,0.635800,1.727862,-5.810255,3.601902,-3.117922,...,3.510130,-5.216188,-3.878982,-0.249538,-0.348968,-3.805477,3.962994,1.326594,1.722707,0.071958
TCGA-3C-AALK-01A-11D-A41Q-05,-2.004853,0.550352,2.599570,0.374171,1.457006,0.456292,1.117586,-5.779576,3.650185,-2.954419,...,0.113601,-0.525432,-0.168674,0.327360,-4.182341,-3.356152,-4.587394,0.161638,2.121838,0.048997
TCGA-4H-AAAK-01A-12D-A41Q-05,-1.954741,-0.323426,2.765020,0.266002,1.968535,-0.179517,-1.915437,-5.734420,2.451984,-3.274049,...,3.638206,-5.413022,-0.158809,0.221935,4.140603,0.039376,3.858098,0.142678,-3.822093,1.813841
TCGA-5L-AAT0-01A-12D-A41Q-05,-1.674771,-2.869376,2.567064,0.175472,-0.515265,0.506464,-1.590725,-6.089853,3.300851,-3.210680,...,-0.032723,-0.713152,0.140814,0.199548,4.182766,-0.089992,4.304443,0.082344,-0.401430,2.051414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WT-AB44-01A-11D-A41Q-05,-2.214337,-2.074693,2.956829,0.685125,0.808533,0.988514,-1.144675,-5.886374,2.697420,-3.082094,...,3.444016,4.137865,-3.988534,-0.987225,0.077987,0.211145,0.366014,-4.214572,-4.107949,2.298153
TCGA-XX-A899-01A-11D-A36K-05,-1.341284,-0.053995,3.229504,1.417839,1.567472,0.248378,-1.571977,-6.003130,2.916525,-2.522104,...,4.567456,-0.421801,-0.304679,0.392110,-4.618586,3.665946,4.402962,0.083974,0.022129,2.976968
TCGA-XX-A89A-01A-11D-A36K-05,-1.587865,-2.381672,2.903377,1.089578,0.352567,-1.384112,-1.927496,-6.348300,3.233553,-3.438578,...,-4.796471,4.584291,-4.020543,4.722984,4.321214,-0.058157,-0.311495,-4.777298,-1.015397,3.254974
TCGA-Z7-A8R5-01A-42D-A41Q-05,-1.738282,-0.923777,2.279867,0.067235,2.531381,0.006142,-0.650171,-5.950524,3.386057,-2.449242,...,-0.053493,-0.705414,-0.275756,4.136991,0.141094,2.718227,-4.002837,4.443683,-3.782001,2.022517


## Casting

In [9]:
def _to_numeric_or_keep(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# With "Encode Rows" checked the frame is flipped so the column-wise logic
# below runs on what were the rows; it is flipped back at the end of the cell.
if encode_rows.value:
    print('Transposing...')
    df = df.transpose()
    print('Transposed')

with Halo(text='Casting...', spinner='dots'):
    # Try to cast every column to a numeric dtype; columns that cannot be
    # converted are kept as they are. An explicit helper is used because
    # `errors='ignore'` is deprecated in recent pandas releases.
    df = df.apply(_to_numeric_or_keep)

with Halo(text='Selecting non-numeric columns...', spinner='dots'):
    # Exclude every numeric width (and booleans), matching the 'biufc' dtype
    # kinds that parse_column treats as numeric. Listing only int64/float64
    # would wrongly send e.g. int32 or float32 columns to the encoder.
    non_numeric = df.select_dtypes(exclude=['number', 'bool']).columns


encoded = 0
columns_n = len(non_numeric)
if columns_n > 0:
    # Encode all non-numeric columns (eg. columns with strings)
    for col in (bar := tqdm(non_numeric)):
        # The f-string keeps this working for labels that are not strings.
        bar.set_description(f'Encoding {col}')
        column = df[col]
        df[col] = encode_column(column)
        encoded += 1

if encode_rows.value:
    print('Transposing...')
    df = df.transpose()
    print('Transposed')

print(f'Encoded {encoded}/{columns_n} columns.')
df

Output()

Output()

Output()

Encoded 0/0 columns.


rownames,cg00000029,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,cg00000658,cg00000714,...,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873
TCGA-3C-AALI-01A-11D-A41Q-05,-2.603869,1.538224,2.873113,0.053155,-1.501188,-1.241061,-0.406432,-5.906057,3.094420,-2.750271,...,-0.141159,4.339062,0.794119,0.394742,0.010601,-3.195735,0.285399,4.656665,2.455302,-3.132444
TCGA-3C-AALJ-01A-31D-A41Q-05,-2.539703,1.226348,1.707185,-0.194401,2.509615,0.635800,1.727862,-5.810255,3.601902,-3.117922,...,3.510130,-5.216188,-3.878982,-0.249538,-0.348968,-3.805477,3.962994,1.326594,1.722707,0.071958
TCGA-3C-AALK-01A-11D-A41Q-05,-2.004853,0.550352,2.599570,0.374171,1.457006,0.456292,1.117586,-5.779576,3.650185,-2.954419,...,0.113601,-0.525432,-0.168674,0.327360,-4.182341,-3.356152,-4.587394,0.161638,2.121838,0.048997
TCGA-4H-AAAK-01A-12D-A41Q-05,-1.954741,-0.323426,2.765020,0.266002,1.968535,-0.179517,-1.915437,-5.734420,2.451984,-3.274049,...,3.638206,-5.413022,-0.158809,0.221935,4.140603,0.039376,3.858098,0.142678,-3.822093,1.813841
TCGA-5L-AAT0-01A-12D-A41Q-05,-1.674771,-2.869376,2.567064,0.175472,-0.515265,0.506464,-1.590725,-6.089853,3.300851,-3.210680,...,-0.032723,-0.713152,0.140814,0.199548,4.182766,-0.089992,4.304443,0.082344,-0.401430,2.051414
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WT-AB44-01A-11D-A41Q-05,-2.214337,-2.074693,2.956829,0.685125,0.808533,0.988514,-1.144675,-5.886374,2.697420,-3.082094,...,3.444016,4.137865,-3.988534,-0.987225,0.077987,0.211145,0.366014,-4.214572,-4.107949,2.298153
TCGA-XX-A899-01A-11D-A36K-05,-1.341284,-0.053995,3.229504,1.417839,1.567472,0.248378,-1.571977,-6.003130,2.916525,-2.522104,...,4.567456,-0.421801,-0.304679,0.392110,-4.618586,3.665946,4.402962,0.083974,0.022129,2.976968
TCGA-XX-A89A-01A-11D-A36K-05,-1.587865,-2.381672,2.903377,1.089578,0.352567,-1.384112,-1.927496,-6.348300,3.233553,-3.438578,...,-4.796471,4.584291,-4.020543,4.722984,4.321214,-0.058157,-0.311495,-4.777298,-1.015397,3.254974
TCGA-Z7-A8R5-01A-42D-A41Q-05,-1.738282,-0.923777,2.279867,0.067235,2.531381,0.006142,-0.650171,-5.950524,3.386057,-2.449242,...,-0.053493,-0.705414,-0.275756,4.136991,0.141094,2.718227,-4.002837,4.443683,-3.782001,2.022517


## Output to pickle

In [None]:
# Make sure the destination directory exists; to_pickle does not create it.
os.makedirs(output_root, exist_ok=True)
# The result keeps the file name of the selected input dataset.
path = os.path.join(output_root, file.value)

with Halo(text=f'Writing {path}...', spinner='dots'):
    df.to_pickle(path)
print('Done')

Output()