In [151]:
# Workaround to import from dir above. In this case from "../src"
import sys
sys.path.append("..")

In [152]:
# Autoreload modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports

In [153]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import src.utils.text_utils as txt_utils
from collections import Counter

# Global Variables

In [154]:
data_path = "../newHIV-1_data/"
preprocessed_data_path = "../preprocessed_data/"

# Load Data

## 746 Dataset

In [155]:
df_746 = pd.read_csv(data_path+"746Data.txt", sep=',', header=0, names=["octamer", "label"])
df_746.head(2)

Unnamed: 0,octamer,label
0,AAAMKRHG,-1
1,AAAMSSAI,-1


## 1625 Dataset

In [156]:
df_1625 = pd.read_csv(data_path+"1625Data.txt", sep=',', header=0, names=["octamer", "label"])
df_1625.head(2)

Unnamed: 0,octamer,label
0,AECFRIFD,1
1,HLVEALYL,1


## Impens Dataset

In [157]:
df_impens = pd.read_csv(data_path+"impensData.txt", sep=',', header=0, names=["octamer", "label"])
df_impens.head(2)

Unnamed: 0,octamer,label
0,AAAVDAGM,-1
1,AAGKSGGG,-1


## Schilling Dataset

In [158]:
df_schilling = pd.read_csv(data_path+"schillingData.txt", sep=',', header=0, names=["octamer", "label"])
df_schilling.head(2)

Unnamed: 0,octamer,label
0,AAAAPAKV,-1
1,AAAELGAR,-1


# Orthogonal Binary Representation

Author comments about the sequences representation shows that there are 20 allowed letters in the 8 character string (the allowed alphabet): `'ARNDCQEGHILKMFPSTWYV'`. These represent amino acids.

Also the 8 letter string can also be viewed as 8 independent attributes. It is common to map the 8 letters to an orthogonal binary representation; a matrix with 160 elements.

Ex.: 

$AAAMKRHG = \begin{bmatrix}
1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0\\
0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0\\
0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0
\end{bmatrix}$

In [159]:
dfs = [df_746, df_1625, df_impens, df_schilling]

In [162]:
octamers = []
for df in dfs:
    df['octamer_split'] = txt_utils.split_text(df, 'octamer')
    octamers.append(np.hstack(df['octamer_split'].values))
    
octamers = np.hstack(octamers)

allowed_alphabet = Counter(' '.join(''.join(octamers.tolist())))
allowed_alphabet.pop(' ', None)
allowed_alphabet = list(allowed_alphabet.keys())
allowed_alphabet.sort()
lb = LabelBinarizer()
lb.fit(allowed_alphabet)

for df in dfs:
    df['octamer_orthogonal'] = [lb.transform(i) for i in df['octamer_split']]
    df['octamer_orthogonal'] = df['octamer_orthogonal'].apply(lambda x: np.hstack(x))
    df['label'] = np.where(df['label'] == 1, 1, 0)

In [163]:
df_746 = pd.concat([ df_746, pd.DataFrame(df_746['octamer_orthogonal'].to_list())], axis=1).drop(columns=['octamer_split', 'octamer_orthogonal'], axis=1)
df_1625 = pd.concat([ df_1625, pd.DataFrame(df_1625['octamer_orthogonal'].to_list())], axis=1).drop(columns=['octamer_split', 'octamer_orthogonal'], axis=1)
df_impens = pd.concat([ df_impens, pd.DataFrame(df_impens['octamer_orthogonal'].to_list())], axis=1).drop(columns=['octamer_split', 'octamer_orthogonal'], axis=1)
df_schilling = pd.concat([ df_schilling, pd.DataFrame(df_schilling['octamer_orthogonal'].to_list())], axis=1).drop(columns=['octamer_split', 'octamer_orthogonal'], axis=1)

In [164]:
df_746.to_csv(preprocessed_data_path+'data_746_preprocessed.csv', sep=';', index=False, encoding='utf-8')
df_1625.to_csv(preprocessed_data_path+'data_1625_preprocessed.csv', sep=';', index=False, encoding='utf-8')
df_impens.to_csv(preprocessed_data_path+'data_impens_preprocessed.csv', sep=';', index=False, encoding='utf-8')
df_schilling.to_csv(preprocessed_data_path+'data_schilling_preprocessed.csv', sep=';', index=False, encoding='utf-8')