#### 1. Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
import logging
import sklearn

In [2]:
stopwords = set(stopwords.words('english'))

### 2. Reading Data:

In [3]:
training_text = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\training_text",\
                            sep='\|\|', engine="python",
                            names=['ID','Text'])

In [4]:
training_text

Unnamed: 0,ID,Text
0,"ID,Text",
1,0,Cyclin-dependent kinases (CDKs) regulate a var...
2,1,Abstract Background Non-small cell lung canc...
3,2,Abstract Background Non-small cell lung canc...
4,3,Recent evidence has demonstrated that acquired...
...,...,...
3317,3316,Introduction Myelodysplastic syndromes (MDS) ...
3318,3317,Introduction Myelodysplastic syndromes (MDS) ...
3319,3318,The Runt-related transcription factor 1 gene (...
3320,3319,The RUNX1/AML1 gene is the most frequent targe...


### 3. Preprocessing:

#### 3.1 Handling NaN/None values:

In [5]:
training_text[training_text["Text"].isnull()==True]

Unnamed: 0,ID,Text
0,"ID,Text",
1110,1109,
1278,1277,
1408,1407,
1640,1639,
2756,2755,


In [6]:
training_text = (
    training_text.iloc[1:]
    .dropna(subset=["Text"])
)

In [7]:
training_text

Unnamed: 0,ID,Text
1,0,Cyclin-dependent kinases (CDKs) regulate a var...
2,1,Abstract Background Non-small cell lung canc...
3,2,Abstract Background Non-small cell lung canc...
4,3,Recent evidence has demonstrated that acquired...
5,4,Oncogenic mutations in the monomeric Casitas B...
...,...,...
3317,3316,Introduction Myelodysplastic syndromes (MDS) ...
3318,3317,Introduction Myelodysplastic syndromes (MDS) ...
3319,3318,The Runt-related transcription factor 1 gene (...
3320,3319,The RUNX1/AML1 gene is the most frequent targe...


In [8]:
training_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3316 entries, 1 to 3321
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3316 non-null   object
 1   Text    3316 non-null   object
dtypes: object(2)
memory usage: 77.7+ KB


#### 3.2 Cleaning Data:

- 1. Removing all special characters.

- 2. converting double space to single space.

- 3. convertig all character to lower case.

- 4. removing stop words.


##### NOTE:

In [9]:
logs_dir = "logs"
base_path = r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts"
logs_dir = os.path.join(base_path,logs_dir)

In [10]:
def return_logger(file_name,idx):
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir, exist_ok=True)
        
    log_file_path = os.path.join(logs_dir, f"preprocessing_{file_name}_log_file_{idx}.log")

    logger = logging.getLogger()

    # Remove all existing handlers
    if logger.hasHandlers():
        logger.handlers.clear()

    logging.basicConfig(filename=log_file_path, format='%(asctime)s %(message)s', filemode='w')
    
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    return logger

In [11]:
def preprocessing(text):
    if type(text) is not int:
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        text = re.sub('\s+', ' ', text)
        text = text.lower()
        text = [word for word in text.split() if word not in stopwords]
        text = " ".join(text)
        return text

In [12]:
preprocessed_text = []

In [13]:
logger = return_logger("training_text",1)

for id,txt in enumerate(training_text["Text"]):
    preprocessed_text.append(preprocessing(txt))
    logger.info(f"data preprocessed at ID-{id}")

print("Data preprocessing complete..")

Data preprocessing complete..


In [14]:
training_text["Text"] = preprocessed_text

- saving preprocessed `training_text.csv` file.

In [15]:
pre_data_path = r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\pre_training_text.csv"

In [16]:
training_text.to_csv(pre_data_path, index=False)

In [20]:
pd.read_csv(pre_data_path).head(5)

Unnamed: 0,ID,Text
0,0,cyclin dependent kinases cdks regulate variety...
1,1,abstract background non small cell lung cancer...
2,2,abstract background non small cell lung cancer...
3,3,recent evidence demonstrated acquired uniparen...
4,4,oncogenic mutations monomeric casitas b lineag...


In [17]:
training_variants = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\training_variants")

In [18]:
training_variants

Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4
...,...,...,...,...
3316,3316,RUNX1,D171N,4
3317,3317,RUNX1,A122*,1
3318,3318,RUNX1,Fusions,1
3319,3319,RUNX1,R80C,4


In [21]:
Class = training_variants["Class"]
training_variants = training_variants.drop(columns="Class")

In [22]:
training_variants

Unnamed: 0,ID,Gene,Variation
0,0,FAM58A,Truncating Mutations
1,1,CBL,W802*
2,2,CBL,Q249E
3,3,CBL,N454D
4,4,CBL,L399V
...,...,...,...
3316,3316,RUNX1,D171N
3317,3317,RUNX1,A122*
3318,3318,RUNX1,Fusions
3319,3319,RUNX1,R80C


#### 3.3 concatenating both `training_variants` and `training_text` to create final_data for training.

In [23]:
training_text = training_text.drop(columns="ID")

In [24]:
final_data = pd.concat([training_variants,training_text], axis=1)
final_data["Class"] = Class

In [25]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         3321 non-null   float64
 1   Gene       3321 non-null   object 
 2   Variation  3321 non-null   object 
 3   Text       3316 non-null   object 
 4   Class      3321 non-null   float64
dtypes: float64(2), object(3)
memory usage: 129.9+ KB


In [31]:
final_data = (
    final_data
    .dropna(subset=["ID"], axis=0)
    .dropna(subset=["Text"],axis=0)
)

In [32]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3315 entries, 1 to 3320
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         3315 non-null   float64
 1   Gene       3315 non-null   object 
 2   Variation  3315 non-null   object 
 3   Text       3315 non-null   object 
 4   Class      3315 non-null   float64
dtypes: float64(2), object(3)
memory usage: 155.4+ KB


In [33]:
final_data["ID"] = final_data.ID.astype("int")

In [42]:
final_data.to_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\final_data.csv", index=False)

In [43]:
final_data = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\final_data.csv")

In [52]:
print("number of data points:",final_data.shape[0])
print("number of features:",final_data.shape[1])
print("features: ",final_data.columns.values)

display(final_data.head(5))

number of data points: 3315
number of features: 5
features:  ['ID' 'Gene' 'Variation' 'Text' 'Class']


Unnamed: 0,ID,Gene,Variation,Text,Class
0,1,CBL,W802*,cyclin dependent kinases cdks regulate variety...,2.0
1,2,CBL,Q249E,abstract background non small cell lung cancer...,2.0
2,3,CBL,N454D,abstract background non small cell lung cancer...,3.0
3,4,CBL,L399V,recent evidence demonstrated acquired uniparen...,4.0
4,5,CBL,V391I,oncogenic mutations monomeric casitas b lineag...,4.0


In [53]:
display(display(final_data.info()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3315 entries, 0 to 3314
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         3315 non-null   int64  
 1   Gene       3315 non-null   object 
 2   Variation  3315 non-null   object 
 3   Text       3315 non-null   object 
 4   Class      3315 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 129.6+ KB


None

None