### 1. Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings

### 2. Extracting and saving zipfiles:

In [2]:
zip_files = [r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\training_text.zip", \
             r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\training_variants.zip"]

In [3]:
# import zipfile
# zipfile.__name__

In [4]:
import importlib
zipfile = importlib.import_module('zipfile')
# print(zipfile.__name__)

In [5]:
import os

base_path = r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts"
data_path = "data"
data_path = os.path.join(base_path,data_path)

for file in zip_files:
    if not os.path.exists(data_path):
        os.makedirs(data_path,exist_ok=True)

    with zipfile.ZipFile(file,'r') as zip_ref:
        zip_ref.extract(zip_ref.namelist().pop(),data_path)
        print("{} saved to {} folder".format(zip_ref.namelist().pop(), os.path.basename(data_path)))

training_text saved to data folder
training_variants saved to data folder


### 3. Reading Data:

In [6]:
training_variants = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\training_variants")

In [7]:
print("number of data points:",training_variants.shape[0])
print("number of features:",training_variants.shape[1])
print("features: ",training_variants.columns.values)

training_variants.head()

number of data points: 3321
number of features: 4
features:  ['ID' 'Gene' 'Variation' 'Class']


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4


#### Conclusion:

- `Gene`: the gene where this genetic mutation is located.

- `variation`: the aminoacid change for this mutation.

- `class`: 1-9 the class this genetic mutation has been classified on.

In [8]:
training_variants['Gene'].value_counts().to_frame()

Unnamed: 0_level_0,count
Gene,Unnamed: 1_level_1
BRCA1,264
TP53,163
EGFR,141
PTEN,126
BRCA2,125
...,...
FGF19,1
RAD51B,1
RAD51D,1
SHOC2,1


In [9]:
training_variants['Variation'].value_counts().to_frame()

Unnamed: 0_level_0,count
Variation,Unnamed: 1_level_1
Truncating Mutations,93
Deletion,74
Amplification,71
Fusions,34
Overexpression,6
...,...
D390Y,1
Q367P,1
M374V,1
Y371S,1


#### Conclusion:
- For each `Gene` we have multiple values of `Variation`.

In [10]:
# training_variants[(training_variants['Gene']=='BRCA1')]

In [11]:
# training_variants[(training_variants['Gene']=='BRCA1') & (training_variants['Variation']=='Deletion')]

In [12]:
training_text = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\training_text",\
                            sep='\|\|', engine="python",
                            names=['ID','Text'])

In [13]:
print("number of data points:",training_text.shape[0])
print("number of features:",training_text.shape[1])
print("features: ",training_text.columns.values)

training_text.head()

number of data points: 3322
number of features: 2
features:  ['ID' 'Text']


Unnamed: 0,ID,Text
0,"ID,Text",
1,0,Cyclin-dependent kinases (CDKs) regulate a var...
2,1,Abstract Background Non-small cell lung canc...
3,2,Abstract Background Non-small cell lung canc...
4,3,Recent evidence has demonstrated that acquired...


In [14]:
training_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3322 non-null   object
 1   Text    3316 non-null   object
dtypes: object(2)
memory usage: 52.0+ KB


In [15]:
training_text[training_text["Text"].isnull()==True]

Unnamed: 0,ID,Text
0,"ID,Text",
1110,1109,
1278,1277,
1408,1407,
1640,1639,
2756,2755,


#### conclusion:
- data with `ID` `[1109, 1277, 1407, 1639, 2755]` having `NaN` values in Text column.

In [16]:
training_variants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3321 entries, 0 to 3320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         3321 non-null   int64 
 1   Gene       3321 non-null   object
 2   Variation  3321 non-null   object
 3   Class      3321 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 103.9+ KB


In [17]:
print(training_variants["Gene"].isnull().sum())
print(training_variants["Variation"].isnull().sum())

0
0


#### Conclusion:

- No null/NaN values in training_variants file.

- As per problem statement for every ID in training_variant there will be corresponding training text, but in training variant we are having 1 data missing.