## Category: Getting Started

### Task 1: Import Modules and Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.calibration import CalibratedClassifierCV
from scipy.sparse import hstack
from collections import defaultdict, Counter

[nltk_data] Downloading package punkt to /Users/dolu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/dolu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Category: Loading Data

### Task 2: Load and Explore the Genes and Variations Dataset

In [6]:
# Read the dataset using the pandas library.
variants_df = pd.read_csv('../data/training_variants.txt')
# Print the number of data points (rows) and features (columns).
print(variants_df.shape)
# List the name of each feature.
print(variants_df.columns)
# Print the first five data points.
print(variants_df.head())
# Print the data type of each feature.
print(variants_df.dtypes)
# Print the number of unique categories of each feature.
print(variants_df.nunique())


(3321, 4)
Index(['ID', 'Gene', 'Variation', 'Class'], dtype='object')
   ID    Gene             Variation  Class
0   0  FAM58A  Truncating Mutations      1
1   1     CBL                 W802*      2
2   2     CBL                 Q249E      2
3   3     CBL                 N454D      3
4   4     CBL                 L399V      4
ID            int64
Gene         object
Variation    object
Class         int64
dtype: object
ID           3321
Gene          264
Variation    2996
Class           9
dtype: int64


### Task 3: Load the Text Dataset

In [7]:
# Load the dataset into a DataFrame using the pandas library
try:
    text_df = pd.read_csv('../data/training_text.txt', sep='||', engine='python')
except pd.errors.ParserError as e:
    print(f"Error occurred while reading the file: {e}")
    print("Attempting to read the file with a different approach...")
    
    # If the above method fails, try reading the file manually
    with open('../data/training_text.txt', 'r') as file:
        lines = file.readlines()
    
    # Split each line into ID and Text
    data = [line.strip().split('||', 1) for line in lines]
    
    # Create DataFrame
    text_df = pd.DataFrame(data, columns=['ID', 'Text'])
    
    # Convert ID to integer
    text_df['ID'] = pd.to_numeric(text_df['ID'], errors='coerce')

# Check if the DataFrame was created successfully
if text_df is not None and not text_df.empty:
    print("Dataset loaded successfully.")
else:
    print("Failed to load the dataset.")

# Compute the number of data points (rows) and features (columns)
num_rows, num_columns = text_df.shape
print(f"Number of data points (rows): {num_rows}")
print(f"Number of features (columns): {num_columns}")

# List the name of each feature.
print("Feature names:")
print(text_df.columns.tolist())

# Print the first five data points.
print(text_df.head(5))

Error occurred while reading the file: Expected 39677 fields in line 6, saw 41313. Error could possibly be due to quotes being ignored when a multi-char delimiter is used.
Attempting to read the file with a different approach...
Dataset loaded successfully.
Number of data points (rows): 3322
Number of features (columns): 2
Feature names:
['ID', 'Text']
    ID                                               Text
0  NaN                                               None
1  0.0  Cyclin-dependent kinases (CDKs) regulate a var...
2  1.0   Abstract Background  Non-small cell lung canc...
3  2.0   Abstract Background  Non-small cell lung canc...
4  3.0  Recent evidence has demonstrated that acquired...


## Category: Text Pre-processing

### Task 4: Define the Function for Pre-processing

In [24]:
# Create a set of the stopwords of English using NLTK.
stop_words = set(stopwords.words('english'))

# Define a function to perform cleaning and preprocessing on each text field. This function should:
def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = text.split()
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Join the tokens back into a string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

### Task 5: Pre-process the Data

In [25]:
nltk.download('punkt')

text_df['Cleaned_Text'] = text_df['Text'].apply(lambda x: preprocess_text(str(x)))


[nltk_data] Downloading package punkt to /Users/dolu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Preprocessed text:
cyclindependent kinases cdks regulate variety fundamental cellular processes cdk stands one last orphan cdks activating cyclin identified kinase activity revealed previous work shown cdk silencing increases ets vets erythroblastosis virus e oncogene homolog driven activation mapk pathway confers tamoxifen resistance breast cancer cells precise mechanisms cdk modulates ets activity generally functions cdk remain elusive demonstrate cdk cyclindependent kinase identifying cyclin activating cyclin cyclin orphan cyclin product fama whose mutations cause star syndrome human developmental anomaly whose features include toe syndactyly telecanthus anogenital renal malformations show star syndromeassociated cyclin mutants unable interact cdk cyclin silencing phenocopies cdk silencing increasing craf conferring tamoxifen resistance breast cancer cells cdkcyclin phosphorylates ets vitro cells positively controls ets degradation proteasome ets protein levels increased cells deriv

### Task 6: Merge Datasets, Clean, and Impute Values

## Category: Train-test Split

### Task 7: Perform Train-test Split

### Task 8: Check Distribution of Dataset

## Category: Measure Performance Using Random Model

### Task 9: Define a Function to Plot Performance Matrices

### Task 10: Measure Metrics from a Dummy Baseline Model

## Category: Encode the Features

### Task 11: Define the Functions for Response Coding

### Task 12: Run the Function on 'Gene' and 'Variation' Features

### Task 13: Count Words in Text Field

### Task 14: Define a Function for Response Coding

### Task 15: Run the Function on Text Field

### Task 16: One-hot Encode the Features

### Task 17: Normalizing the Text Feature

## Category: Check Feature Importances

### Task 18: Train Single Feature Models

## Category: Model Training

### Task 19: Stack the Features

### Task 20: Train a Logistic Regression Model

### Task 21: Train a Random Forest Model