In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,normalized_mutual_info_score
from sklearn.metrics._classification import accuracy_score, log_loss
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import SVC 
# from sklearn.cross_validation import StratifiedClassifierCV
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn import model_selection

from sklearn.ensemble import RandomForestClassifier
import math

from imblearn.over_sampling import SMOTE
from mlxtend.classifier import StackingClassifier

from collections import Counter,defaultdict
from scipy.sparse import hstack

import pandas as pd

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
final_data = pd.read_csv(r"U:\nlp_project\Personalized-Medicine-Redefining-Cancer-Treatment\artifacts\data\final_data.csv")

In [None]:
final_data.head(5)

In [None]:
final_data.info()

In [None]:
# final_data.shape[0]

### 1. train-test-split:

In [None]:
y_true = final_data['Class']

In [None]:
# Collapse every run of whitespace inside the categorical values into a single
# underscore so that CountVectorizer later treats each value as one token.
# regex=True is required: recent pandas versions treat the pattern as a literal
# string by default, in which case '\s+' would never match anything.
final_data["Gene"] = final_data["Gene"].str.replace(r"\s+", "_", regex=True)
# Clean the Variation column from *its own* values (it was previously being
# overwritten with a copy of the Gene column).
final_data["Variation"] = final_data["Variation"].str.replace(r"\s+", "_", regex=True)

# 80/20 split into (train + validation) / test, then 80/20 again into train / validation.
X_train, X_test, y_train, y_test = train_test_split(final_data, y_true, random_state=42, test_size=0.20)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=45, test_size=0.20)

In [None]:
print(f"Number of datapoints in train dataset: {X_train.shape[0]}")
print(f"Number of datapoints in test dataset: {X_test.shape[0]}")
print(f"Number of datapoints in validation dataset: {X_val.shape[0]}")

### 2. Distribution of classes in the train, test and validation splits:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
train_class_distibution = y_train.value_counts()
test_class_distribution = y_test.value_counts()
validation_class_distribution = y_val.value_counts()

In [None]:
train_class_distibution.plot(kind='bar')
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("train_class_distibution")

- So we have a multi-class classification problem with highly imbalanced data.

In [None]:
test_class_distribution.plot(kind="bar")
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("test_class_distribution")

- The train and test sets have almost the same class distribution.

In [None]:
validation_class_distribution.plot(kind="bar")
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("validation_class_distribution")

#### Are the Gene and Variation features stable across the train, test and validation data?

In [None]:
len(set(X_train["Gene"]))

In [None]:
# Count how many test / validation ROWS carry a Gene value that was also seen
# in the training split. Counting rows (rather than distinct overlapping values)
# makes the number directly comparable with the "out of <row count>" total.
train_genes = set(X_train["Gene"])
test_coverage = X_test["Gene"].isin(train_genes).sum()
validation_coverage = X_val["Gene"].isin(train_genes).sum()

print(f"in test data {test_coverage} out of {X_test.shape[0]}")
print(f"in validation data {validation_coverage} out of {X_val.shape[0]}")

In [None]:
# Count how many test / validation ROWS carry a Variation value that was also
# seen in the training split. Counting rows (rather than distinct overlapping
# values) makes the number comparable with the "out of <row count>" total.
train_variations = set(X_train["Variation"])
test_coverage = X_test["Variation"].isin(train_variations).sum()
validation_coverage = X_val["Variation"].isin(train_variations).sum()

print(f"in test data {test_coverage} out of {X_test.shape[0]}")
print(f"in validation data {validation_coverage} out of {X_val.shape[0]}")

In [None]:
# For every training document, add each distinct whitespace-separated token once.
# The resulting list therefore contains a word once per document it occurs in,
# which is what the value_counts() in the next cell relies on (document frequency).
unique_words = []
for txt in X_train["Text"]:
    unique_words.extend(set(txt.split()))

# The list still holds duplicates across documents, so de-duplicate before
# reporting the number of distinct words.
print(f"total unique words {len(set(unique_words))}")

In [None]:
unique_words = pd.Series(unique_words)
unique_words_value_counts = unique_words.value_counts()

In [None]:
unique_words_value_counts

In [None]:
s = sum(unique_words_value_counts.values);
h = unique_words_value_counts.values/s;

plt.plot(h, label="Histogram of unique words")
plt.xlabel("Index of words")
plt.ylabel("Number of occurences")
plt.legend()
plt.grid()
plt.show()

In [None]:
import numpy as np

In [None]:
# Cumulative share of the word frequencies in `h` (words sorted from most to
# least frequent by value_counts), i.e. how much mass the top-k words cover.
c = np.cumsum(h)
plt.plot(c, label="Cumulative distribution of words")
plt.grid(axis="both")
plt.legend()
plt.show()

### 3 Prediction Using Random Model:

In [None]:
def plot_confusion_matrics(y_test, y_pred):
    """Show the confusion, precision and recall matrices as three heatmaps.

    Parameters
    ----------
    y_test : array-like
        True class labels; the tick labels assume the classes are 1..9.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.

    Returns
    -------
    None
        The three figures are displayed with ``plt.show()``.
    """
    label = [1,2,3,4,5,6,7,8,9]
    # Pin the class order so the matrix is always 9x9 and its rows/columns line
    # up with the tick labels, even if a class is missing from y_test / y_pred.
    C = confusion_matrix(y_test, y_pred, labels=label)

    # Recall matrix: every row (true class) is divided by its row total.
    A = C / C.sum(axis=1, keepdims=True)

    # Precision matrix: every column (predicted class) is divided by its column total.
    # A class with no true / no predicted samples produces NaN cells in A / B.
    B = C / C.sum(axis=0, keepdims=True)

    heatmaps = (
        ("Confusion Matrix", C, "d"),  # raw integer counts
        ("Precision Matrix (Column sum=1)", B, ".3f"),
        ("Recall Matrix (Row sum=1)", A, ".3f"),
    )
    for banner, matrix, number_format in heatmaps:
        print("-"*80, banner, "-"*20)
        plt.figure(figsize=(20,7))
        sns.heatmap(matrix, annot=True, fmt=number_format, xticklabels=label,yticklabels=label)
        plt.xlabel("Predicted Values")
        plt.ylabel("Original Values")
        plt.show()

In [None]:
import numpy as np
from sklearn.metrics._classification import log_loss

#### 3.1: validation-set-error:

In [None]:
# Baseline: assign every validation sample a random probability vector over the
# 9 classes and measure its log-loss. Any useful model must beat this number.
cv_data_len = X_val.shape[0]
cv_predicted_y = np.zeros((cv_data_len,9)) # one row per validation sample, one column per class (9 classes)
for i in range(cv_data_len):
    random_probability = np.random.rand(1,9)
    # Divide by the total so the 9 random values form a probability distribution (sum to 1).
    cv_predicted_y[i] = random_probability / random_probability.sum()
print("log-loss on cross validation data using Random Model", log_loss(y_val,cv_predicted_y))

#### 3.2 Test-Set-Error:

In [None]:
# Same random baseline as for the validation split, evaluated on the test split.
test_data_len = X_test.shape[0]

test_predicted_y = np.zeros((test_data_len,9)) # one row per test sample, one column per class (9 classes)
for i in range(test_data_len):
    random_probability = np.random.rand(1,9)
    # Divide by the total so the 9 random values form a probability distribution (sum to 1).
    test_predicted_y[i] = random_probability / random_probability.sum()
print("log-loss on test data using Random Model", log_loss(y_test,test_predicted_y))

In [None]:
# argmax returns a 0-based column index, while the class labels run from 1 to 9
# (column i holds the probability of class i + 1), so shift by one.
predicted_y = np.argmax(test_predicted_y, axis=1) + 1

In [None]:
plot_confusion_matrics(y_test, predicted_y)

### 4. Are the Gene and Variation features important for predicting the class?

#### 4.1: Deciding whether the Gene feature is good at predicting the class:

In [None]:
count_vectorizer_gine = CountVectorizer()

gene_x_train_ohe = count_vectorizer_gine.fit_transform(X_train["Gene"])
gene_x_test_ohe = count_vectorizer_gine.transform(X_test["Gene"])
gene_x_val_ohe = count_vectorizer_gine.transform(X_val["Gene"])

In [None]:
alpha = [10 ** x for x in range(-5,1)]

cv_log_error_array = []

for i in alpha:
    clf = SGDClassifier(alpha=i,penalty="l2", loss="log_loss",random_state=42)
    clf.fit(gene_x_train_ohe, y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(gene_x_train_ohe, y_train)
    pred_y = sig_clf.predict_proba(gene_x_val_ohe)
    cv_log_error_array.append(log_loss(y_val,pred_y,labels=clf.classes_))

    print(f"for value of alpha {i},The log_loss is: {log_loss(y_val,pred_y)}")

#### Training Gene feature on best found alpha value:

In [None]:
best_alpha= np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha],penalty="l2", loss="log_loss",random_state=42)
clf.fit(gene_x_train_ohe, y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(gene_x_train_ohe, y_train)


pred_y = sig_clf.predict_proba(gene_x_train_ohe)
print("log loss for train: ", log_loss(y_train,pred_y))
pred_y = sig_clf.predict_proba(gene_x_test_ohe)
print("log loss for test: ", log_loss(y_test,pred_y))
pred_y = sig_clf.predict_proba(gene_x_val_ohe)
print("log loss for validation: ", log_loss(y_val,pred_y))

#### 4.2: Deciding whether the Variation feature is good at predicting the class:

In [None]:
count_vectorizer_variation = CountVectorizer()

Variation_x_train_ohe = count_vectorizer_variation.fit_transform(X_train["Variation"])
Variation_x_test_ohe = count_vectorizer_variation.transform(X_test["Variation"])
Variation_x_val_ohe = count_vectorizer_variation.transform(X_val["Variation"])

#### Training Variation feature on best found alpha value:


In [None]:
alpha = [10 ** x for x in range(-5,1)]

cv_log_error_array = []

for i in alpha:
    clf = SGDClassifier(alpha=i,penalty="l2", loss="log_loss",random_state=42)
    clf.fit(Variation_x_train_ohe, y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(Variation_x_train_ohe, y_train)
    pred_y = sig_clf.predict_proba(Variation_x_val_ohe)
    cv_log_error_array.append(log_loss(y_val,pred_y,labels=clf.classes_))

    print(f"for value of alpha {i},The log_loss is: {log_loss(y_val,pred_y)}")

In [None]:
best_alpha= np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha],penalty="l2", loss="log_loss",random_state=42)
clf.fit(Variation_x_train_ohe, y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(Variation_x_train_ohe, y_train)


pred_y = sig_clf.predict_proba(Variation_x_train_ohe)
print("log loss for train: ", log_loss(y_train,pred_y))
pred_y = sig_clf.predict_proba(Variation_x_test_ohe)
print("log loss for test: ", log_loss(y_test,pred_y))
pred_y = sig_clf.predict_proba(Variation_x_val_ohe)
print("log loss for validation: ", log_loss(y_val,pred_y))

In [None]:
conclusion_df = {
    "cv_loss": [1.14,1.16,2.49],
    "test_loss": [1.16,1.14,2.48],
}

pd.DataFrame(conclusion_df, index=["gene", "variation", "random model"])

#### conclusion:
- As we can see, both features are important (each one alone clearly beats the random model), so keep both features for predicting the class.

#### 4.3: Deciding whether the Text feature is good at predicting the class:

- How many unique words are present in the train data?
- How are the word frequencies distributed?
- How should the text field be featurized?
- Is the text feature useful for predicting y?
- Is the text feature stable across the train, test and validation data?

In [None]:
# Bag-of-words encoding of the Text column, fitted on the training split only.
text_vectorizer = CountVectorizer(min_df=3) # min_df=3: keep only terms that appear in at least 3 training documents
train_txt_ohe = text_vectorizer.fit_transform(X_train["Text"])
train_text_features = text_vectorizer.get_feature_names_out()

# Column sums = total number of occurrences of each term over all training
# documents; .A1 flattens the 1 x n_terms matrix into a 1-D array.
train_text_feature_counts = train_txt_ohe.sum(axis=0).A1

# term -> total occurrence count in the training split
text_feature_dict = dict(zip(list(train_text_features), train_text_feature_counts))
print(f"total_number of unique words in train data: {len(train_text_features)}")

In [None]:
# https://stackoverflow.com/a/16202486

# convert each row values such that they sum sum to 1, i.e normalize response coding

In [None]:
# Rescale each document's count vector (each ROW, not each feature column):
# sklearn's normalize defaults to axis=1 with the L2 norm, so every row ends up
# with unit Euclidean length. Test/validation reuse the vectorizer fitted on train.
train_txt_ohe = normalize(train_txt_ohe)
test_text_ohe = normalize(text_vectorizer.transform(X_test["Text"]))
val_text_ohe = normalize(text_vectorizer.transform(X_val["Text"]))

In [None]:
# # https://stackoverflow.com/a/2258273/4084039
# sorted_text_feature_dict = dict(sorted(text_feature_dict.items(), key=lambda x:x[1]))
# sorted_text_occur = np.array(list(sorted_text_feature_dict.values()))

In [None]:
# print(Counter(sorted_text_occur))

- 5306 word occur 3 times
- 4037 words occur 4 times
-
-
-
- and so on

#### Training text feature on best found alpha value:

In [None]:
alpha = [10 ** x for x in range(-5,1)]

cv_log_error_array = []

for i in alpha:
    clf = SGDClassifier(alpha=i,penalty="l2", loss="log_loss",random_state=42)
    clf.fit(train_txt_ohe, y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(train_txt_ohe, y_train)
    pred_y = sig_clf.predict_proba(val_text_ohe)
    cv_log_error_array.append(log_loss(y_val,pred_y,labels=clf.classes_))

    print(f"for value of alpha {i},The log_loss is: {log_loss(y_val,pred_y)}")

In [None]:
best_alpha= np.argmin(cv_log_error_array)
clf = SGDClassifier(alpha=alpha[best_alpha],penalty="l2", loss="log_loss",random_state=42)
clf.fit(train_txt_ohe, y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(train_txt_ohe, y_train)


pred_y = sig_clf.predict_proba(train_txt_ohe)
print("log loss for train: ", log_loss(y_train,pred_y))
pred_y = sig_clf.predict_proba(test_text_ohe)
print("log loss for test: ", log_loss(y_test,pred_y))
pred_y = sig_clf.predict_proba(val_text_ohe)
print("log loss for validation: ", log_loss(y_val,pred_y))

In [None]:
conclusion_df = {
    "cv_loss": [1.14,1.16,1.34,2.49],
    "test_loss": [1.16,1.14,1.32,2.48],
}

pd.DataFrame(conclusion_df, index=["gene", "variation", "text","random model"])

In [None]:
def get_intersaction_text(df):
    """Measure how much of a split's text vocabulary also occurs in the training vocabulary.

    Parameters
    ----------
    df : pandas.DataFrame
        A data split with a "Text" column.

    Returns
    -------
    tuple[int, int]
        ``(len1, len2)`` where ``len1`` is the number of distinct terms found in
        ``df["Text"]`` and ``len2`` is how many of those terms are also present
        in the notebook-level ``train_text_features``.
    """
    # Use the same document-frequency cut-off as the training vectorizer
    # (min_df=3). The previous max_df=3 kept only the rarest terms instead,
    # which made the overlap with the training vocabulary meaningless.
    df_text_vec = CountVectorizer(min_df=3)
    df_text_vec.fit(df["Text"])
    df_text_feature = set(df_text_vec.get_feature_names_out())

    len1 = len(df_text_feature)
    len2 = len(set(train_text_features) & df_text_feature)
    return len1,len2

In [None]:
# len1 = number of distinct terms in the split, len2 = how many of them also
# occur in the training vocabulary, so the overlap percentage is len2 / len1
# for both splits.
len1, len2 = get_intersaction_text(X_test)
print(np.round((len2/len1)*100,3), "% of word of test data appeared in training data.")
len1, len2 = get_intersaction_text(X_val)
print(np.round((len2/len1)*100,3), "% of word of validation data appeared in training data.")

## 5. Machine Learning Models:

In [None]:
y_train = y_train.astype("int")
y_test = y_test.astype("int")
y_val = y_val.astype("int")

In [None]:
def predict_and_plot_confusion_matrix(train_x, train_y, x_test, y_test, clf):
    """Fit ``clf``, calibrate it, and report log-loss, error count and confusion matrices.

    Parameters
    ----------
    train_x, train_y
        Training features and labels used to fit both ``clf`` and the
        sigmoid-calibrated wrapper around it.
    x_test, y_test
        Evaluation features and their true labels.
    clf
        An unfitted scikit-learn style classifier; it is fitted in place.

    Returns
    -------
    None
        Results are printed and plotted via ``plot_confusion_matrics``.
    """
    clf.fit(train_x,train_y)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    sig_clf.fit(train_x, train_y)
    y_pred = sig_clf.predict(x_test)
    y_proba = sig_clf.predict_proba(x_test)

    print(f"Log_loss {log_loss(y_test, y_proba)}")
    # Compare predictions with the true LABELS (not with the feature matrix);
    # np.asarray drops any pandas index so the comparison is purely positional.
    print(f"number of missclassified points: {np.count_nonzero(y_pred != np.asarray(y_test))}")

    plot_confusion_matrics(y_test,y_pred)

In [None]:
def report_log_loss(X_train, y_train, X_test, y_test, clf):
    """Fit ``clf``, calibrate it with a sigmoid, and return its log-loss on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Evaluation features and their true labels.
    clf
        An unfitted scikit-learn style classifier; it is fitted in place.

    Returns
    -------
    float
        Log-loss of the calibrated probabilities on ``(X_test, y_test)``.
    """
    clf.fit(X_train, y_train)
    sig_clf = CalibratedClassifierCV(clf, method="sigmoid")
    # The calibrated wrapper has to be fitted itself before it can predict;
    # without this call predict_proba raises NotFittedError.
    sig_clf.fit(X_train, y_train)
    sig_clf_probs = sig_clf.predict_proba(X_test)
    # The `eps` keyword was dropped: it is deprecated and later removed from
    # scikit-learn's log_loss, and 1e-15 was its historical default anyway.
    return log_loss(y_test, sig_clf_probs)

In [None]:
# # For Naive Bias Classifier
# def important_feature_names(indices, text, gene, var, no_features):
#     gene_count_vectorizer = CountVectorizer()
#     variance_count_vectorizer = CountVectorizer()
#     text_count_vectorizer = CountVectorizer(min_df=3)

#     gene_vec = gene_count_vectorizer.fit(X_train["Gene"])
#     variance_vec = variance_count_vectorizer.fit(X_train["Variance"])    
#     text_vec = text_count_vectorizer.fit(X_train["Text"])

#     fea1_len = len(gene_vec.get_feature_names_out())
#     fea2_len = len(variance_vec.get_feature_names_out())

#     word_present = 0
#     for i,v in enumerate(indices):
#         if (v < fea1_len):
#             word = gene_vec.get_feature_names_out()[v]
#             yes_no = True if word == gene else False
#             if yes_no:
#                 word_present += 1
#                 print(i, "Gene feature [{}] present in test data point [{}]")
        
#         elif (v < fea1_len + fea2_len):
#             word = variance_vec.get_feature_names_out()[v-(fea1_len)]
#             yes_no = True if word==var else False
#             if yes_no:
#                 word_present += 1
#                 print(i, "Variation feature [{}] present in test data point [{}]")

#         else:
#             word = text_vec.get_feature_names_out()[v]
#             yes_no = True if word == gene else False
#             if yes_no:
#                 word_present += 1
#                 print(i, "text feature [{}] present in test data point [{}]")
    
#     print("Out of the top ", no_features, "features", word_present, "are present")

### Preparing train-test-validation data:

### 1. CountVectorizer:

In [None]:
train_df = hstack((gene_x_train_ohe,Variation_x_train_ohe))
train_df_ohe = hstack((train_df, train_txt_ohe))

In [None]:
test_df = hstack((gene_x_test_ohe,Variation_x_test_ohe))
test_df_ohe = hstack((test_df, test_text_ohe))

In [None]:
validation_df = hstack((gene_x_val_ohe,Variation_x_val_ohe))
validation_df_ohe = hstack((validation_df, val_text_ohe))

In [None]:
print("One hot encoding features :")

print("(Numner of data points * Number of features) in train data = ", train_df_ohe.shape)
print("(Numner of data points * Number of features) in test data = ", test_df_ohe.shape)
print("(Numner of data points * Number of features) in validation data = ", validation_df_ohe.shape)

### 2. ResponseCoding :

In [None]:
from response_encoding import CategoricalMeanValueReplacement,TextMeanValueReplacement
vectorizer = CountVectorizer()

In [None]:
gene_cmvr = CategoricalMeanValueReplacement()
variation_cmvr = CategoricalMeanValueReplacement()
tmvr = TextMeanValueReplacement(X_train, "Text", "Class", 1, 9, vectorizer)

In [None]:
gene_cmvr.fit(1,X_train,"Gene","Class")
variation_cmvr.fit(1,X_train,"Variation","Class")
tmvr.fit()

In [None]:
train_gene_response_encoding = gene_cmvr.transform(X_train)
test_gene_response_encoding = gene_cmvr.transform(X_test)
validation_gene_response_encoding = gene_cmvr.transform(X_val)

In [None]:
train_variation_response_encoding = variation_cmvr.transform(X_train)
test_variation_response_encoding = variation_cmvr.transform(X_test)
validation_variation_response_encoding = variation_cmvr.transform(X_val)

In [None]:
train_text_response_encoding = tmvr.transform(X_train,"Class")
test_text_response_encoding = tmvr.transform(X_test,"Class")
validation_text_response_encoding = tmvr.transform(X_val,"Class")

In [None]:
import numpy as np

In [None]:
train_df_re = np.hstack((train_gene_response_encoding,train_variation_response_encoding))
train_df_re = np.hstack((train_df_re,train_text_response_encoding))

In [None]:
test_df_re = np.hstack((test_gene_response_encoding,test_variation_response_encoding))
test_df_re = np.hstack((test_df_re,test_text_response_encoding))

In [None]:
validation_df_re = np.hstack((validation_gene_response_encoding,validation_variation_response_encoding))
validation_df_re = np.hstack((validation_df_re,validation_text_response_encoding))

In [None]:
# These are the response-coded matrices (the *_re frames), not the one-hot ones.
print("Response coding features :")

print("(Numner of data points * Number of features) in train data = ", train_df_re.shape)
print("(Numner of data points * Number of features) in test data = ", test_df_re.shape)
print("(Numner of data points * Number of features) in validation data = ", validation_df_re.shape)

#### 5.1 Naive Bayes:

In [None]:
alpha = [0.00001,0.0001,0.001,0.1,1,10,100,1000]
cv_log_error_array = []
for i in alpha:
    print(f"for alpha {i}")
    clf = MultinomialNB(alpha=i)
    clf.fit(train_df_ohe, y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(train_df_ohe, y_train)
    sig_clf_prob = sig_clf.predict_proba(validation_df_ohe)
    print(len(y_val), len(sig_clf_prob))
    cv_log_error_array.append(log_loss(y_val, sig_clf_prob, labels=clf.classes_))
    print("log loss :", log_loss(y_val, sig_clf_prob))

fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array, c="g")
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.legend()

best_alpha = np.argmin(cv_log_error_array)
clf = MultinomialNB(alpha=alpha[best_alpha])
clf.fit(train_df_ohe, y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(train_df_ohe, y_train)


sig_clf_prob = sig_clf.predict_proba(train_df_ohe)
print("for best alpha ", alpha[best_alpha], "The train log loss is: ",log_loss(y_train, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(validation_df_ohe)
print("for best alpha ", alpha[best_alpha], "The validation log loss is: ",log_loss(y_val, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(test_df_ohe)
print("for best alpha ", alpha[best_alpha], "The test log loss is: ",log_loss(y_test, sig_clf_prob, labels=clf.classes_))

In [None]:
# from confusion matrix: of all the points get predicted as class-1 among them 47% are actually belongs to class-1
# from precision matrix: of all the points get predicted as class-1 among them 30% are actually belongs to class-4
# from recall matrix: of all the points belongs class-6 among them 13% are predicted as class-1, 39% predicted as class-6 it self

In [None]:
sig_clf_prob = sig_clf.predict(validation_df_ohe)
plot_confusion_matrics(y_val,sig_clf_prob)

### 5.2 K-Nearest Neighbour Classification:

In [None]:
# Candidate values of k for the K-nearest-neighbour classifier.
neighbors = [5,11,15,21,31,41,51,99]

# --- Hyper-parameter search: validation log-loss for every k -----------------
cv_log_error_array = []
for k in neighbors:
    print(f"for neighbor {k}")
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(train_df_re, y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(train_df_re, y_train)
    sig_clf_prob = sig_clf.predict_proba(validation_df_re)
    cv_log_error_array.append(log_loss(y_val, sig_clf_prob, labels=clf.classes_))
    print("log loss :", log_loss(y_val, sig_clf_prob))

# --- Plot validation error against k -----------------------------------------
fig, ax = plt.subplots()
ax.plot(neighbors, cv_log_error_array, c="g", label="validation log loss")
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((neighbors[i],str(txt)), (neighbors[i], cv_log_error_array[i]))
plt.grid()
# The x axis is plotted on the raw k values, so the ticks must be the raw
# values as well (log10 ticks would all be squeezed near the origin).
plt.xticks(neighbors)


plt.title("Cross Validation Error for each k")
plt.xlabel("neighbors i's")
plt.ylabel("Error measure")
plt.legend()

# --- Refit with the best k and report train / validation / test loss ---------
best_k = np.argmin(cv_log_error_array)
# Index with best_k: the previous code used `best_alpha`, a left-over index
# from the Naive Bayes cell, so the reported "best k" was not the fitted one.
clf = KNeighborsClassifier(n_neighbors=neighbors[best_k])
clf.fit(train_df_re, y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(train_df_re, y_train)


sig_clf_prob = sig_clf.predict_proba(train_df_re)
print("for best k :", neighbors[best_k], "The train log loss is: ",log_loss(y_train, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(validation_df_re)
print("for best k :", neighbors[best_k], "The validation log loss is: ",log_loss(y_val, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(test_df_re)
print("for best k :", neighbors[best_k], "The test log loss is: ",log_loss(y_test, sig_clf_prob, labels=clf.classes_))

In [None]:
sig_clf_prob = sig_clf.predict(validation_df_re)
plot_confusion_matrics(y_val,sig_clf_prob)

In [None]:
# plt.figure(figsize=(8,6))
# for i in range(1,10):
#     probabilities = sig_clf.predict_proba(np.array(train_df_re))[:,1]
#     prob_true, prob_pred = calibration_curve(y_train==i, probabilities, n_bins=15)
#     plt.plot(prob_pred,prob_true,label=f"class-{i}")
#     plt.plot([0,1],[1,0], linestyle="--", color="gray", label="idel_calibration")
#     plt.xlabel("mean predicted probability")
#     plt.ylabel("fraction of positives")
#     plt.legend()
#     plt.show()

#### 5.3 LogisticRegression + CalibratedClassifierCV

In [None]:
# Candidate values for LogisticRegression's C (inverse regularization strength).
# The list keeps the name `alpha` used by the other model-selection cells.
alpha = [0.00001,0.0001,0.001,0.1,1,10,100,1000]

# --- Hyper-parameter search: validation log-loss for every C -----------------
cv_log_error_array = []
for i in alpha:
    print(f"for alpha {i}")
    clf = LogisticRegression(C=i, class_weight="balanced",multi_class="ovr")
    clf.fit(np.array(train_df_re), y_train)
    sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
    sig_clf.fit(np.array(train_df_re), y_train)
    sig_clf_prob = sig_clf.predict_proba(np.array(validation_df_re))
    print(len(y_val), len(sig_clf_prob))
    cv_log_error_array.append(log_loss(y_val, sig_clf_prob, labels=clf.classes_))
    print("log loss :", log_loss(y_val, sig_clf_prob))

# --- Plot validation error against log10(C) ----------------------------------
fig, ax = plt.subplots()
ax.plot(np.log10(alpha), cv_log_error_array, c="g", label="validation log loss")
for i, txt in enumerate(np.round(cv_log_error_array,3)):
    ax.annotate((alpha[i],str(txt)), (np.log10(alpha[i]), cv_log_error_array[i]))
plt.grid()
plt.xticks(np.log10(alpha))
plt.title("Cross Validation Error for each alpha")
plt.xlabel("Alpha i's")
plt.ylabel("Error measure")
plt.legend()


# --- Refit with the best C and report train / validation / test loss ---------
best_alpha = np.argmin(cv_log_error_array)
# Refit the SAME model family that was searched above. The previous code
# built a MultinomialNB here, so the reported losses were not logistic
# regression results and the selected value was applied to the wrong model.
clf = LogisticRegression(C=alpha[best_alpha], class_weight="balanced",multi_class="ovr")
clf.fit(np.array(train_df_re), y_train)
sig_clf = CalibratedClassifierCV(clf,method="sigmoid")
sig_clf.fit(np.array(train_df_re), y_train)


sig_clf_prob = sig_clf.predict_proba(np.array(train_df_re))
print("for best alpha ", alpha[best_alpha], "The train log loss is: ",log_loss(y_train, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(np.array(validation_df_re))
print("for best alpha ", alpha[best_alpha], "The validation log loss is: ",log_loss(y_val, sig_clf_prob, labels=clf.classes_))

sig_clf_prob = sig_clf.predict_proba(np.array(test_df_re))
print("for best alpha ", alpha[best_alpha], "The test log loss is: ",log_loss(y_test, sig_clf_prob, labels=clf.classes_))

### 6. Deep Learning Model:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder

In [None]:
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

#### 6.1: Preparing Data:

In [None]:
import nltk
import spacy

In [None]:
torch.cuda.memory_allocated()

In [None]:
torch.cuda.max_memory_allocated()

In [None]:
# !python -m spacy download en_core_web_sm --user

In [None]:
# spacy.__file__

In [None]:
# spacy.util.get_installed_models()

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
train_text = X_train["Text"].tolist()
test_text = X_test["Text"].tolist() 
validation_text = X_val["Text"].tolist()

def get_tokenized_text(texts):
    """Run every string in ``texts`` through the notebook-level ``nlp`` pipeline.

    Returns a list holding one ``nlp(...)`` result per input text, in input order.
    """
    # `nlp` is the spaCy pipeline loaded earlier in the notebook.
    return [nlp(document) for document in texts]

def create_vocabulary(tokenized_text):
    """Build a token -> integer id mapping from tokenized documents.

    Parameters
    ----------
    tokenized_text : iterable of iterables
        Each inner iterable yields the tokens of one document; tokens are
        converted to ``str`` before being added to the vocabulary.

    Returns
    -------
    dict[str, int]
        Ids 0 and 1 are reserved for ``"<pad>"`` and ``"<unk>"``; every distinct
        token receives an id starting from 2.
    """
    unique_word = set(str(word) for sublist in tokenized_text for word in sublist)  # Convert tokens to strings
    # Sort before numbering: iterating a set of strings directly follows hash
    # order, which changes between interpreter runs, so the same corpus would
    # otherwise get different ids (and trained embeddings could not be reused).
    vocab = {word: index for index, word in enumerate(sorted(unique_word), start=2)}
    vocab["<pad>"] = 0  # Padding token
    vocab["<unk>"] = 1  # Unknown token
    return vocab

def text_to_numerical_representation(vocab, tokenized_text):
    """Replace every token by its id from ``vocab``.

    Tokens are looked up by their ``str`` form; tokens missing from ``vocab``
    are mapped to the id stored under ``"<unk>"``. Returns one list of ids per
    input sentence, in input order.
    """
    def token_id(token):
        # Fall back to the unknown-token id for out-of-vocabulary tokens.
        return vocab.get(str(token), vocab["<unk>"])

    encoded_sentences = []
    for sentence in tokenized_text:
        encoded_sentences.append([token_id(token) for token in sentence])
    return encoded_sentences

tokenized_text_train = get_tokenized_text(train_text)
tokenized_text_test = get_tokenized_text(test_text)
tokenized_text_validation = get_tokenized_text(validation_text)

vocab = create_vocabulary(tokenized_text_train)

In [None]:
text_to_numerical_representation_train = text_to_numerical_representation(vocab, tokenized_text_train)
text_to_numerical_representation_test = text_to_numerical_representation(vocab, tokenized_text_test)
text_to_numerical_representation_val = text_to_numerical_representation(vocab, tokenized_text_validation)

In [None]:
text_to_numerical_representation_train = [torch.tensor(txt) for txt in text_to_numerical_representation_train]
text_to_numerical_representation_test = [torch.tensor(txt) for txt in text_to_numerical_representation_test]
text_to_numerical_representation_val = [torch.tensor(txt) for txt in text_to_numerical_representation_val]

In [None]:
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())

In [None]:
import gc
# del train_data, test_data, val_data  # Delete variables if they exist
gc.collect()  # Python garbage collection
torch.cuda.empty_cache()

In [None]:
def encode_cat(categories, encoder):
    """Encode categories with an already fitted label encoder, mapping unseen values to -1.

    Parameters
    ----------
    categories : iterable
        Raw category values to encode.
    encoder : fitted encoder exposing ``classes_``
        The integer code of a value is its position in ``encoder.classes_``
        (this is how scikit-learn's ``LabelEncoder`` numbers its classes).

    Returns
    -------
    torch.Tensor
        One integer code per input value; values absent from
        ``encoder.classes_`` are encoded as -1.
    """
    print("encoding")
    # Build the value -> code table once instead of calling encoder.transform
    # and scanning classes_ for every single element.
    code_of = {label: code for code, label in enumerate(encoder.classes_.tolist())}
    # NOTE(review): -1 is not a valid index for nn.Embedding nor a valid
    # CrossEntropyLoss target, so rows containing unseen values need to be
    # filtered or remapped before they reach the model.
    return torch.tensor([code_of.get(cat, -1) for cat in categories])

In [None]:
gene_train = X_train["Gene"].tolist()
variation_train = X_train["Variation"].tolist()

gene_test = X_test["Gene"].tolist()
variation_test = X_test["Variation"].tolist()

gene_val = X_val["Gene"].tolist()
variation_val = X_val["Variation"].tolist()

- .fit() on train_data

In [None]:
encoder_gene = LabelEncoder()
encoder_variation = LabelEncoder()

In [None]:
encoder_target = LabelEncoder()
y_train = encoder_target.fit_transform(y_train)

In [None]:
y_test = encode_cat(y_test, encoder_target)
y_val = encode_cat(y_val,encoder_target)

In [None]:
gene_train_encoded = encoder_gene.fit_transform(gene_train)
variation_train_encoded = encoder_variation.fit_transform(variation_train)

In [None]:
gene_test_encoded = encode_cat(gene_test,encoder_gene)
variation_test_encoded = encode_cat(variation_test,encoder_variation)

In [None]:
gene_val_encoded = encode_cat(gene_val,encoder_gene)
variation_val_encoded = encode_cat(variation_val,encoder_variation)

- After padding:

In [None]:
# Pad every document to the length of the longest one in its split.
# Pad with the id reserved for "<pad>" (0). The previous padding_value=True is
# interpreted as 1, which is the "<unk>" id, so padding positions were
# indistinguishable from genuine unknown tokens.
text_to_numerical_representation_train = pad_sequence(sequences=text_to_numerical_representation_train,batch_first=True,padding_value=vocab["<pad>"])
text_to_numerical_representation_test = pad_sequence(sequences=text_to_numerical_representation_test,batch_first=True,padding_value=vocab["<pad>"])
text_to_numerical_representation_val = pad_sequence(sequences=text_to_numerical_representation_val,batch_first=True,padding_value=vocab["<pad>"])

In [None]:
import gc
# del train_data, test_data, val_data  # Delete variables if they exist
gc.collect()  # Python garbage collection
torch.cuda.empty_cache()

In [None]:
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())

#### 6.1.1 Model-LSTM

##### loading_data:

In [None]:
# Model hyper-parameters
vocab_size = len(vocab.keys())
embed_dim = int(vocab_size ** 0.25)  # heuristic: fourth root of the vocabulary size
hidden_dim = 128
num_classes = 9
# An embedding table needs one row per DISTINCT category, i.e. the number of
# classes learned by the fitted encoders. The previous len(<encoded array>)
# was the number of training rows, which only over-allocated the tables.
gene_size = len(encoder_gene.classes_)
variance_size = len(encoder_variation.classes_)
gene_emb_dim = 5
variance_emb_dim = 5

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class MultiClassLSTM(nn.Module):
    """Classifier that fuses an LSTM text encoder with two categorical embeddings.

    The token ids are embedded and passed through a 2-layer LSTM; the output at
    the last time step is concatenated with the gene and variation embeddings
    and fed to a two-layer fully connected head that returns one raw score
    (logit) per class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, gene_size, variance_size, gene_emb_dim, variance_emb_dim):
        super().__init__()

        # --- text branch: token embedding followed by a stacked LSTM ---
        self.text_embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )

        # --- categorical branches: one embedding table per feature ---
        self.gene_embedding = nn.Embedding(gene_size, gene_emb_dim)
        self.variance_embedding = nn.Embedding(variance_size, variance_emb_dim)

        # --- classification head over the concatenated representation ---
        fused_width = hidden_dim + gene_emb_dim + variance_emb_dim
        self.fc = nn.Sequential(
            nn.Linear(fused_width, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, text_input, gene_input, variannce_input):
        """Return the class scores for a batch of (text, gene, variation) inputs."""
        sequence_states, _ = self.lstm(self.text_embedding(text_input))
        # Keep only the LSTM output at the final time step of every sequence.
        # NOTE(review): if the inputs are right-padded, this is the state after
        # the padding tokens have been consumed - confirm that is intended.
        text_summary = sequence_states[:, -1, :]

        # squeeze(1) removes a singleton second axis when the ids arrive as a
        # column; for flat id vectors it leaves the embedding untouched.
        gene_vector = self.gene_embedding(gene_input).squeeze(1)
        variation_vector = self.variance_embedding(variannce_input).squeeze(1)

        fused = torch.cat((text_summary, gene_vector, variation_vector), dim=1)
        return self.fc(fused)


# Model Initialization
model = MultiClassLSTM(vocab_size, embed_dim, hidden_dim, num_classes, gene_size, variance_size, gene_emb_dim, variance_emb_dim)

In [None]:
print(torch.cuda.memory_allocated())
print(torch.cuda.max_memory_allocated())

import gc
# del train_data, test_data, val_data  # Delete variables if they exist
gc.collect()  # Python garbage collection
torch.cuda.empty_cache()

In [None]:
# Derive the class weights from the TRAINING labels only: using test/validation
# labels would leak evaluation information into training, and any unseen label
# encoded as -1 in those splits would add a bogus extra class to the weights.
y_train_np = np.asarray(y_train)
unique_classes = np.unique(y_train_np)

# np.unique already returns a NumPy array, so it is passed as-is (calling
# .cpu() on it raised AttributeError).
class_weights = compute_class_weight(class_weight="balanced", classes=unique_classes, y=y_train_np)
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

# Define loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

optimizer = optim.Adam(model.parameters(), lr=0.001)

##### Model Training:

In [None]:
class CustomDataset(Dataset):
    """Index-aligned view over the text, gene, variation and label containers."""

    def __init__(self, text, gene, variance, classes):
        """Keep references to the four containers; no copying or conversion.

        Args:
            text: Per-sample text features; must expose ``.shape`` (used by
                ``__len__``) and integer indexing.
            gene: Per-sample gene values, indexable by integer.
            variance: Per-sample variation values, indexable by integer.
            classes: Per-sample labels, indexable by integer.
        """
        self.text, self.gene = text, gene
        self.variance, self.classes = variance, classes
        # Taken from the notebook-level `device` variable, not from an argument.
        self.device = device

    def __len__(self):
        """Return the size of the first dimension of ``text``."""
        n_samples = self.text.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(text, gene, variance, label)`` tuple stored at ``idx``."""
        # NOTE(review): every container is indexed with plain `[idx]`; if any of
        # them is a pandas Series this is a label lookup, not positional -- confirm.
        sample = tuple(
            field[idx]
            for field in (self.text, self.gene, self.variance, self.classes)
        )
        return sample

In [None]:
# One dataset per split, each bundling the encoded text, gene, variation and labels.
train_dataset = CustomDataset(
    text=text_to_numerical_representation_train,
    gene=gene_train_encoded,
    variance=variation_train_encoded,
    classes=y_train,
)
test_dataset = CustomDataset(
    text=text_to_numerical_representation_test,
    gene=gene_test_encoded,
    variance=variation_test_encoded,
    classes=y_test,
)
val_dataset = CustomDataset(
    text=text_to_numerical_representation_val,
    gene=gene_val_encoded,
    variance=variation_val_encoded,
    classes=y_val,
)

In [None]:
BATCH_SIZE = 8  # samples per batch, shared by all three loaders

# Only the training batches are shuffled. Validation and test batches keep a
# fixed order so that an average of per-batch losses over them comes out the
# same on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
validation_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# !set CUDA_LAUNCH_BLOCKING=1

In [None]:
# Allow TF32 for CUDA matrix multiplications and on cuDNN.
# (The cuDNN flag already defaults to True; it is set here explicitly.)
for tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    tf32_backend.allow_tf32 = True

In [None]:
# torch.cuda.empty_cache()

In [None]:
# num_epochs = 10
# train_losses = []
# val_losses = []

# for epoch in range(num_epochs):
#     model.train()  # Set model to training mode
#     total_train_loss = 0

#     for batch in train_dataloader:
#         text_input, gene_input, variance_input, labels = batch

#         # Move to device (if using GPU)
#         # text_input = text_input.to(device)
#         print(type(text_input))
#         # # gene_input = gene_input.to(device)
#         # print(type(gene_input))
#         # # variance_input = variance_input.to(device)
#         # print(type(variance_input))
#         # # labels = labels.to(device)  # Labels should be integer class indices
#         break

In [None]:
num_epochs = 5
train_losses, val_losses = [], []  # one mean loss per epoch for each phase

model.to(device)
for epoch in range(num_epochs):
    # ================== Training Phase ==================
    model.train()  # switch the model to training mode
    total_train_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Training]"):
        # Unpack the four batch fields and move each one to the target device.
        text_input, gene_input, variance_input, labels = (
            field.to(device) for field in batch
        )

        # Reset gradients, run the forward pass, then backpropagate and step.
        optimizer.zero_grad()
        outputs = model(text_input, gene_input, variance_input)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Mean of the per-batch training losses for this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # ================== Validation Phase ==================
    model.eval()  # switch the model to evaluation mode
    total_val_loss = 0

    with torch.no_grad():  # gradients are not needed for validation
        for batch in tqdm(validation_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Validation]"):
            text_input, gene_input, variance_input, labels = (
                field.to(device) for field in batch
            )

            outputs = model(text_input, gene_input, variance_input)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

    # Mean of the per-batch validation losses for this epoch.
    avg_val_loss = total_val_loss / len(validation_dataloader)
    val_losses.append(avg_val_loss)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

# Persist both loss histories so they can be reloaded later.
torch.save({'train_loss': train_losses, 'val_loss': val_losses}, 'losses.pth')

- Q1. What is `num_embeddings`?
- Q2. What does `squeeze` do?
- Q3. What does `text_embedded.mean(dim=1)` compute?
- Q4. What is `hidden_dim` in the LSTM as well as in `nn.Sequential`?
- Q5. Why take `lstm_out[:, -1, :]`?

In [None]:
# torch.cuda.empty_cache()

In [None]:
import gc
# Run Python's garbage collector first, then ask PyTorch to empty its CUDA cache.
gc.collect()  # Collect garbage
torch.cuda.empty_cache()  # Free GPU memory

In [None]:
# torch.save(text_data, "preprocessed_text.pt")

In [None]:
# Distinct class labels across all three splits. Sets are combined with the
# union operator `|`; `set + set` is not defined and raises TypeError.
set(y_train) | set(y_test) | set(y_val)