# Importing the Modules

The first 4 lines will make your code reproducible.

In [1]:
#To make the code reproducible
import numpy as np
np.random.seed(0)
from tensorflow.random import set_seed
set_seed(0)

import pandas as pd
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
%matplotlib inline
print('NetworkX version: {}'.format(nx.__version__))


ModuleNotFoundError: No module named 'tensorflow'

# Loading Data

The "actual" file contains all 72 patients in the study and the labels.

In [None]:
# Read the training matrix and the independent (held-out) matrix, then preview the training one.
train_path = '../input/gene-expression/data_set_ALL_AML_train.csv'
test_path = '../input/gene-expression/data_set_ALL_AML_independent.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)
train_df.head()

In [None]:
# Preview the first rows of the independent (test) table.
test_df.head()

In [None]:
# Load the patient label table and preview it.
actual_df = pd.read_csv('../input/gene-expression/actual.csv')
actual_df.head()

In [None]:
# Report how many patients fall in each 'cancer' class and the dimensions of the label table.
print(f"Number of value counts \n{actual_df['cancer'].value_counts()}")
print(f"The shape of the actual {actual_df.shape}")

Recode label to numeric.

In [None]:
# Class names, ordered so that a name's position is its numeric code (ALL -> 0, AML -> 1).
labels = ['ALL', 'AML']
label_codes = {name: code for code, name in enumerate(labels)}

# Recode only the 'cancer' column (a frame-wide replace would touch every column and
# relies on implicit dtype downcasting). astype(int) fails loudly if a label outside
# `labels` shows up, because the unmapped value becomes NaN.
y = (
    actual_df
    .assign(cancer=actual_df['cancer'].map(label_codes).astype(int))
    .set_index('patient')
)

Plot the class distribution of the target labels.

In [None]:
# Bar chart of the number of patients per class, with readable class names on the x-axis.
class_counts = y["cancer"].value_counts()
ax = class_counts.plot(kind='bar')
# value_counts() orders by frequency, so translate each numeric code through `labels`.
ax.set_xticklabels([labels[code] for code in class_counts.index], rotation=0)
ax.set(xlabel='Cancer type', ylabel='Number of patients', title='Class distribution');

## Feature Engineering

In [None]:
#using ACO
# NOTE: `ant` and `antcolony` are defined a second time further down in this notebook;
# whichever cell runs last is the definition in effect.
class ant:
    """A single ant walking a weighted graph from ``start`` towards ``end``.

    The graph ``G`` must expose the networkx-style API used here: ``G.edges([node])``
    to list incident edges, ``G.edges[(u, v)]`` to read the ``'weight'`` and
    ``'pheromone'`` attributes of an edge, and ``G.number_of_nodes()``.

    Attributes:
        route: dict mapping each visited node to its step number (start is 0).
        L: accumulated weight of the edges walked so far.
    """

    def __init__(self, G, start = 'S', end = 'T', alpha = 1, beta = 1):
        self.G = G
        self.current_city, self.target_city = start, end
        # alpha is the exponent on pheromone, beta the exponent on edge weight.
        self.alpha, self.beta = alpha, beta
        self.route = {self.current_city : 0}
        self.L = 0

    def move(self):
        """Take one step to an unvisited neighbour, chosen at random.

        The probability of an edge is proportional to
        ``pheromone**alpha / weight**beta``. Does nothing once the target has
        been reached or when every neighbour has already been visited.
        """
        if self.current_city == self.target_city:
            return
        # Don't revisit a node that is already on the route.
        possible_edges = [e for e in self.G.edges([self.current_city])
                          if e[1] not in self.route]
        if not possible_edges:
            # Dead end: stay put instead of sampling from an empty set.
            return

        distances = np.array([self.G.edges[e]['weight'] for e in possible_edges],
                             dtype=float)
        pheromones = np.array([self.G.edges[e]['pheromone'] for e in possible_edges],
                              dtype=float)

        preferences = pheromones ** self.alpha / distances ** self.beta
        probabilities = preferences / preferences.sum()

        # Sample an index rather than a node so node labels keep their original type.
        chosen = np.random.choice(len(possible_edges), p=probabilities)
        new_city = possible_edges[chosen][1]

        self.L += self.G.edges[(self.current_city, new_city)]['weight']
        self.current_city = new_city
        self.route[self.current_city] = len(self.route)

    def go(self):
        """Call ``move`` once per graph node (an upper bound on the walk length)."""
        for _ in range(self.G.number_of_nodes()):
            self.move()

    def get_path(self):
        """Return the visited nodes as a list, in the order they were visited."""
        return sorted(self.route, key=self.route.get)


class antcolony:
    """Ant colony optimisation over a weighted graph.

    Initialises every edge of ``G`` with a small pheromone value and, on ``run``,
    repeatedly releases ants, reinforces the routes that reached ``end`` and
    evaporates pheromone. ``G`` is modified in place.
    """

    def __init__(self, G, start = 'S', end = 'T', alpha = 1, beta = 1):
        self.G, eps = G, 0.0001
        # Every edge starts with the same small, non-zero amount of pheromone.
        nx.set_edge_attributes(self.G, eps, 'pheromone')
        self.start, self.end = start, end
        self.alpha, self.beta = alpha, beta

    def evaporation(self, decay = 0.05):
        """Multiply the pheromone on every edge by ``1 - decay``."""
        phe = nx.get_edge_attributes(self.G, 'pheromone')
        new_phe = {k: v * (1 - decay) for k, v in phe.items()}
        nx.set_edge_attributes(self.G, new_phe, 'pheromone')

    def deposit(self, route, L, delta =1):
        """Add ``delta / L`` pheromone to each consecutive edge of ``route``."""
        for i, j in zip(route[:-1], route[1:]):
            self.G.edges[(i, j)]['pheromone'] += delta / L

    def run(self, number_of_ants =10, time=20):
        """Run ``time`` iterations with ``number_of_ants`` fresh ants each."""
        for _ in range(time):
            self.colony = [ant(self.G, self.start, self.end, self.alpha, self.beta)
                           for _ in range(number_of_ants)]
            for walker in self.colony:
                walker.go()
            for walker in self.colony:
                # Only walks that actually reached the target reinforce their route.
                if walker.current_city == self.end:
                    self.deposit(route=walker.get_path(), L=walker.L)
            self.evaporation()

From the class-distribution chart above, the target variable is imbalanced and needs to be balanced, e.g. using SMOTE, oversampling, or undersampling.


In [None]:
# Dimensions of the raw tables as read from CSV (before any column filtering or transposing).
print("Shape of Training Data: ", train_df.shape)
print("Shape of Testing Data: ", test_df.shape)

In [None]:
# Frequency of each distinct value in the first 'call' column.
train_df['call'].value_counts()

In [None]:
# Keep every column whose name does not contain "call"; the 'call' columns are
# presumably categorical flags rather than expression values (see the counts above).
def _is_not_call(column_name):
    return "call" not in column_name

train_to_keep = list(filter(_is_not_call, train_df.columns))
test_to_keep = list(filter(_is_not_call, test_df.columns))

X_train, X_test = train_df[train_to_keep], test_df[test_to_keep]

Now we can simply transpose both the training and testing dataframes so that genes become columns (features) and patients become rows.

In [None]:
# Swap rows and columns so that each former column becomes a row.
X_train = X_train.T
X_test = X_test.T

In [None]:
# NumPy arrays of the transposed frames.
# NOTE(review): taken before the 'Gene Description' / 'Gene Accession Number' rows are
# dropped in a later cell, so these arrays still include those annotation rows.
ant_xtrain = X_train.values
ant_xtest = X_test.values

The first 2 rows are duplicated so we can remove 'Gene Description' and set 'Gene Accession Number' as the column headers.

In [None]:
# Preview the transposed training frame.
X_train.head()

In [None]:
# Remove the two gene-annotation rows so that only patient rows remain.
# (Promoting row 1 to the column headers is intentionally left disabled, so the
# columns keep their existing labels.)
annotation_rows = ['Gene Description', 'Gene Accession Number']
X_train = X_train.drop(index=annotation_rows)
X_test = X_test.drop(index=annotation_rows)

Neither the training nor the testing row indexes are in numeric order, so it's important that we reorder them so that the labels line up with the corresponding data.

In [None]:
# Preview the training frame after removing the annotation rows.
X_train.head()

In [None]:
# Cast the row labels to int and order the rows numerically, for both frames.
X_train = X_train.set_axis(X_train.index.astype(int), axis=0).sort_index()
X_test = X_test.set_axis(X_test.index.astype(int), axis=0).sort_index()

In [None]:
# Dimensions of the feature frames after filtering, transposing and sorting.
print("Shape of Training data:\t", X_train.shape)
print("Shape of Testing Data:\t", X_test.shape)

In [None]:
# Preview the training frame once the rows are in numeric order.
X_train.head()

Now let's split the target labels into training and testing targets.

In [None]:
# Pick each split's labels by patient id (the row index of the feature frames) rather
# than by a hard-coded position, so every label is guaranteed to belong to its feature
# row. Raises KeyError if a patient in the features has no label.
y_train = y.loc[X_train.index, 'cancer']
y_test = y.loc[X_test.index, 'cancer']

## Scaling Features

Note that the test set must use identical scaling to the training set.

In [None]:
# Summary statistics of the training features before scaling.
X_train.describe()

In [None]:
# Fit the scaler on the training set only, then apply that same fitted transform
# to the test set so both splits share identical scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Compare the distribution of all expression values before and after scaling.
fig, ax = plt.subplots(ncols=2, figsize=(15,5))
# Flatten to 1-D float arrays; the transposed frame may not have a numeric dtype.
original_values = np.asarray(X_train, dtype=float).ravel()
scaled_values = np.asarray(X_train_scaled, dtype=float).ravel()
sns.distplot(original_values, ax=ax[0]).set_title('Original Data')
sns.distplot(scaled_values, ax=ax[1]).set_title('Scaled Data')
# tight_layout must be called; the bare attribute reference did nothing.
plt.tight_layout()
plt.show()

In [None]:
# Note: after StandardScaler each feature is centred at 0 with unit variance; standardising rescales the data but does not by itself remove skewness.

In [None]:
#sns.pairplot(X_train)

## Dimensionality reduction (PCA)

You need to apply the same PCA on training and testing sets.

In [None]:
# Fit PCA on the scaled training data (n_components=0.95 asks for enough components
# to explain 95% of the variance) and project the test data with the same fitted PCA.
pca = PCA(n_components = 0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(X_train_pca.shape)
print(X_test_pca.shape)

In [None]:
# Plot the share of variance explained by each retained principal component.
var = pca.explained_variance_ratio_
plt.plot(var)
plt.xlabel('Principal component index')
plt.ylabel('Explained variance ratio')
plt.show()

In [None]:
# Total fraction of variance captured by the retained components.
# print(len(pca.components_))
pca.explained_variance_ratio_.sum()

95% of the variance is explained by 32 principal components. We can't plot something in 32 dimensions, so let's just see what the PCA looks like when we pick only the top three components.

In [None]:
# Project the scaled training data onto its top three principal components and draw
# them as a 3-D scatter, one colour per class.
pca3 = PCA(n_components = 3).fit_transform(X_train_scaled)
colors = np.where(y_train==0, 'red', 'blue')

fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')
# One scatter call per class so that the legend can name the classes.
for code, colour in enumerate(('red', 'blue')):
    in_class = np.asarray(y_train == code)
    ax.scatter(pca3[in_class, 0], pca3[in_class, 1], pca3[in_class, 2],
               c=colour, label=labels[code])
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')
ax.set_title('Training data on the top three principal components')
ax.legend()
plt.show()

# Neural Network

In [None]:
# Feed-forward binary classifier on the PCA features: two ReLU hidden layers
# (32 and 16 units) followed by a single sigmoid output unit.
n_features = X_train_pca.shape[1]

NN_model = keras.Sequential()
NN_model.add(layers.Dense(32, activation='relu', input_shape=(n_features,)))
NN_model.add(layers.Dense(16, activation='relu'))
NN_model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and binary accuracy
# reported as the metric.
NN_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy']
)

In [None]:
# Early-stopping callback: an improvement must be at least 0.005 to count, training
# stops after 5 epochs without one, and the best weights seen are restored.
early_stopping = keras.callbacks.EarlyStopping(
    patience=5,
    min_delta=0.005,
    restore_best_weights=True,
)

In [None]:
# Train for up to 200 epochs in mini-batches of 8, with early stopping.
# NOTE(review): the test set is passed as validation data, so early stopping and
# restore_best_weights select the model using test data; the accuracy reported on
# that same test set afterwards is therefore optimistic. Consider a validation split
# carved out of the training data instead.
train_history = NN_model.fit(
    X_train_pca, y_train,
    validation_data=(X_test_pca, y_test),
    batch_size = 8,
    epochs = 200,
    callbacks=[early_stopping]
)

In [None]:
# Sequential.predict_classes no longer exists in current TensorFlow; threshold the
# sigmoid output at 0.5 instead, which is what it did for a single-unit sigmoid model.
pred = (NN_model.predict(X_test_pca) > 0.5).astype(int).ravel()
print('Neural Network accuracy: ', round(accuracy_score(y_test, pred), 3))

In [None]:
# Confusion matrix of true vs. predicted classes on the test set, drawn as an
# annotated heatmap.
cm_nn = confusion_matrix(y_test, pred)

ax = plt.subplot()
sns.heatmap(cm_nn, annot=True, fmt='g', cmap='Greens', ax=ax)

# Axis captions, title and class names on the ticks
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Neural Network Confusion Matrix',
)
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels, rotation=360);

In [None]:
# Independent copies of the raw tables for the ant-colony experiments, so the
# originals stay untouched. The test copy must come from test_df, not train_df.
ant_train = train_df.copy()
ant_test = test_df.copy()

Trying to optimize the dataset using ant colony optimization (ACO).

In [None]:
#using ACO
# NOTE: `ant` and `antcolony` are also defined earlier in this notebook; this later
# definition is the one in effect for the cells that follow.
class ant():
    """A single ant walking a weighted graph from ``start`` towards ``end``.

    The graph ``G`` must expose the networkx-style API used here: ``G.edges([node])``
    to list incident edges, ``G.edges[(u, v)]`` to read the ``'weight'`` and
    ``'pheromone'`` attributes of an edge, and ``G.number_of_nodes()``.

    Attributes:
        route: dict mapping each visited node to its step number (start is 0).
        L: accumulated weight of the edges walked so far.
    """

    def __init__(self, G, start = 'S', end = 'T', alpha = 1, beta = 1):
        self.G = G
        self.current_city, self.target_city = start, end
        # alpha is the exponent on pheromone, beta the exponent on edge weight.
        self.alpha, self.beta = alpha, beta
        self.route = {self.current_city : 0}
        self.L = 0

    def move(self):
        """Take one step to an unvisited neighbour, chosen at random.

        The probability of an edge is proportional to
        ``pheromone**alpha / weight**beta``. Does nothing once the target has
        been reached or when every neighbour has already been visited.
        """
        if self.current_city == self.target_city:
            return
        # Don't revisit a node that is already on the route.
        possible_edges = [e for e in self.G.edges([self.current_city])
                          if e[1] not in self.route]
        if not possible_edges:
            # Dead end: stay put instead of sampling from an empty set.
            return

        distances = np.array([self.G.edges[e]['weight'] for e in possible_edges],
                             dtype=float)
        pheromones = np.array([self.G.edges[e]['pheromone'] for e in possible_edges],
                              dtype=float)

        preferences = pheromones ** self.alpha / distances ** self.beta
        probabilities = preferences / preferences.sum()

        # Sample an index rather than a node so node labels keep their original type.
        chosen = np.random.choice(len(possible_edges), p=probabilities)
        new_city = possible_edges[chosen][1]

        self.L += self.G.edges[(self.current_city, new_city)]['weight']
        self.current_city = new_city
        self.route[self.current_city] = len(self.route)

    def go(self):
        """Call ``move`` once per graph node (an upper bound on the walk length)."""
        for _ in range(self.G.number_of_nodes()):
            self.move()

    def get_path(self):
        """Return the visited nodes as a list, in the order they were visited."""
        return sorted(self.route, key=self.route.get)


class antcolony():
    """Ant colony optimisation over a weighted graph.

    Initialises every edge of ``G`` with a small pheromone value and, on ``run``,
    repeatedly releases ants, reinforces the routes that reached ``end`` and
    evaporates pheromone. ``G`` is modified in place.
    """

    def __init__(self, G, start = 'S', end = 'T', alpha = 1, beta = 1):
        self.G, eps = G, 0.0001
        # Every edge starts with the same small, non-zero amount of pheromone.
        nx.set_edge_attributes(self.G, eps, 'pheromone')
        self.start, self.end = start, end
        self.alpha, self.beta = alpha, beta

    def evaporation(self, decay = 0.05):
        """Multiply the pheromone on every edge by ``1 - decay``."""
        phe = nx.get_edge_attributes(self.G, 'pheromone')
        new_phe = {k: v * (1 - decay) for k, v in phe.items()}
        nx.set_edge_attributes(self.G, new_phe, 'pheromone')

    def deposit(self, route, L, delta =1):
        """Add ``delta / L`` pheromone to each consecutive edge of ``route``."""
        for i, j in zip(route[:-1], route[1:]):
            self.G.edges[(i, j)]['pheromone'] += delta / L

    def run(self, number_of_ants =10, time=20):
        """Run ``time`` iterations with ``number_of_ants`` fresh ants each.

        Each iteration lets every ant walk, deposits pheromone along the routes
        that reached ``end``, then evaporates pheromone on all edges.
        """
        for _ in range(time):
            self.colony = [ant(self.G, self.start, self.end, self.alpha, self.beta)
                           for _ in range(number_of_ants)]
            for walker in self.colony:
                walker.go()
            for walker in self.colony:
                # Only walks that actually reached the target reinforce their route.
                if walker.current_city == self.end:
                    self.deposit(route=walker.get_path(), L=walker.L)
            self.evaporation()

In [None]:
# Toy graph with two routes from S to T: via A (edge weights 1 + 1) and via B
# (edge weights 10 + 10).
edge_list = [('S', 'A', 1), ('A', 'T', 1), ('S', 'B', 10), ('B', 'T', 10)]
G = nx.Graph()
G.add_weighted_edges_from(edge_list)

# Build a colony on the toy graph and run 2 iterations with 10 ants each.
swarm = antcolony(G, start='S', end='T', alpha=1, beta = 1)
swarm.run(number_of_ants = 10, time = 2)