# Import necessary libararies

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from random import randint
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, QuantileTransformer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectFromModel

# Import classifiers

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Implemented Tools

In [3]:
def majority_element(num_list):
    idx, ctr = 0, 1
        
    for i in range(1, len(num_list)):
        if num_list[idx] == num_list[i]:
            ctr += 1
        else:
            ctr -= 1
            if ctr == 0:
                idx = i
                ctr = 1
        
    return num_list[idx]

def make_prediction(pred):
    predicted = []
    for i in range(len(pred[0])):
        lst2 = [item[i] for item in pred]
        predicted.append(majority_element(lst2))
    return predicted

def get_estimators_predictions(x, y, classifier, test, y_test, num_of_f, original_N_F=27, n_estimators = None, selective = None):
    estimators = []
    predictions = []
    accuracies = []
    max_acc = 0
    best_X_test = None
    selective_flag = False
    if selective is not None:
        X_train_features = []
        selective_flag = True
    features = x.columns.values
    if n_estimators is None:
        n_estimators = len(test)
    for i in range(n_estimators):
        f = []
        myset = set()
        for j in range(original_N_F-num_of_f):
            f_num = randint(0,len(features)-num_of_f)
            while myset.issuperset([f_num]):
                f_num = f_num + 1
                if f_num > original_N_F-num_of_f:
                    f_num = 0
            myset.add(f_num)
            f.append(features[f_num])
        x_copy = deepcopy(x)
        test_copy = deepcopy(test)
        for fi in f:
            x_copy = x_copy.drop(fi, axis=1)
            test_copy = test_copy.drop(fi, axis=1)
        clf = classifier
        clf = clf.fit(x_copy, y.values.ravel())
        estimators.append(deepcopy(clf))
        clf_predictions = clf.predict(test_copy)
        predictions.append(clf_predictions)
        acc_score = accuracy_score(clf_predictions, y_test)
        accuracies.append(acc_score)
        if selective_flag :
            if acc_score > selective :
                X_train_features.append(x_copy.columns.values)
        if acc_score > max_acc:
            max_acc = acc_score
            best_X_test = x_copy
            best_estimator = deepcopy(clf)
    if selective_flag :
        return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test, X_train_features
    return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test

def estimate_on_portions(x, y, classifier, n, x_test, y_test):
    best_est = None
    estimators = []
    acc = []
    max_acc = 0
    pred = []
    interval = int(len(x)/n)
    lim1 = 0
    lim2 = interval
    for i in range(n):
        portion_x = x[lim1:lim2]
        portion_y = y[lim1:lim2]
        clf = classifier.fit(portion_x, portion_y.values.ravel())
        predicted = clf.predict(x_test)
        pred.append(predicted)
        current_acc = accuracy_score(y_test, predicted)
        if current_acc > max_acc:
            max_acc = current_acc
            best_est = clf  
        acc.append(current_acc)
        estimators.append(clf)
        lim1 = lim1 + interval
        lim2 = lim2 + interval
    return max_acc, acc, best_est, estimators, pred

def discretization(x, feature, ranges):
    data = deepcopy(x)
    for i in range(len(ranges)):
        for j in range(len(data)):
            if ranges[i][0] <= data[feature][j] < ranges[i][1]:
                data[feature][j] = ranges[i][2]
    return data


def discretization_HGB(x, male_ranges, female_ranges):
    data = deepcopy(x)
    for i in range(len(male_ranges)):
        for j in range(len(data)):
            if data['Gender'][j] == 1:
                if male_ranges[i][0] <= data['HGB'][j] < male_ranges[i][1]:
                    data['HGB'][j] = male_ranges[i][2]
            elif data['Gender'][j] == 2:
                if female_ranges[i][0] <= data['HGB'][j] < female_ranges[i][1]:
                    data['HGB'][j] = female_ranges[i][2]
    return data

def drop_class(x,y, c):
    result = deepcopy(y)
    data = deepcopy(x)
    for i in range(len(data)):
        if y[0][i] == c:
            data = data.drop(i, axis = 0)
            result = result.drop(i, axis=0)
    return data.reset_index(), result.reset_index()

# Reading the data

In [42]:
data = pd.read_csv("D:\Bachelor's final year\Second Semester\ML\project\HCV-Egy-Data.csv")
y_classes = pd.DataFrame(np.asarray(data['Baselinehistological staging']))
data = data.drop('Baselinehistological staging', axis=1)

# Discretization

In [None]:
age_ranges = [[0, 32, 30], [32, 37, 35], [37, 42, 40], [42, 47, 45], [47, 52, 50], [52, 57, 55], [57, 63, 60]]
data = discretization(data, 'Age', age_ranges)

ast1_ranges = [[0, 20, 10], [20, 40, 30], [40, 129, 100]]
data = discretization(data, 'AST 1', ast1_ranges)

bmi_ranges = [[0, 18.5, 15], [185, 25, 20], [25, 30, 27], [30, 35, 33], [35, 41, 37]]
data = discretization(data, 'BMI', bmi_ranges)

wbc_ranges = [[0, 4000, 2000], [4000, 11000, 8000], [11000, 12102, 10000]]
data = discretization(data, 'WBC', wbc_ranges)

rbc_ranges = [[0, 3000000, 2000000], [3000000, 5000000, 4000000], [5000000, 5018452, 500500]]
data = discretization(data, 'RBC', rbc_ranges)

plat_ranges = [[93013, 100000, 95000], [100000, 255000, 200000], [255000, 226466, 226000]]
data = discretization(data, 'Plat', plat_ranges)

data = discretization(data, 'ALT 1', ast1_ranges)
data = discretization(data, 'ALT4', ast1_ranges)
data = discretization(data, 'ALT 12', ast1_ranges)
data = discretization(data, 'ALT 24', ast1_ranges)
data = discretization(data, 'ALT 36', ast1_ranges)
data = discretization(data, 'ALT 48', ast1_ranges)

rnabase_ranges = [[0, 5, 3], [5, 1201087, 10]]
data = discretization(data, 'RNA Base', rnabase_ranges)

rna4_ranges = [[0, 5, 3], [5, 1201716, 10]]
data = discretization(data, 'RNA 4', rna4_ranges)

rna12_ranges = [[0, 5, 3], [5, 3731528, 10]]
data = discretization(data, 'RNA 12', rna12_ranges)

rnaeot_ranges = [[0, 5, 3], [5, 808451, 10]]
data = discretization(data, 'RNA EOT', rnaeot_ranges)

data = discretization(data, 'RNA EF', rnaeot_ranges)

HGB_male_ranges = [[2, 14, 10], [14, 17.5, 15], [17.5, 21, 19]]
HGB_female_ranges = [[2, 12.3, 10], [12.3, 15.3, 15], [15.3, 21, 19]]
data = discretization_HGB(data, HGB_male_ranges, HGB_female_ranges)

# Experiments

## Droping classes

In [None]:
data_without_3, y_without_3 = drop_class(data,y_classes, 3)
data_without_34, y_without_34 = drop_class(data_without_3,y_without_3, 4)
data = data_without_34.drop('level_0', axis=1)
y_classes = y_without_34.drop('level_0', axis=1)
data = data.drop('index', axis=1)
y_classes = y_classes.drop('index', axis=1)

## Droping features

In [43]:
data = data.drop('Age', axis = 1)
data = data.drop('Gender', axis = 1)
#data = data.drop('BMI', axis = 1)
data = data.drop('Headache ', axis = 1)
data = data.drop('Diarrhea ', axis = 1)
data = data.drop('Fatigue & generalized bone ache ', axis = 1)
data = data.drop('Jaundice ', axis = 1)
data = data.drop('WBC', axis = 1)
data = data.drop('RBC', axis = 1)
data = data.drop('HGB', axis = 1)
data = data.drop('AST 1', axis = 1)
data = data.drop('ALT 1', axis = 1)
data = data.drop('ALT4', axis = 1)
#data = data.drop('ALT 12', axis = 1)
data = data.drop('ALT 24', axis = 1)
data = data.drop('ALT 36', axis = 1)
data = data.drop('ALT 48', axis = 1)
#data = data.drop('ALT after 24 w', axis = 1)
#data = data.drop('RNA Base', axis = 1)
#data = data.drop('RNA 4', axis = 1)
#data = data.drop('RNA 12', axis = 1)
#data = data.drop('RNA EOT', axis = 1)
#data = data.drop('RNA EF', axis = 1)
data = data.drop('Fever', axis = 1)
data = data.drop('Nausea/Vomting', axis = 1)
#data = data.drop('Epigastric pain ', axis = 1)
#data = data.drop('Plat', axis = 1)
#data = data.drop('Baseline histological Grading', axis = 1)

## Pre-processing

In [44]:
quantile_transformer = QuantileTransformer(output_distribution='uniform', random_state=0)
data = quantile_transformer.fit_transform(data)
data = pd.DataFrame(data)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,1.0,0.155299,0.780781,0.0,0.552779,0.529544,0.544244,0.0,0.0,0.727227
1,0.525025,0.0,0.287801,0.405906,0.935435,0.038638,0.446945,0.846832,0.587074,0.301301,0.098098
2,0.805305,0.0,0.453402,0.757257,0.0,0.47755,0.544566,0.0,0.93739,0.778877,0.098098
3,0.805305,0.0,0.408275,0.454454,0.477978,0.859303,0.383259,0.804541,0.94895,0.796545,0.501001
4,0.735235,1.0,0.729734,0.108108,0.350851,0.556457,0.607613,1.0,0.588679,0.496254,0.575576


## Polynomial features

In [45]:
data = PolynomialFeatures(4).fit_transform(data)
data = pd.DataFrame(data)
data = data.drop(0, axis = 1)
print('classes length = ' + str(len(y_classes)) +'  data length = ' + str(len(data)))
data.head()

classes length = 1385  data length = 1385


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364
0,1.0,1.0,0.155299,0.780781,0.0,0.552779,0.529544,0.544244,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.279692
1,0.525025,0.0,0.287801,0.405906,0.935435,0.038638,0.446945,0.846832,0.587074,0.301301,...,0.003317,0.016058,0.005228,0.001702,0.000554,0.008241,0.002683,0.000874,0.000284,9.3e-05
2,0.805305,0.0,0.453402,0.757257,0.0,0.47755,0.544566,0.0,0.93739,0.778877,...,0.008456,0.442921,0.055785,0.007026,0.000885,0.368023,0.046352,0.005838,0.000735,9.3e-05
3,0.805305,0.0,0.408275,0.454454,0.477978,0.859303,0.383259,0.804541,0.94895,0.796545,...,0.226029,0.479595,0.30165,0.189728,0.119333,0.40257,0.253203,0.159257,0.100167,0.063002
4,0.735235,1.0,0.729734,0.108108,0.350851,0.556457,0.607613,1.0,0.588679,0.496254,...,0.114805,0.071943,0.083443,0.09678,0.11225,0.060648,0.070342,0.081585,0.094626,0.109751


## Feature selection

In [46]:
data = pd.DataFrame(data)
describtion = data.describe()
std_dev = np.asarray(describtion.iloc[[2]])
threshold_std_dev = 0.5 * np.max(std_dev)
sel = VarianceThreshold(threshold=(threshold_std_dev**2))
new_data = pd.DataFrame(sel.fit_transform(data))
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84,85,86,87,88,89,90,91,92,93
0,1.0,1.0,0.155299,0.780781,0.0,0.552779,0.529544,0.544244,0.0,0.0,...,0.528859,0.000582,0.371635,0.0,0.09337,0.078634,0.087735,0.0,0.0,0.279692
1,0.525025,0.0,0.287801,0.405906,0.935435,0.038638,0.446945,0.846832,0.587074,0.301301,...,0.0,0.006861,0.027146,0.765694,2e-06,0.039904,0.514267,0.118788,0.008241,9.3e-05
2,0.805305,0.0,0.453402,0.757257,0.0,0.47755,0.544566,0.0,0.93739,0.778877,...,0.0,0.04226,0.328832,0.0,0.052009,0.087943,0.0,0.772114,0.368023,9.3e-05
3,0.805305,0.0,0.408275,0.454454,0.477978,0.859303,0.383259,0.804541,0.94895,0.796545,...,0.0,0.027785,0.042654,0.052195,0.545238,0.021576,0.418979,0.810912,0.40257,0.063002
4,0.735235,1.0,0.729734,0.108108,0.350851,0.556457,0.607613,1.0,0.588679,0.496254,...,0.331287,0.283569,0.000137,0.015153,0.09588,0.136304,1.0,0.120092,0.060648,0.109751


In [47]:
data = new_data

# Train-Test split

In [48]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(data, y_classes)
for train_index, test_index in sss.split(data, y_classes):
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = y_classes.iloc[train_index], y_classes.iloc[test_index]

# FItting and Getting results

In [81]:
pred, acc, max_acc, estimators, best_est, best_X_test = get_estimators_predictions(X_train, y_train,
                                        SVC(gamma = 1, kernel = 'linear'),
                                        X_test, y_test, 1, 94, n_estimators=30)
predicted = make_prediction(pred)
best_X_test.columns.values

array([32], dtype=int64)

In [82]:
print('max accuracy = ' + str(max_acc))
print ('majority accuracy = ' + str(accuracy_score(predicted, y_test)))

max accuracy = 0.2924187725631769
majority accuracy = 0.26353790613718414


In [None]:
X_test_new = X_test
for i in range(0, len(X_test.columns.values)):
    if i == 28:
        continue
    else :
        X_test_new = X_test_new.drop(i ,axis=1)
new_predicted = best_est.predict(X_test_new)
print ('best estimator accuracy = ' + str(accuracy_score(new_predicted, y_test)))

In [14]:
np.sort(acc)

array([0.20938628, 0.20938628, 0.20938628, 0.20938628, 0.20938628,
       0.20938628, 0.21299639, 0.21299639, 0.21299639, 0.21299639,
       0.21299639, 0.21299639, 0.21299639, 0.21299639, 0.21299639,
       0.21299639, 0.21299639, 0.21299639, 0.21299639, 0.2166065 ,
       0.2166065 , 0.2166065 , 0.2166065 , 0.2166065 , 0.2166065 ,
       0.2166065 , 0.22382671, 0.22382671, 0.22382671, 0.22382671,
       0.22382671, 0.23104693, 0.23104693, 0.23465704, 0.23465704,
       0.23465704, 0.23465704, 0.23465704, 0.23465704, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.23826715, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.23826715, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.23826715, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.23826715, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.23826715, 0.23826715,
       0.23826715, 0.23826715, 0.23826715, 0.24187726, 0.24187726,
       0.24187726, 0.24187726, 0.24187726, 0.24187726, 0.24909