# Import necessary libraries

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from random import randint
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectFromModel

# Import classifiers

In [38]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Implemented Tools

In [3]:
def majority_element(num_list):
    """Return the most frequent element of ``num_list`` (plurality vote).

    The previous Boyer-Moore implementation is only correct when one value
    occupies more than half of the positions; with three or more classes and
    no strict majority it could return a value that is not the most common
    one. Counting occurrences gives the same answer whenever a strict
    majority exists and the expected plurality winner otherwise.

    Args:
        num_list: Non-empty sequence (list, tuple, 1-D array) of hashable
            values, e.g. class labels.

    Returns:
        The value with the highest count. Ties are resolved in favour of the
        value that appears first in ``num_list``.

    Raises:
        IndexError: If ``num_list`` is empty (same exception type the
            original raised in that case).
    """
    from collections import Counter

    # len() instead of truthiness so NumPy arrays are accepted too.
    if len(num_list) == 0:
        raise IndexError("majority_element() requires a non-empty sequence")

    # most_common keeps first-seen order among equal counts.
    return Counter(num_list).most_common(1)[0][0]

def make_prediction(pred):
    """Combine several estimators' predictions into one vote per sample.

    Args:
        pred: Sequence of prediction sequences, one per estimator. The number
            of samples is taken from the first entry, ``pred[0]``.

    Returns:
        A list whose i-th item is ``majority_element`` applied to the i-th
        prediction of every estimator.
    """
    n_samples = len(pred[0])
    return [
        majority_element([estimator_pred[sample] for estimator_pred in pred])
        for sample in range(n_samples)
    ]

def get_estimators_predictions(x, y, classifier, test, y_test, num_of_f, original_N_F=27, n_estimators = None, selective = None):
    """Fit ``classifier`` on random feature subsets and score every fit.

    In each round ``original_N_F - num_of_f`` columns are picked uniformly at
    random and removed from both ``x`` and ``test``; ``classifier`` is then
    fitted on the reduced training frame and scored with ``accuracy_score``
    against ``y_test``.

    Args:
        x: Training features (DataFrame).
        y: Training labels; ``y.values.ravel()`` is passed to ``fit``.
        classifier: Estimator exposing ``fit``/``predict``. It is refitted in
            place every round; the stored estimators are deep copies.
        test: Test features with the same columns as ``x``.
        y_test: Test labels used for scoring.
        num_of_f: Number of features to keep (relative to ``original_N_F``).
        original_N_F: Nominal number of features in ``x``; the number of
            dropped columns is ``original_N_F - num_of_f``.
        n_estimators: Number of rounds; defaults to ``len(test)``.
        selective: Optional accuracy threshold. When given, the column sets of
            all rounds scoring strictly above it are collected and returned.

    Returns:
        ``(predictions, accuracies, max_acc, estimators, best_estimator,
        best_X_test)`` and, when ``selective`` is not None, a seventh item
        ``X_train_features``. ``best_X_test`` is the reduced *training* frame
        of the best round (the name is historical). ``best_estimator`` and
        ``best_X_test`` are None when no round scores above 0.

    Raises:
        ValueError: If more columns must be dropped than ``x`` contains.
    """
    from random import sample

    estimators = []
    predictions = []
    accuracies = []
    max_acc = 0
    best_X_test = None
    # Initialised so the return statement cannot hit an unbound name when
    # every round scores 0.
    best_estimator = None
    selective_flag = selective is not None
    if selective_flag:
        X_train_features = []
    features = list(x.columns.values)
    if n_estimators is None:
        n_estimators = len(test)
    # A non-positive difference means "drop nothing", as range() did before.
    n_to_drop = max(0, original_N_F - num_of_f)
    for _ in range(n_estimators):
        # Uniform draw without replacement. The old randint + linear probing
        # could never pick the last ``num_of_f - 1`` columns and was biased.
        dropped = sample(features, n_to_drop)
        x_subset = x.drop(columns=dropped)
        test_subset = test.drop(columns=dropped)
        clf = classifier.fit(x_subset, y.values.ravel())
        # fit() mutates ``classifier``; snapshot it so later rounds do not
        # overwrite the stored model.
        estimators.append(deepcopy(clf))
        clf_predictions = clf.predict(test_subset)
        predictions.append(clf_predictions)
        acc_score = accuracy_score(clf_predictions, y_test)
        accuracies.append(acc_score)
        if selective_flag and acc_score > selective:
            X_train_features.append(x_subset.columns.values)
        if acc_score > max_acc:
            max_acc = acc_score
            best_X_test = x_subset
            best_estimator = deepcopy(clf)
    if selective_flag:
        return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test, X_train_features
    return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test

def estimate_on_portions(x, y, classifier, n, x_test, y_test):
    """Fit ``classifier`` on ``n`` consecutive row chunks and score each fit.

    The training data is cut into ``n`` contiguous portions of
    ``int(len(x) / n)`` rows each; trailing rows that do not fill a whole
    portion are not used. After each fit the model is evaluated on
    ``x_test`` / ``y_test`` with ``accuracy_score``.

    Args:
        x: Training features, sliceable by row position (``x[a:b]``).
        y: Training labels, sliceable the same way; ``.values.ravel()`` is
            passed to ``fit``.
        classifier: Estimator exposing ``fit``/``predict``; refitted in place.
        n: Number of portions.
        x_test: Test features.
        y_test: Test labels.

    Returns:
        ``(max_acc, acc, best_est, estimators, pred)``: the best accuracy,
        all accuracies, the best fitted model (None if no portion scores
        above 0), every fitted model, and every prediction array.
    """
    best_est = None
    estimators = []
    acc = []
    max_acc = 0
    pred = []
    interval = int(len(x) / n)
    for portion in range(n):
        lim1 = portion * interval
        lim2 = lim1 + interval
        portion_x = x[lim1:lim2]
        portion_y = y[lim1:lim2]
        # fit() returns the same (mutated) estimator object, so without a
        # copy every stored entry would end up being the last fitted model.
        clf = deepcopy(classifier.fit(portion_x, portion_y.values.ravel()))
        predicted = clf.predict(x_test)
        pred.append(predicted)
        current_acc = accuracy_score(y_test, predicted)
        if current_acc > max_acc:
            max_acc = current_acc
            best_est = clf
        acc.append(current_acc)
        estimators.append(clf)
    return max_acc, acc, best_est, estimators, pred

def discretization(x, feature, ranges):
    """Return a copy of ``x`` with ``feature`` mapped onto range labels.

    Every value ``v`` of ``x[feature]`` with ``low <= v < high`` is replaced
    by ``label`` for each ``(low, high, label)`` entry in ``ranges``. Values
    outside every range are left unchanged, and ``x`` itself is not modified.

    Membership is always tested against the *original* values, so a label
    written by one range can never be re-binned by a later range. Writes go
    through ``.loc`` rather than chained indexing (``data[col][row] = ...``),
    which pandas does not guarantee to update the frame.

    Args:
        x: Source DataFrame.
        feature: Name of the column to discretize.
        ranges: Iterable of entries whose items 0, 1 and 2 are the inclusive
            lower bound, the exclusive upper bound and the replacement value.

    Returns:
        A new DataFrame with the discretized column.
    """
    data = deepcopy(x)
    original = x[feature]
    for bounds in ranges:
        # Positional mask, so the row labels of ``x`` do not matter.
        in_range = ((original >= bounds[0]) & (original < bounds[1])).to_numpy()
        data.loc[in_range, feature] = bounds[2]
    return data


def discretization_HGB(x, male_ranges, female_ranges):
    """Return a copy of ``x`` with ``'HGB'`` discretized per ``'Gender'``.

    Rows with ``Gender == 1`` are binned with ``male_ranges`` and rows with
    ``Gender == 2`` with ``female_ranges``; rows with any other gender value
    and values outside every range are left unchanged. Each range entry is
    ``(low, high, label)`` with ``low <= value < high``.

    The two range lists are processed independently, so they may have
    different lengths. Membership is tested against the original ``HGB``
    values, so a freshly written label is never re-binned, and writes use
    ``.loc`` instead of chained indexing.

    Args:
        x: Source DataFrame with ``'Gender'`` and ``'HGB'`` columns.
        male_ranges: Range entries applied where ``Gender == 1``.
        female_ranges: Range entries applied where ``Gender == 2``.

    Returns:
        A new DataFrame; ``x`` is not modified.
    """
    data = deepcopy(x)
    gender = x['Gender'].to_numpy()
    hgb = x['HGB']
    for gender_code, ranges in ((1, male_ranges), (2, female_ranges)):
        has_gender = gender == gender_code
        for bounds in ranges:
            in_range = ((hgb >= bounds[0]) & (hgb < bounds[1])).to_numpy()
            data.loc[has_gender & in_range, 'HGB'] = bounds[2]
    return data

def drop_class(x,y, c):
    """Remove every sample whose label equals ``c``.

    The label is read from column ``0`` of ``y``. Rows are matched by
    position, so ``x`` and ``y`` must have the same number of rows but may
    carry any index. Filtering is done with one boolean mask instead of
    dropping rows one at a time.

    Args:
        x: Feature DataFrame.
        y: Label DataFrame with the class in column ``0``.
        c: Class value to remove.

    Returns:
        ``(data, result)``: the filtered copies of ``x`` and ``y`` after
        ``reset_index()``. As before, the previous row labels are kept in a
        new ``'index'`` column of each returned frame.
    """
    keep = (y[0] != c).to_numpy()
    return x.loc[keep].reset_index(), y.loc[keep].reset_index()

# Reading the data

In [8]:
# Load the dataset and split off the target column.
# The path is a raw string: sequences such as "\B", "\S" and "\p" in a normal
# literal are invalid escapes (a warning today, an error in future Python
# versions). The resulting path text is unchanged.
data = pd.read_csv(r"D:\Bachelor's final year\Second Semester\ML\project\HCV-Egy-Data.csv")
# Labels are kept as a one-column DataFrame (column name 0).
y_classes = pd.DataFrame(np.asarray(data['Baselinehistological staging']))
data = data.drop('Baselinehistological staging', axis=1)

# Experiments

## Dropping features

In [9]:
# Remove the features excluded from this experiment in a single call.
# Commented-out entries are features that are kept; uncomment one to drop it.
data = data.drop(columns=[
    'Age',
    'Gender',
    # 'BMI',
    'Headache ',
    'Diarrhea ',
    'Fatigue & generalized bone ache ',
    'Jaundice ',
    'WBC',
    'RBC',
    'HGB',
    'AST 1',
    'ALT 1',
    'ALT4',
    # 'ALT 12',
    'ALT 24',
    'ALT 36',
    'ALT 48',
    # 'ALT after 24 w',
    # 'RNA Base',
    # 'RNA 4',
    # 'RNA 12',
    # 'RNA EOT',
    # 'RNA EF',
    'Fever',
    'Nausea/Vomting',
    # 'Epigastric pain ',
    # 'Plat',
    # 'Baseline histological Grading',
])

## Pre-processing

In [10]:
# Rescale every remaining feature with MinMaxScaler and wrap the resulting
# array in a new DataFrame; the original column names are replaced by
# integer positions.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later in this notebook, so test rows influence the scaling —
# confirm this is acceptable for the experiment.
data = pd.DataFrame(MinMaxScaler().fit_transform(data))

## Polynomial features

In [11]:
# Expand the features into all polynomial terms up to degree 4, then discard
# column 0 (the constant bias term PolynomialFeatures emits by default).
data = pd.DataFrame(PolynomialFeatures(degree=4).fit_transform(data))
data = data.drop(columns=0)
# Sanity check: labels and features must have the same number of rows.
print(f'classes length = {len(y_classes)}  data length = {len(data)}')
data.head()

classes length = 1385  data length = 1385


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1355,1356,1357,1358,1359,1360,1361,1362,1363,1364
0,1.0,1.0,0.143266,0.786517,0.0,0.54561,0.528023,0.077231,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.350128
1,0.538462,0.0,0.272415,0.404494,0.975,0.033811,0.44822,0.170721,0.416601,0.038355,...,0.001027,2.4e-05,4.7e-05,9.5e-05,0.00019,2e-06,4e-06,9e-06,1.7e-05,3.5e-05
2,0.846154,0.0,0.438431,0.764045,0.0,0.475522,0.550333,0.0,0.910315,0.689627,...,0.004903,0.298562,0.033303,0.003715,0.000414,0.226181,0.025229,0.002814,0.000314,3.5e-05
3,0.846154,0.0,0.400477,0.460674,0.7,0.867498,0.374411,0.156956,0.920852,0.718593,...,0.245861,0.341695,0.256041,0.191859,0.143765,0.266644,0.199804,0.149718,0.112188,0.084066
4,0.769231,1.0,0.709406,0.101124,0.625,0.54984,0.61475,1.0,0.419251,0.299701,...,0.066564,0.011286,0.023174,0.047583,0.097704,0.008068,0.016566,0.034015,0.069844,0.143412


## Feature selection

In [18]:
# Variance-based feature selection: keep only the columns whose variance
# exceeds the square of half of the largest per-column standard deviation.
data = pd.DataFrame(data)
describtion = data.describe()
# Row 2 of describe() is the per-column standard deviation ("std").
std_dev = np.asarray(describtion.iloc[[2]])
threshold_std_dev = 0.5 * np.max(std_dev)
# VarianceThreshold compares variances, hence the squared threshold.
# NOTE(review): describe() reports the sample std (ddof=1) while
# VarianceThreshold presumably uses the population variance — confirm the
# small mismatch is irrelevant here.
sel = VarianceThreshold(threshold=(threshold_std_dev**2))
new_data = pd.DataFrame(sel.fit_transform(data))
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,82,83,84,85,86,87,88,89,90,91
0,1.0,1.0,0.143266,0.786517,0.54561,0.528023,0.0,0.0,0.769231,1.0,...,0.591716,0.0,0.147218,0.455166,0.000421,0.382677,0.0,0.08862,0.077734,0.350128
1,0.538462,0.0,0.272415,0.404494,0.033811,0.44822,0.416601,0.038355,0.076923,0.289941,...,0.0,0.0,0.0,0.0,0.005507,0.02677,0.903688,1e-06,0.040361,3.5e-05
2,0.846154,0.0,0.438431,0.764045,0.475522,0.550333,0.910315,0.689627,0.076923,0.715976,...,0.0,0.0,0.0,0.0,0.036949,0.340781,0.0,0.051131,0.091728,3.5e-05
3,0.846154,0.0,0.400477,0.460674,0.867498,0.374411,0.920852,0.718593,0.538462,0.715976,...,0.0,0.0,0.0,0.0,0.025722,0.045038,0.2401,0.566335,0.019652,0.084066
4,0.769231,1.0,0.709406,0.101124,0.54984,0.61475,0.419251,0.299701,0.615385,0.591716,...,0.378698,0.244141,0.232325,0.233045,0.253268,0.000105,0.152588,0.0914,0.142822,0.143412


In [19]:
# From here on, work with the variance-selected features only.
data = new_data

# Train-Test split

In [20]:
# Stratified 80/20 shuffle split with a fixed seed for reproducibility.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(data, y_classes)
# Five splits are generated but only the last one is used for the
# train/test sets below.
train_index, test_index = list(sss.split(data, y_classes))[-1]
X_train = data.iloc[train_index]
X_test = data.iloc[test_index]
y_train = y_classes.iloc[train_index]
y_test = y_classes.iloc[test_index]

# Fitting and getting results

In [47]:
# Train 100 LinearSVC models, each on a single randomly chosen feature out of
# the 92 selected ones, and score every model on the test set.
(pred, acc, max_acc,
 estimators, best_est, best_X_test) = get_estimators_predictions(
    X_train, y_train, LinearSVC(), X_test, y_test,
    num_of_f=1, original_N_F=92, n_estimators=100,
)
# Majority vote across all models.
predicted = make_prediction(pred)
# Show which column(s) the best single model was trained on.
best_X_test.columns.values

array([28], dtype=int64)

In [48]:
# Report the best single-model accuracy and the accuracy of the majority vote.
majority_acc = accuracy_score(predicted, y_test)
print(f'max accuracy = {max_acc}')
print(f'majority accuracy = {majority_acc}')

max accuracy = 0.33935018050541516
majority accuracy = 0.2815884476534296


In [51]:
# Re-evaluate the best estimator on the test set, restricted to exactly the
# columns it was trained on. The columns are taken from best_X_test instead
# of a hard-coded label (previously 28, copied from one run's output), so the
# cell stays valid when the random feature selection is re-run.
X_test_new = X_test[best_X_test.columns]
new_predicted = best_est.predict(X_test_new)
print ('best estimator accuracy = ' + str(accuracy_score(new_predicted, y_test)))

best estimator accuracy = 0.33935018050541516


In [None]:
# Display the accuracies of all fitted estimators in ascending order.
np.sort(acc)