# Import necessary libararies

In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from random import randint
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.feature_selection import VarianceThreshold

# Import classifiers

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Implemented Tools

In [3]:
def majority_element(num_list):
    idx, ctr = 0, 1
        
    for i in range(1, len(num_list)):
        if num_list[idx] == num_list[i]:
            ctr += 1
        else:
            ctr -= 1
            if ctr == 0:
                idx = i
                ctr = 1
        
    return num_list[idx]

def make_prediction(pred):
    predicted = []
    for i in range(len(pred[0])):
        lst2 = [item[i] for item in pred]
        predicted.append(majority_element(lst2))
    return predicted

def get_estimators_predictions(x, y, classifier, test, y_test, num_of_f, original_N_F=27, n_estimators = None, selective = None):
    estimators = []
    predictions = []
    accuracies = []
    max_acc = 0
    best_X_test = None
    selective_flag = False
    if selective is not None:
        X_train_features = []
        selective_flag = True
    features = x.columns.values
    if n_estimators is None:
        n_estimators = len(test)
    for i in range(n_estimators):
        f = []
        myset = set()
        for j in range(original_N_F-num_of_f):
            f_num = randint(0,len(features)-num_of_f)
            while myset.issuperset([f_num]):
                f_num = f_num + 1
                if f_num > original_N_F-num_of_f:
                    f_num = 0
            myset.add(f_num)
            f.append(features[f_num])
        x_copy = deepcopy(x)
        test_copy = deepcopy(test)
        for fi in f:
            x_copy = x_copy.drop(fi, axis=1)
            test_copy = test_copy.drop(fi, axis=1)
        clf = classifier
        clf = clf.fit(x_copy, y.values.ravel())
        estimators.append(deepcopy(clf))
        clf_predictions = clf.predict(test_copy)
        predictions.append(clf_predictions)
        acc_score = accuracy_score(clf_predictions, y_test)
        accuracies.append(acc_score)
        if selective_flag :
            if acc_score > selective :
                X_train_features.append(x_copy.columns.values)
        if acc_score > max_acc:
            max_acc = acc_score
            best_X_test = x_copy
            best_estimator = deepcopy(clf)
    if selective_flag :
        return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test, X_train_features
    return predictions, accuracies, max_acc, estimators, best_estimator, best_X_test

def estimate_on_portions(x, y, classifier, n, x_test, y_test):
    best_est = None
    estimators = []
    acc = []
    max_acc = 0
    pred = []
    interval = int(len(x)/n)
    lim1 = 0
    lim2 = interval
    for i in range(n):
        portion_x = x[lim1:lim2]
        portion_y = y[lim1:lim2]
        clf = classifier.fit(portion_x, portion_y.values.ravel())
        predicted = clf.predict(x_test)
        pred.append(predicted)
        current_acc = accuracy_score(y_test, predicted)
        if current_acc > max_acc:
            max_acc = current_acc
            best_est = clf  
        acc.append(current_acc)
        estimators.append(clf)
        lim1 = lim1 + interval
        lim2 = lim2 + interval
    return max_acc, acc, best_est, estimators, pred

def discretization(x, feature, ranges):
    data = deepcopy(x)
    for i in range(len(ranges)):
        for j in range(len(data)):
            if ranges[i][0] <= data[feature][j] < ranges[i][1]:
                data[feature][j] = ranges[i][2]
    return data


def discretization_HGB(x, male_ranges, female_ranges):
    data = deepcopy(x)
    for i in range(len(male_ranges)):
        for j in range(len(data)):
            if data['Gender'][j] == 1:
                if male_ranges[i][0] <= data['HGB'][j] < male_ranges[i][1]:
                    data['HGB'][j] = male_ranges[i][2]
            elif data['Gender'][j] == 2:
                if female_ranges[i][0] <= data['HGB'][j] < female_ranges[i][1]:
                    data['HGB'][j] = female_ranges[i][2]
    return data

def drop_class(x,y, c):
    result = deepcopy(y)
    data = deepcopy(x)
    for i in range(len(data)):
        if y[0][i] == c:
            data = data.drop(i, axis = 0)
            result = result.drop(i, axis=0)
    return data.reset_index(), result.reset_index()

# Reading the data

In [11]:
data = pd.read_csv("D:\Bachelor's final year\Second Semester\ML\project\HCV-Egy-Data.csv")
y_classes = pd.DataFrame(np.asarray(data['Baselinehistological staging']))
data = data.drop('Baselinehistological staging', axis=1)

# Experiments

## Droping features

In [12]:
data = data.drop('Age', axis = 1)
data = data.drop('Gender', axis = 1)
data = data.drop('BMI', axis = 1)
data = data.drop('Headache ', axis = 1)
data = data.drop('Diarrhea ', axis = 1)
data = data.drop('Fatigue & generalized bone ache ', axis = 1)
data = data.drop('Jaundice ', axis = 1)
data = data.drop('WBC', axis = 1)
data = data.drop('RBC', axis = 1)
data = data.drop('HGB', axis = 1)
data = data.drop('AST 1', axis = 1)
data = data.drop('ALT 1', axis = 1)
data = data.drop('ALT4', axis = 1)
#data = data.drop('ALT 12', axis = 1)
data = data.drop('ALT 24', axis = 1)
data = data.drop('ALT 36', axis = 1)
data = data.drop('ALT 48', axis = 1)
data = data.drop('ALT after 24 w', axis = 1)
data = data.drop('RNA Base', axis = 1)
data = data.drop('RNA 4', axis = 1)
#data = data.drop('RNA 12', axis = 1)
#data = data.drop('RNA EOT', axis = 1)
#data = data.drop('RNA EF', axis = 1)
data = data.drop('Fever', axis = 1)
data = data.drop('Nausea/Vomting', axis = 1)
data = data.drop('Epigastric pain ', axis = 1)
data = data.drop('Plat', axis = 1)
#data = data.drop('Baseline histological Grading', axis = 1)

## Polynomial features

In [13]:
data = PolynomialFeatures(5).fit_transform(data)
data = pd.DataFrame(data)
data = data.drop(0, axis = 1)
print('classes length = ' + str(len(y_classes)) +'  data length = ' + str(len(data)))
data.head()

classes length = 1385  data length = 1385


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,242,243,244,245,246,247,248,249,250,251
0,109.0,288194.0,5.0,5.0,13.0,11881.0,31413146.0,545.0,545.0,1417.0,...,8125.0,21125.0,54925.0,142805.0,3125.0,8125.0,21125.0,54925.0,142805.0,371293.0
1,75.0,637056.0,336804.0,31085.0,4.0,5625.0,47779200.0,25260300.0,2331375.0,300.0,...,4.046596e+19,5207137000000000.0,670051300000.0,86221820.0,2.902381e+22,3.734767e+18,480587600000000.0,61841740000.0,7957760.0,1024.0
2,107.0,5.0,735945.0,558829.0,4.0,11449.0,535.0,78746115.0,59794703.0,428.0,...,5.137386e+23,3.67725e+18,26321110000000.0,188401900.0,5.449977e+28,3.900998e+23,2.792266e+18,19986550000000.0,143060200.0,1024.0
3,80.0,585688.0,744463.0,582301.0,10.0,6400.0,46855040.0,59557040.0,46584080.0,800.0,...,1.469893e+24,2.524284e+19,433501500000000.0,7444630000.0,6.694801e+28,1.149715e+24,1.974434e+19,339074500000000.0,5823010000.0,100000.0
4,48.0,3731527.0,338946.0,242861.0,11.0,2304.0,179113296.0,16269408.0,11657328.0,528.0,...,5.34068e+22,2.418975e+18,109563600000000.0,4962508000.0,8.448681e+26,3.826695e+22,1.73324e+18,78504330000000.0,3555728000.0,161051.0


In [8]:
data = new_data

# Train-Test split

In [14]:
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
sss.get_n_splits(data, y_classes)
for train_index, test_index in sss.split(data, y_classes):
    X_train, X_test = data.iloc[train_index], data.iloc[test_index]
    y_train, y_test = y_classes.iloc[train_index], y_classes.iloc[test_index]

# FItting and Getting results

In [15]:
pred, acc, max_acc, estimators, best_est, best_X_test = get_estimators_predictions(X_train, y_train,
                                        #RandomForestClassifier(n_estimators=100, max_depth=3, max_samples = 0.3), 
                                        KNeighborsClassifier(10),  
                                        X_test, y_test, 2, 251, n_estimators=100)
predicted = make_prediction(pred)
best_X_test.columns.values

array([ 13, 251], dtype=int64)

In [16]:
print('max accuracy = ' + str(max_acc))
print ('majority accuracy = ' + str(accuracy_score(predicted, y_test)))

max accuracy = 0.33212996389891697
majority accuracy = 0.2563176895306859


In [17]:
X_test_new = X_test
for i in range(1, len(X_test.columns.values)+1):
    if i == 13 or i == 251:
        continue
    else :
        X_test_new = X_test_new.drop(i ,axis=1)
new_predicted = best_est.predict(X_test_new)
print ('best estimator accuracy = ' + str(accuracy_score(new_predicted, y_test)))

best estimator accuracy = 0.33212996389891697


In [None]:
np.sort(acc)