In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### Load dataset used for testing

In [2]:
# Read the unlabeled test set from the current working directory.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written out with the CSV. It stays in `data` as a feature column;
# confirm the fitted scaler/model expect it.
data = pd.read_csv('test_no_class.csv')
data.head()

Unnamed: 0,AGE,SEX,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,LIVER BIG,LIVER FIRM,SPLEEN PALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,ALK PHOSPHATE,SGOT,ALBUMIN,PROTIME,HISTOLOGY
0,39,Male,2,2,1,2,2,2,1,2,2,2,2,0.7,?,48,4.4,?,1
1,41,Feamle,2,1,1,1,1,2,2,2,2,2,2,0.7,81,53,5.0,74,1
2,28,maled,1,2,1,1,1,2,1,2,2,2,2,1.6,44,123,4.0,46,1
3,36,maled,1,2,1,1,1,2,1,2,2,2,2,1.0,?,45,4.0,57,1
4,32,M,2,2,2,2,2,2,2,2,2,2,2,0.7,102,64,4.0,90,1


### Clean the data

In [3]:
def replace_by_mean(data, col, c):
    """Impute a placeholder value in one column with that column's mean.

    Entries of ``data[col]`` equal to ``c`` are treated as missing, the
    column is converted to a numeric dtype, and the missing entries are
    filled with the mean of the remaining (observed) values. The result is
    assigned back to ``data[col]``; nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify in place.
    col : str
        Label of the column to clean.
    c : object
        Placeholder that marks a missing entry (e.g. ``'?'``).

    Raises
    ------
    ValueError
        If the column holds a non-numeric value other than ``c``
        (raised by ``pd.to_numeric``).

    Notes
    -----
    Placeholders are turned into NaN rather than 0 so that they do not drag
    the mean down and so that genuine zeros in the column are kept as-is.
    If every entry is a placeholder the mean is NaN and the column stays NaN.
    """
    column = data[col]
    # Mask first, then convert: to_numeric would raise on the raw placeholder.
    numeric = pd.to_numeric(column.mask(column == c))
    # Plain column assignment instead of a chained in-place update, so the
    # write reaches `data` regardless of pandas' copy-on-write mode.
    data[col] = numeric.fillna(numeric.mean())

def clean_data(data):
    """Return a cleaned copy of ``data``; the input frame is not modified.

    Two steps are applied to the copy:

    1. The listed spellings in the ``SEX`` column are collapsed to ``'M'``
       or ``'F'``. Spellings not in the mapping are left as they are.
    2. ``replace_by_mean`` is called with the ``'?'`` placeholder on each
       column named below. ``AGE`` is not processed.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing ``SEX`` and the columns listed below.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    sex_aliases = dict.fromkeys(('m', 'meale', 'Male', 'maled', 'male'), 'M')
    sex_aliases.update(dict.fromkeys(('Feamle', 'female'), 'F'))

    # Columns that may carry the '?' placeholder, in processing order.
    placeholder_columns = (
        'PROTIME', 'STEROID', 'ANTIVIRALS', 'FATIGUE', 'MALAISE',
        'ANOREXIA', 'LIVER BIG', 'LIVER FIRM', 'SPIDERS',
        'SPLEEN PALPABLE', 'ASCITES', 'BILIRUBIN', 'ALK PHOSPHATE',
        'VARICES', 'SGOT', 'ALBUMIN', 'HISTOLOGY',
    )

    cleaned = data.copy()
    cleaned['SEX'] = cleaned['SEX'].replace(sex_aliases)
    for column in placeholder_columns:
        replace_by_mean(cleaned, column, '?')

    return cleaned

In [4]:
# Rebind `data` to the cleaned copy returned by clean_data (SEX labels
# normalized, '?' placeholders imputed). The raw frame is no longer
# reachable under this name, so re-run the load cell before re-running this one.
data = clean_data(data)

### Encode and scale the data

In [5]:
# One-hot encode the remaining non-numeric columns; drop_first=True drops the
# first level of each encoded column.
# NOTE(review): the resulting columns depend on which categories occur in this
# file — confirm they match the columns the scaler was fitted on.
data = pd.get_dummies(data, drop_first=True)

In [6]:
# Load the previously saved scaler object from disk.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with open('standard_scaler.pickle', 'rb') as file:
    scaler = pickle.load(file)

# Apply the loaded scaler and wrap the result in a DataFrame that reuses the
# column labels of `data`.
# NOTE(review): `data_scaled` is not referenced again in the visible notebook.
data_scaled = pd.DataFrame(scaler.transform(data), columns=data.columns)

### Load the random forest model

In [7]:
# Load the pickled random forest and bind it to `randForest`, the name the
# prediction cell calls `.predict` on.
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that come from a trusted source.
with open('random_forest.pickle', 'rb') as file:
    randForest = pickle.load(file)

### Do the predictions

In [8]:
# Predict a class label for every row of the encoded test frame.
# NOTE(review): this passes the unscaled `data`, while `data_scaled` is
# computed earlier and never used — confirm which representation the model
# was trained on.
y_rF_pred  = randForest.predict(data)
# Disabled alternative predictor, kept for reference:
#y_knn_pred = knn.predict(data)

### Store prediction result to CSV file

In [9]:
# Write the predictions held in `y_rF_pred` as a single 'Class' column. The
# default integer index is written as the first CSV column, as before.
pd.DataFrame({'Class': y_rF_pred}).to_csv('group_3_rf.csv')