In [1]:
%%html
<style>
/* Any CSS style can go in here. */
.dataframe th {
    font-size: 18px;
}
.dataframe td {
    font-size: 16px;
}
</style>

# Health Recommender System for Cervical Cancer Prognosis in Women!
The goal of this note book is to replicate the research done by the paper with the same name 
Overall some notes in implemenatation:
- SMOTE has to be used to balance the training data set
- Used different machines learning models for this dataset
- Biopsy is used as the target class
- Use MOGA(Multi-Objective Genetic Algorithm to narrow down the feature sets --- Later) 

Cited paper:
Kuanr, Madhusree, et al. “Health Recommender System for Cervical Cancer Prognosis in Women.” 2021 6th International Conference on Inventive Computation Technologies (ICICT), Inventive Computation Technologies (ICICT), 2021 6th International Conference On, Jan. 2021, pp. 673–79. EBSCOhost, https://doi-org.ezproxy.mtsu.edu/10.1109/ICICT50816.2021.9358540.

In [2]:
import plotly.io as pio
pio.renderers.default = "notebook+pdf"

In [3]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from typing import List
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler

from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

import warnings

warnings.filterwarnings('ignore')

In [4]:
from catboost import CatBoostClassifier
from catboost import Pool

def fit_model(train_pool, validation_pool, **kwargs):
    model = CatBoostClassifier(
        iterations=1000,
        use_best_model= True,
        #task_type='GPU'
    )

    return model.fit(
        train_pool,
        eval_set=validation_pool,
        verbose=100,
    )

In [5]:
df=pd.read_csv("./data/kag_risk_factors_cervical_cancer.csv")
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [6]:
target = 'Biopsy'

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           858 non-null    object
 2   First sexual intercourse            858 non-null    object
 3   Num of pregnancies                  858 non-null    object
 4   Smokes                              858 non-null    object
 5   Smokes (years)                      858 non-null    object
 6   Smokes (packs/year)                 858 non-null    object
 7   Hormonal Contraceptives             858 non-null    object
 8   Hormonal Contraceptives (years)     858 non-null    object
 9   IUD                                 858 non-null    object
 10  IUD (years)                         858 non-null    object
 11  STDs                                858 non-null    object

Remove duplicates

In [8]:
df=df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 835 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 835 non-null    int64 
 1   Number of sexual partners           835 non-null    object
 2   First sexual intercourse            835 non-null    object
 3   Num of pregnancies                  835 non-null    object
 4   Smokes                              835 non-null    object
 5   Smokes (years)                      835 non-null    object
 6   Smokes (packs/year)                 835 non-null    object
 7   Hormonal Contraceptives             835 non-null    object
 8   Hormonal Contraceptives (years)     835 non-null    object
 9   IUD                                 835 non-null    object
 10  IUD (years)                         835 non-null    object
 11  STDs                                835 non-null    object

# Preprocessing

Replace "?" with the median for all columns.

In [9]:
df = df.replace('?', np.nan)

print("Check all Nan counts")
df.isnull().sum()

Check all Nan counts


Age                                     0
Number of sexual partners              25
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               103
Hormonal Contraceptives (years)       103
IUD                                   112
IUD (years)                           112
STDs                                  100
STDs (number)                         100
STDs:condylomatosis                   100
STDs:cervical condylomatosis          100
STDs:vaginal condylomatosis           100
STDs:vulvo-perineal condylomatosis    100
STDs:syphilis                         100
STDs:pelvic inflammatory disease      100
STDs:genital herpes                   100
STDs:molluscum contagiosum            100
STDs:AIDS                             100
STDs:HIV                              100
STDs:Hepatitis B                  

In [10]:
df=df._convert(numeric=True)
df = df.fillna(df.median())

print("Check all Nan counts")
df.isnull().sum()

Check all Nan counts


Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


Other tasks
- Since Hinselmann, Schiller, Citology, Biopsy are all cancer test, create an aggreated group that count all of these information.
- Try to create an aggregated STD column that count all STD prediction
- Create an column for age group for easy visualization

In [11]:
#Create age group for visualization
def age_group(n):
    if n < 12:
        return "Child"
    elif n < 20:
        return "Teen"
    elif n < 30:
        return "20's"
    elif n < 40:
        return "30's"
    elif n < 50:
        return "40's"
    elif n < 60:
        return "50's"
    elif n < 70:
        return "60's"
    else:
        return "70+"

df["age_cat"] = df["Age"].apply(age_group)

In [12]:
#Aggregated std
std_cols = ['STDs:condylomatosis',
            'STDs:cervical condylomatosis',
            'STDs:vaginal condylomatosis',
            'STDs:vulvo-perineal condylomatosis',
            'STDs:syphilis',
            'STDs:pelvic inflammatory disease',
            'STDs:genital herpes',
            'STDs:molluscum contagiosum',
            'STDs:AIDS',
            'STDs:HIV',
            'STDs:Hepatitis B',
            'STDs:HPV']
df["total_std"] = df[list(std_cols)].sum(axis=1)
std_agg_df = df.groupby("age_cat", as_index=False)[list(std_cols)].sum()

In [13]:
#Aggregated test result:
test_cols = ["Hinselmann", "Schiller", "Citology", "Biopsy"]
df["total_tests"] = df[test_cols].sum(axis = 1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 835 entries, 0 to 857
Data columns (total 39 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Age                                 835 non-null    int64  
 1   Number of sexual partners           835 non-null    float64
 2   First sexual intercourse            835 non-null    float64
 3   Num of pregnancies                  835 non-null    float64
 4   Smokes                              835 non-null    float64
 5   Smokes (years)                      835 non-null    float64
 6   Smokes (packs/year)                 835 non-null    float64
 7   Hormonal Contraceptives             835 non-null    float64
 8   Hormonal Contraceptives (years)     835 non-null    float64
 9   IUD                                 835 non-null    float64
 10  IUD (years)                         835 non-null    float64
 11  STDs                                835 non-n

# Data Exploration

In [15]:
corr_matrix = df.corr()
corr_matrix.fillna(0,inplace=True)
corr_graph = px.imshow(corr_matrix, aspect="auto",title="Correlation Matrix")
corr_graph.show()

In [16]:
age_dist = px.histogram(df, x="Age", marginal="box")
age_dist.update_layout(title="Age distribution")
age_dist.show()

In [17]:
age_preg_bar = px.box(df, x="age_cat", y="Num of pregnancies", points="outliers",
                      category_orders=["Teenager", "Twenties", "Thirties", "Forties", "Fifties",
                                       "Seventy and over"])
age_preg_bar.update_xaxes(title="Age Category")
age_preg_bar.update_yaxes(title="Number of Pregnancies")
age_preg_bar.update_layout(title="Distribution of number of pregnancies per age group")
age_preg_bar.show()

In [18]:
diagnoses_num_partner_compare_cols = ['Dx:Cancer',
                                      'Dx:HPV',
                                      'Dx:CIN',
                                      "Number of sexual partners",]
corr_matrix = df[diagnoses_num_partner_compare_cols].corr()
diagnoses_num_partner_heatmap = px.imshow(corr_matrix,
                              aspect="auto",
                              text_auto=True,
                                         title='Correlation between diagnostic and and number of partners',
                                         color_continuous_scale="gnbu")
diagnoses_num_partner_heatmap.show()

In [19]:
diagnoses_cols = ['Hinselmann','Schiller','Citology','Biopsy']
diagnoses_corr_matrix = df[diagnoses_cols].corr()
diagnoses_heatmap = px.imshow(diagnoses_corr_matrix, aspect="auto", text_auto=True,color_continuous_scale="gnbu",
                              title='Diagnostic Correlation')
diagnoses_heatmap.show()

In [20]:
fig = px.histogram(std_agg_df, x="age_cat", y=list(std_cols), barmode="group", histfunc="sum")
fig.update_layout(title="Sum of STD occurence across age categories")
fig.update_xaxes(title="Age Category")
fig.update_yaxes(title="Sum")
fig.show()

# Distribution of Classes

In [21]:
df[target].value_counts()

0    781
1     54
Name: Biopsy, dtype: int64

In [22]:
dx_cancer = px.histogram(df, y=target)
dx_cancer.update_layout(bargap=0.2)
dx_cancer.update_layout(title = "Imbalance in target variable")
dx_cancer.show()

# SMOTE or ADASYN:
- Since we have a issues of imbalance data set, either Smote or Adasyn can be deployed to create synthetic data to balance the dataset
- ADASYN is used for now

In [23]:
X = df.drop([target, "age_cat"], axis=1)
y = df[target].copy()

In [24]:
for name in X.columns:
    print(name)

Age
Number of sexual partners
First sexual intercourse
Num of pregnancies
Smokes
Smokes (years)
Smokes (packs/year)
Hormonal Contraceptives
Hormonal Contraceptives (years)
IUD
IUD (years)
STDs
STDs (number)
STDs:condylomatosis
STDs:cervical condylomatosis
STDs:vaginal condylomatosis
STDs:vulvo-perineal condylomatosis
STDs:syphilis
STDs:pelvic inflammatory disease
STDs:genital herpes
STDs:molluscum contagiosum
STDs:AIDS
STDs:HIV
STDs:Hepatitis B
STDs:HPV
STDs: Number of diagnosis
STDs: Time since first diagnosis
STDs: Time since last diagnosis
Dx:Cancer
Dx:CIN
Dx:HPV
Dx
Hinselmann
Schiller
Citology
total_std
total_tests


In [25]:
# smote = SMOTE(random_state=42)
# x_smote, y_smote = smote.fit_resample(X, y)
# risk_factor_df = x_smote.join(y_smote)
# risk_factor_df["age_cat"] = risk_factor_df["Age"].apply(age_cat)

In [26]:
adasyn = ADASYN(random_state=42)
X,y = adasyn.fit_resample(X,y)

In [27]:
X

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,total_std,total_tests
0,18,4.000000,15.000000,1.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,3.0,0,0,0,0,0,0,0,0.0,0
1,15,1.000000,14.000000,1.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,3.0,0,0,0,0,0,0,0,0.0,0
2,34,1.000000,17.000000,1.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,...,3.0,0,0,0,0,0,0,0,0.0,0
3,52,5.000000,16.000000,4.000000,1.0,37.0,37.0,1.000000,3.000000,0.0,...,3.0,1,0,1,0,0,0,0,0.0,0
4,46,3.000000,21.000000,4.000000,0.0,0.0,0.0,1.000000,15.000000,0.0,...,3.0,0,0,0,0,0,0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,17,1.541296,13.623889,2.541296,0.0,0.0,0.0,1.000000,0.506607,0.0,...,3.0,0,0,0,0,0,0,0,0.0,2
1563,17,1.709061,15.545303,2.000000,0.0,0.0,0.0,0.290939,0.122195,0.0,...,3.0,0,0,0,0,0,0,0,0.0,1
1564,16,1.870969,14.612907,1.129031,0.0,0.0,0.0,0.129031,0.054193,0.0,...,3.0,0,0,0,0,0,0,0,0.0,2
1565,18,2.428174,15.570435,2.000000,0.0,0.0,0.0,1.000000,0.298605,0.0,...,3.0,0,0,0,0,0,0,0,0.0,1


In [28]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1562    1
1563    1
1564    1
1565    1
1566    1
Name: Biopsy, Length: 1567, dtype: int64

In [29]:
csv_SK=X
csv_SK[target]=y

In [30]:
csv_SK.to_csv("ADASYN.csv",index=False)

# Train - test split

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify = y, shuffle=True)#stratify = y

In [32]:
y_train.value_counts()

1    526
0    523
Name: Biopsy, dtype: int64

# PCA
- Principal Component Analysis is a method to reduce the dimenstion of larger dataset. This is used to increase performace. In this case 13 parameters can be used to explained 99% of the variable


In [33]:
XT = RobustScaler().fit_transform(X_train)
pca = PCA(n_components=0.99)
XT = pca.fit_transform(XT)

dimensions = px.bar(x=range(pca.n_components_), y=pca.explained_variance_ratio_,
                    color_discrete_sequence=["blue"],
                    labels={"x":"PCA Feature","y":"Explained Variance"})
dimensions.show()
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)

explained_variance = px.area(
    x=range(1, exp_var_cumul.shape[0] + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
    color_discrete_sequence=["blue"]
)
explained_variance.show()

# Model Application SkLearn
- 5 models are considered for this project
    - Logistic Regression
    - Random forest Classifier
    - KNeighbor Classifier
    - Support Vector Machine
    - CatBoost

In [34]:
pipeline = Pipeline([
    ("scaler", RobustScaler()),
    ("pca", PCA(n_components=13))
])
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [35]:
param_grid = {'C': np.logspace(-5, 8, 15)}
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid, cv=10)

In [36]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=2, n_jobs=-1,
                                 max_features="sqrt")

In [37]:
knn_clf = KNeighborsClassifier()
knn_param_grid = {"n_neighbors": list(np.arange(1, 10, 2))}
knn_clf_cv = GridSearchCV(knn_clf, knn_param_grid, cv=10)

In [38]:
svm_clf = SVC()
svc_param_grid = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-3, 2, 6), }
svm_clf_cv = GridSearchCV(svm_clf, svc_param_grid, cv=5)

In [39]:
train_pool_CatBoost=Pool(X_train, y_train)
validation_pool_CatBoost = Pool(X_test, y_test)
cat_boost_model_done = fit_model(train_pool_CatBoost, validation_pool_CatBoost)

Learning rate set to 0.032066
0:	learn: 0.6461432	test: 0.6486247	best: 0.6486247 (0)	total: 58.7ms	remaining: 58.6s
100:	learn: 0.0494184	test: 0.0836133	best: 0.0836133 (100)	total: 256ms	remaining: 2.28s
200:	learn: 0.0204214	test: 0.0602042	best: 0.0602042 (200)	total: 371ms	remaining: 1.48s
300:	learn: 0.0103646	test: 0.0543581	best: 0.0543581 (300)	total: 485ms	remaining: 1.13s
400:	learn: 0.0064285	test: 0.0533791	best: 0.0533791 (400)	total: 600ms	remaining: 896ms
500:	learn: 0.0045369	test: 0.0535514	best: 0.0530437 (419)	total: 713ms	remaining: 710ms
600:	learn: 0.0035551	test: 0.0539000	best: 0.0530437 (419)	total: 823ms	remaining: 546ms
700:	learn: 0.0029879	test: 0.0546326	best: 0.0530437 (419)	total: 930ms	remaining: 397ms
800:	learn: 0.0026791	test: 0.0550235	best: 0.0530437 (419)	total: 1.03s	remaining: 257ms
900:	learn: 0.0025222	test: 0.0552395	best: 0.0530437 (419)	total: 1.14s	remaining: 125ms
999:	learn: 0.0024100	test: 0.0551605	best: 0.0530437 (419)	total: 1.24s	

In [40]:
col_names = ["Classifier Name", "Accuracy Score", "Precision Score",
             "Recall Score", "F1 Score"]
summary_df = pd.DataFrame(columns=col_names)

est_name = []
est_acc = []
precision_score = []
recall_score = []
f1score = []
est_conf_matrix = []

estimators = [
    ("LogisticRegression", logreg_cv),
    ("RandomForestClassifier ", rnd_clf),
    ("KNeighborsClassifier", knn_clf_cv),
    ("SupportVectorClassifier", svm_clf_cv),
    ("CatBoost",cat_boost_model_done)]

for i in range(0, len(estimators)):
    clf_name = estimators[i][0]
    clf = estimators[i][1]
    if clf_name!="CatBoost":
        clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # print(pd.crosstab(y_test,y_pred,rownames=["Actual"],colnames=["predicted"],margins=True))
    est_name.append(estimators[i][0])
    est_acc.append(accuracy_score(y_test, y_pred))
    scores = precision_recall_fscore_support(y_test, y_pred, average="weighted")
    precision_score.append(scores[0])
    recall_score.append(scores[1])
    f1score.append(scores[2])
    est_conf_matrix.append(confusion_matrix(y_test,y_pred))


summary_df[col_names[0]] = est_name
summary_df[col_names[1]] = est_acc
summary_df[col_names[2]] = precision_score
summary_df[col_names[3]] = recall_score
summary_df[col_names[4]] = f1score


In [41]:
color_scales = ["agsunset","teal","purp","viridis","fall"]
for i in range(0,len(est_conf_matrix)):
    heatmap = px.imshow(est_conf_matrix[i],aspect="auto",
                        text_auto=True,
                        color_continuous_scale=color_scales[i])
    heatmap.update_layout(title = est_name[i])
    heatmap.update_xaxes(title="Predicted")
    heatmap.update_yaxes(title="Actual")
    heatmap.show()

In [42]:
acc_comparison = px.bar(summary_df, x="Classifier Name",
                        y=col_names[1:len(col_names)],
                        color_discrete_sequence=["deeppink",
                                                 "deepskyblue",
                                                 "darkviolet",
                                                 "black"],
                        barmode="group")
acc_comparison.show()

# Application Spark
Spark tend to perform faster when reading from csv compared to using pandas dataframe thus, the X and y df has been made into X and y csv file

In [43]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
conf = SparkConf().setAppName("MyApp")


spark = SparkSession.builder.config(conf=conf).getOrCreate()


23/06/28 09:13:53 WARN Utils: Your hostname, Thuans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.14 instead (on interface en0)
23/06/28 09:13:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/28 09:13:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [44]:
df_shuffle=X

In [45]:
df_shuffle[target]=y

In [46]:
df_shuffle=df_shuffle.sample(frac=1)

In [47]:
df_shuffle

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,total_std,total_tests,Biopsy
1008,29,2.503048,15.872714,3.375762,0.0,0.0,0.0,1.000000,3.624238,0.0,...,0,0,0,0,0,0,0,0.000000,2,1
178,30,1.000000,16.000000,2.000000,0.0,0.0,0.0,1.000000,9.000000,0.0,...,0,0,0,0,0,0,0,0.000000,0,0
795,35,4.000000,18.000000,2.000000,0.0,0.0,0.0,1.000000,2.000000,0.0,...,0,0,0,0,0,0,0,0.000000,0,0
1351,20,3.000000,17.000000,1.215474,0.0,0.0,0.0,0.607737,0.151934,0.0,...,0,0,0,0,0,0,0,0.000000,1,1
756,19,2.000000,16.000000,1.000000,0.0,0.0,0.0,1.000000,0.500000,0.0,...,0,0,0,0,0,0,0,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,27,1.000000,18.000000,3.000000,0.0,0.0,0.0,1.000000,0.500000,0.0,...,0,0,0,0,1,1,0,0.000000,3,1
1498,20,1.000000,17.814474,1.185526,0.0,0.0,0.0,1.000000,11.959219,0.0,...,0,0,0,0,1,1,0,0.000000,3,1
347,20,1.000000,18.000000,2.000000,0.0,0.0,0.0,1.000000,0.500000,0.0,...,0,0,0,0,0,0,0,0.000000,0,0
853,35,1.896788,17.103212,2.793577,0.0,0.0,0.0,1.000000,0.922591,0.0,...,0,0,0,0,0,1,0,0.206423,3,1


In [48]:
df_spark=spark.createDataFrame(df_shuffle)

In [49]:
from pyspark.sql.functions import rand 
from pyspark.ml.classification import LogisticRegression as LogisticRegression_Spark
from pyspark.ml.classification import DecisionTreeClassifier as DecisionTreeClassifier_Spark
from pyspark.ml.classification import RandomForestClassifier as RandomForestClassifier_Spark
from pyspark.ml.classification import GBTClassifier as GBTClassifier_Spark
from pyspark.ml.classification import LinearSVC as LinearSVC_Spark
from pyspark.ml.classification import NaiveBayes as NaiveBayes_Spark
from pyspark.ml.classification import FMClassifier as FMClassifier_Spark
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [50]:
model_list=[]
model_list.append(("LogRegression",LogisticRegression_Spark(featuresCol="features", labelCol=target)))
model_list.append(("DecTree",DecisionTreeClassifier_Spark(featuresCol="features", labelCol=target)))
model_list.append(("RandomForrest",RandomForestClassifier_Spark(featuresCol="features", labelCol=target)))
model_list.append(("GBTClassifier",GBTClassifier_Spark(featuresCol="features", labelCol=target)))
model_list.append(("LinearSVC",LinearSVC_Spark(featuresCol="features", labelCol=target)))
model_list.append(("NaiveBayes",NaiveBayes_Spark(featuresCol="features", labelCol=target)))
model_list.append(("FMClassifier",FMClassifier_Spark(featuresCol="features", labelCol=target)))


In [51]:
for x in range(len(df_spark.columns[:-1])):
    print(df_spark.columns[x])
Assembler=VectorAssembler(inputCols=df_spark.columns[:-1],
                           outputCol='features')
df_spark=Assembler.transform(df_spark)


Age
Number of sexual partners
First sexual intercourse
Num of pregnancies
Smokes
Smokes (years)
Smokes (packs/year)
Hormonal Contraceptives
Hormonal Contraceptives (years)
IUD
IUD (years)
STDs
STDs (number)
STDs:condylomatosis
STDs:cervical condylomatosis
STDs:vaginal condylomatosis
STDs:vulvo-perineal condylomatosis
STDs:syphilis
STDs:pelvic inflammatory disease
STDs:genital herpes
STDs:molluscum contagiosum
STDs:AIDS
STDs:HIV
STDs:Hepatitis B
STDs:HPV
STDs: Number of diagnosis
STDs: Time since first diagnosis
STDs: Time since last diagnosis
Dx:Cancer
Dx:CIN
Dx:HPV
Dx
Hinselmann
Schiller
Citology
total_std
total_tests


In [52]:
df_train_test_spark=df_spark.select('features',target)
df_train_test_spark.show()


23/06/28 09:13:56 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+------+
|            features|Biopsy|
+--------------------+------+
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,26,2...|     0|
|(37,[0,1,2,3,4,5,...|     0|
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,4,5,...|     1|
|(37,[0,1,2,3,26,2...|     1|
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     1|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     0|
|(37,[0,1,2,3,7,8,...|     0|
+--------------------+------+
only showing top 20 rows



                                                                                

In [53]:
import timeit

performance_metrics=['accuracy','precisionByLabel','recallByLabel','f1']
cols_name=['Name','Iteration']

for  p in performance_metrics:
    cols_name.append(p)
cols_name.append('time(s)')
performance_df_spark = pd.DataFrame(columns = cols_name)


In [54]:
folds=10
for itteration in range(folds):
    #print(itteration)
    df_shuffle=df_train_test_spark.orderBy(rand())
    trainDF,testDF=df_train_test_spark.randomSplit([0.75,0.25])
    
    #Handle all normal model
    for model in model_list:
        start_time = timeit.default_timer()
        cur_model=model[1]
        cur_model=cur_model.fit(trainDF)
        pred=cur_model.transform(testDF)
        temp = [model[0],itteration]
        
        for p in performance_metrics:
            evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol=target, metricName=p)
            temp.append(evaluator.evaluate(pred))
            
        end_time=timeit.default_timer()-start_time
        
        temp.append(end_time)
        performance_df_spark.loc[len(performance_df_spark.index)] = temp
        #print(cur_model,end_time)
    


23/06/28 09:13:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/06/28 09:13:58 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
23/06/28 09:14:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [55]:
avg_df=performance_df_spark.groupby('Name',as_index=False)[performance_metrics].mean()
acc_comparison = px.bar(avg_df, x="Name",
                        y=performance_metrics,
                        barmode="group")
acc_comparison.update_layout(
    font=dict(
        size=18,  # Set the font size here    
    )
)
acc_comparison.show()


In [61]:
time=px.bar(performance_df_spark.groupby('Name',as_index=False)['time(s)'].mean(),x="Name",y=["time(s)"],color='Name')
time.update_layout(
    font=dict(
        size=18,  # Set the font size here    
    )
)
time.show()

In [86]:
%%html
<style>
/* Any CSS style can go in here. */
.dataframe th {
    font-size: 20px;
}
.dataframe td {
    font-size: 18px;
}
</style>
from IPython.display import HTML
HTML(df.to_html(index=False))

In [91]:
temp=performance_df_spark.groupby('Name',as_index=False).mean()
temp=temp.drop(['Iteration'],axis=1)
temp

Unnamed: 0,Name,accuracy,precisionByLabel,recallByLabel,f1,time(s)
0,DecTree,0.988297,0.995238,0.980288,0.988303,0.997477
1,FMClassifier,0.984677,0.992093,0.977215,0.984675,2.606471
2,GBTClassifier,0.992204,0.994841,0.988871,0.992212,3.015234
3,LinearSVC,1.0,1.0,1.0,1.0,1.933421
4,LogRegression,1.0,1.0,1.0,1.0,1.522845
5,NaiveBayes,0.883407,0.896042,0.865482,0.883394,0.576943
6,RandomForrest,0.981989,1.0,0.964434,0.981988,1.023844


pandas.core.frame.DataFrame

In [58]:
performance_df_spark


Unnamed: 0,Name,Iteration,accuracy,precisionByLabel,recallByLabel,f1,time(s)
0,LogRegression,0,1.000000,1.000000,1.000000,1.000000,3.548273
1,DecTree,0,0.983957,1.000000,0.968085,0.983954,1.520137
2,RandomForrest,0,0.973262,1.000000,0.946809,0.973247,1.320059
3,GBTClassifier,0,0.989305,1.000000,0.978723,0.989304,3.650941
4,LinearSVC,0,1.000000,1.000000,1.000000,1.000000,2.390824
...,...,...,...,...,...,...,...
65,RandomForrest,9,0.992327,1.000000,0.984293,0.992326,0.938196
66,GBTClassifier,9,0.997442,1.000000,0.994764,0.997442,2.830288
67,LinearSVC,9,1.000000,1.000000,1.000000,1.000000,1.899125
68,NaiveBayes,9,0.867008,0.867725,0.858639,0.866989,0.511537
