# Preprocessing of the dataset

In [1]:
import plotly.io as pio
# Make the combined "notebook+pdf" renderer the default for every Plotly figure
# shown below (presumably so figures appear both inline and in PDF exports).
pio.renderers.default = "notebook+pdf"

In [2]:
import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
from typing import List
from sklearn.preprocessing import RobustScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler

from plotly.offline import plot, iplot, init_notebook_mode
# Initialise Plotly's offline notebook mode in "connected" mode.
init_notebook_mode(connected=True)

import warnings

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and convergence warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')

In [3]:
# Load the cervical-cancer risk-factor dataset from the local Data folder
# and preview the first rows.
df = pd.read_csv(
    "./Data/kag_risk_factors_cervical_cancer.csv"
)
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [4]:
# Name of the label column; training_loop reads this global to separate
# the features from the value being predicted.
target = 'Biopsy'

In [5]:
# Discard exact duplicate rows (keeping the first occurrence), then
# summarise the remaining columns and their dtypes.
df = df.drop_duplicates(keep='first')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 835 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 835 non-null    int64 
 1   Number of sexual partners           835 non-null    object
 2   First sexual intercourse            835 non-null    object
 3   Num of pregnancies                  835 non-null    object
 4   Smokes                              835 non-null    object
 5   Smokes (years)                      835 non-null    object
 6   Smokes (packs/year)                 835 non-null    object
 7   Hormonal Contraceptives             835 non-null    object
 8   Hormonal Contraceptives (years)     835 non-null    object
 9   IUD                                 835 non-null    object
 10  IUD (years)                         835 non-null    object
 11  STDs                                835 non-null    object

In [6]:
# Turn the '?' placeholder into a real NaN so pandas can count and impute
# the missing values.
df = df.replace(to_replace='?', value=np.nan)

print("Check all Nan counts")
df.isna().sum()

Check all Nan counts


Age                                     0
Number of sexual partners              25
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               103
Hormonal Contraceptives (years)       103
IUD                                   112
IUD (years)                           112
STDs                                  100
STDs (number)                         100
STDs:condylomatosis                   100
STDs:cervical condylomatosis          100
STDs:vaginal condylomatosis           100
STDs:vulvo-perineal condylomatosis    100
STDs:syphilis                         100
STDs:pelvic inflammatory disease      100
STDs:genital herpes                   100
STDs:molluscum contagiosum            100
STDs:AIDS                             100
STDs:HIV                              100
STDs:Hepatitis B                  

In [7]:
# Cast every column to a numeric dtype with the public pandas API.
# (`DataFrame._convert` is a private, underscore-prefixed method and is not part
# of the supported pandas interface.)
# errors="coerce" turns any leftover non-numeric token into NaN so that it is
# imputed by the median fill below instead of keeping the column as `object`.
df = df.apply(pd.to_numeric, errors="coerce")

# Impute the remaining missing values with each column's median.
df = df.fillna(df.median())

print("Check all Nan counts")
df.isnull().sum()

Check all Nan counts


Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0


# Create a Spark session

In [8]:
import findspark

# Keep the findspark calls ahead of the pyspark imports below.
findspark.init()
findspark.find()

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Application name and executor core count for this notebook's session.
conf = SparkConf()
conf.setAppName("MyApp")
conf.set("spark.executor.cores", "2")

# Reuse an already running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()


24/01/17 14:11:43 WARN Utils: Your hostname, Thuans-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.68.51 instead (on interface en0)
24/01/17 14:11:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/17 14:11:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Define the function used to train and evaluate the Spark models

In [9]:
from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression as LogisticRegression_Spark
from pyspark.ml.classification import DecisionTreeClassifier as DecisionTreeClassifier_Spark
from pyspark.ml.classification import RandomForestClassifier as RandomForestClassifier_Spark
from pyspark.ml.classification import GBTClassifier as GBTClassifier_Spark
from pyspark.ml.classification import LinearSVC as LinearSVC_Spark
from pyspark.ml.classification import NaiveBayes as NaiveBayes_Spark
from pyspark.ml.classification import FMClassifier as FMClassifier_Spark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import timeit

def training_loop(df, performance_metrics, random_state=None):
    """Train and evaluate a fixed set of Spark ML classifiers on one random split.

    The pandas frame is split into stratified train/test parts, the training
    part is oversampled with ADASYN, both parts are converted to Spark
    DataFrames with an assembled ``features`` vector column, and every model
    is fitted on the training part and scored on the test part.

    Relies on the notebook-level globals ``target`` (label column name) and
    ``spark`` (active SparkSession).

    Args:
        df: pandas DataFrame holding the feature columns and the ``target``
            column.
        performance_metrics: iterable of metric names passed one by one as
            ``metricName`` to ``MulticlassClassificationEvaluator``.
        random_state: optional seed forwarded to ``train_test_split`` and
            ``ADASYN`` so a run can be reproduced. ``None`` (the default)
            keeps the original unseeded behaviour. The Spark estimators are
            left with their own seed settings.

    Returns:
        pandas DataFrame with one row per model and the columns
        ``['Name', *performance_metrics, 'time(s)']``. ``time(s)`` covers
        fitting, predicting and evaluating all requested metrics.
    """
    # Split data into training and testing sets, stratified on the target.
    X = df.drop([target], axis=1)
    y = df[target].copy()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, stratify=y, shuffle=True, random_state=random_state
    )

    # Apply ADASYN only on the training data; the test set is left untouched.
    adasyn = ADASYN(random_state=random_state)
    X_train, y_train = adasyn.fit_resample(X_train, y_train)

    # Work on copies so the frames returned by the split / resampler are not
    # modified in place when the label column is attached.
    df_train = X_train.copy()
    df_train[target] = y_train
    df_test = X_test.copy()
    df_test[target] = y_test

    # Spark ML estimators consume a single vector column, so assemble all
    # feature columns into 'features'. One assembler serves both splits
    # because they share the same feature columns.
    feature_cols = [col for col in df_train.columns if col != target]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    trainDF = assembler.transform(spark.createDataFrame(df_train)).select('features', target)
    testDF = assembler.transform(spark.createDataFrame(df_test)).select('features', target)

    # (display name, unfitted estimator) pairs to train.
    model_list = [
        ("LogRegression", LogisticRegression_Spark(featuresCol="features", labelCol=target)),
        ("DecTree", DecisionTreeClassifier_Spark(featuresCol="features", labelCol=target)),
        ("RandomForrest", RandomForestClassifier_Spark(featuresCol="features", labelCol=target)),
        ("GBTClassifier", GBTClassifier_Spark(featuresCol="features", labelCol=target)),
        ("LinearSVC", LinearSVC_Spark(featuresCol="features", labelCol=target)),
        ("NaiveBayes", NaiveBayes_Spark(featuresCol="features", labelCol=target)),
        ("FMClassifier", FMClassifier_Spark(featuresCol="features", labelCol=target)),
    ]

    # Column layout of the returned frame.
    cols_name = ['Name', *performance_metrics, 'time(s)']

    # Collect one record per model and build the frame once at the end
    # instead of enlarging a DataFrame row by row.
    rows = []
    for name, estimator in model_list:
        start_time = timeit.default_timer()
        fitted_model = estimator.fit(trainDF)
        pred = fitted_model.transform(testDF)

        row = [name]
        for p in performance_metrics:
            # NOTE(review): for 'precisionByLabel' / 'recallByLabel' the label
            # being scored is the evaluator's `metricLabel`, which is not set
            # here - confirm the default is the class of interest.
            evaluator = MulticlassClassificationEvaluator(
                predictionCol='prediction', labelCol=target, metricName=p
            )
            row.append(evaluator.evaluate(pred))

        # Elapsed time includes fit, transform and all metric evaluations.
        end_time = timeit.default_timer() - start_time
        row.append(end_time)
        rows.append(row)
        print(fitted_model, end_time)

    return pd.DataFrame(rows, columns=cols_name)






In [10]:
# Metrics requested from Spark's MulticlassClassificationEvaluator for each model.
performance_metrics = ['accuracy', 'precisionByLabel', 'recallByLabel', 'f1']

# Column layout of the results table: model name, one column per metric, runtime.
cols_name = ['Name', *performance_metrics, 'time(s)']

# Number of independent split / resample / train repetitions to average over.
N_RUNS = 10

# Collect every run first and concatenate once; concatenating inside the loop
# re-copies all earlier rows on each iteration and starts from an empty frame.
run_results = [training_loop(df, performance_metrics) for _ in range(N_RUNS)]
performance_df_spark = pd.concat(run_results, ignore_index=True)


24/01/17 14:11:49 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


LogisticRegressionModel: uid=LogisticRegression_913db188309a, numClasses=2, numFeatures=35 8.054786417007563


24/01/17 14:11:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d6662d16e662, depth=5, numNodes=23, numClasses=2, numFeatures=35 1.5696001250034897
RandomForestClassificationModel: uid=RandomForestClassifier_bebb7aff80ad, numTrees=20, numClasses=2, numFeatures=35 1.4433906669873977
GBTClassificationModel: uid = GBTClassifier_2da6a4381263, numTrees=20, numClasses=2, numFeatures=35 4.785473458003253
LinearSVCModel: uid=LinearSVC_33d1da06abc7, numClasses=2, numFeatures=35 6.950561041012406
NaiveBayesModel: uid=NaiveBayes_7c66e2088774, modelType=multinomial, numClasses=2, numFeatures=35 0.7739829159982037
FMClassificationModel: uid=FMClassifier_573fbb175db5, numClasses=2, numFeatures=35, factorSize=8, fitLinear=true, fitIntercept=true 3.7033785000094213
LogisticRegressionModel: uid=LogisticRegression_b24b0643a7e2, numClasses=2, numFeatures=35 2.979323583000223
DecisionTreeClassificationModel: uid=DecisionTreeClassifier_056a91b2c8be, depth=5, numNodes=33, numClasses=2, numFeatures=35 0.98211316

In [11]:
# Average each metric per model over all runs and compare the models
# side by side in a grouped bar chart.
avg_df = (
    performance_df_spark
    .groupby('Name', as_index=False)[performance_metrics]
    .mean()
)
acc_comparison = px.bar(avg_df, x="Name", y=performance_metrics, barmode="group")
acc_comparison.show()

In [12]:
# Mean runtime per model across all runs, one coloured bar per model.
avg_time_df = performance_df_spark.groupby('Name', as_index=False)['time(s)'].mean()
px.bar(avg_time_df, x="Name", y=["time(s)"], color='Name')


In [13]:
# Per-model mean of every metric and of the runtime over all collected runs.
performance_df_spark.groupby('Name',as_index=False).mean()


Unnamed: 0,Name,accuracy,precisionByLabel,recallByLabel,f1,time(s)
0,DecTree,0.930072,0.978034,0.946512,0.936385,0.939973
1,FMClassifier,0.816304,0.965494,0.832171,0.844949,2.657505
2,GBTClassifier,0.940942,0.97232,0.964341,0.942284,3.162757
3,LinearSVC,0.953261,0.985402,0.964341,0.956033,6.235123
4,LogRegression,0.928261,0.974563,0.948062,0.93365,3.3033
5,NaiveBayes,0.817029,0.974094,0.826357,0.856961,0.557602
6,RandomForrest,0.957246,0.983949,0.970155,0.958978,1.00113
