# Part 1: Preprocess pipeline

In [1]:
# import modules for first part:
import pandas as pd
import pandas_profiling
import numpy as np
from sklearn.impute import SimpleImputer


In [2]:
# remove columns with no dispersion
# impute the values with null values


In [3]:
 
def removing_columns(df):
    '''
    remove columns with 0 variance
    or more than 90% missing values:
    they are 4 columns in the train dataset
    '''
    df=df.drop(['STDs:cervical condylomatosis','STDs:AIDS', 'STDs: Time since first diagnosis','STDs: Time since last diagnosis'],axis=1)
    return df

def fill_question_marks(df):
    df1 = df.replace('?', np.NaN)
    return df1

def convert_to_numeric(df):
    '''
    convert all columns into numeric
    ''' 
    cols = df.columns
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
    return df
        
def fill_num_na_for_train(df):
    '''
    After convert to numeric all the columns will be numeric
    as the objects were actually numeric columns as strings
    But a pipeline should be used so the test data is transformed with the 
    imputer fitted on the train data.
    '''
    
    imputer = SimpleImputer(strategy="median")
    df = imputer.fit_transform(df)
    return df

#def clean_y(y)

def preprocess(df_train, df_test, df_validate):
    
    '''
    Apply the other functions and impute the null values with SimpleImputer.
    After convert to numeric all the columns will be numeric
    as the objects were actually numeric columns as strings
    
    '''
    
    df_train=removing_columns(df_train)
    df_train=fill_question_marks(df_train)
    df_train=convert_to_numeric(df_train)
    
    df_test=removing_columns(df_test)
    df_test=fill_question_marks(df_test)
    df_test=convert_to_numeric(df_test)
    
    df_validate=removing_columns(df_validate)
    df_validate=fill_question_marks(df_validate)
    df_validate=convert_to_numeric(df_validate)
    
    #impute with the median of train
    imputer = SimpleImputer(strategy="median")
    np_train = imputer.fit_transform(df_train)
    np_test = imputer.transform(df_test)
    np_validate = imputer.transform(df_validate)
    
    # convert np arrays to DataFrame again to manipulate them
    df_train = pd.DataFrame(np_train, columns=df_train.columns)
    df_test = pd.DataFrame(np_test, columns=df_train.columns)
    df_validate = pd.DataFrame(np_validate, columns=df_train.columns)
    
    return df_train, df_test, df_validate



In [4]:
#load the data
df1=pd.read_csv("../Data/kag_risk_factors_cervical_cancer.csv")

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           858 non-null    object
 2   First sexual intercourse            858 non-null    object
 3   Num of pregnancies                  858 non-null    object
 4   Smokes                              858 non-null    object
 5   Smokes (years)                      858 non-null    object
 6   Smokes (packs/year)                 858 non-null    object
 7   Hormonal Contraceptives             858 non-null    object
 8   Hormonal Contraceptives (years)     858 non-null    object
 9   IUD                                 858 non-null    object
 10  IUD (years)                         858 non-null    object
 11  STDs                                858 non-null    object

In [6]:
df1.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [7]:
#divive the data in X and Y
X = df1.drop(columns="Biopsy")
y = df1["Biopsy"]

#spliting the validation data
from sklearn.model_selection import train_test_split
X_t, X_validate, y_t, y_validate = train_test_split(X, y, test_size=0.1, random_state=0)

# splitting the test and train data
X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.15, random_state=0)

In [8]:
#save a profile of the X_train data. Previous exploration has already been performed
#get a description of the features:
profile = pandas_profiling.ProfileReport(X_train, title="Cervix cancer risk Data profiling")
profile.to_file("../Data/Cervix_cancer_data_profiling_report.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=49.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [9]:
#Apply the processing function to all the X files:
#retunrs numpy arrays

X_train_ready, X_test_ready, X_validate_ready=preprocess(X_train, X_test, X_validate)

In [10]:
# Look at the cleaned dataframe first rows:
X_train_ready.head(2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology
0,14.0,2.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,29.0,5.0,17.0,6.0,1.0,1.266973,1.3,1.0,0.5,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
X_test_ready.head(2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology
0,15.0,4.0,14.0,1.0,1.0,1.0,0.1,1.0,0.08,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40.0,2.0,17.0,3.0,0.0,0.0,0.0,1.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
X_validate_ready.head(2)

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology
0,30.0,2.0,17.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,43.0,4.0,16.0,3.0,1.0,28.0,7.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#save them in a csv file:

X_train_ready.to_csv("../Data/X_train_preprocessed.csv")
X_test_ready.to_csv("../Data/X_test_preprocessed.csv")
X_validate_ready.to_csv("../Data/X_validate_preprocessed.csv")
y_train.to_csv("../Data/y_train_preprocessed.csv")
y_test.to_csv("../Data/y_test_preprocessed.csv")
y_validate.to_csv("../Data/y_validate_preprocessed.csv")

## Part 1.2 : Testing with Linear Regression

In [23]:
print(f"Dataset was not very large. The train dataset has {len(y_train)} rows, the test {len(y_test)}, and the validation {len(y_validate)}")


Dataset was not very large. The train dataset has 656 rows, the test 116, and the validation 86


In [24]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt

In [25]:
reg = LinearRegression().fit(X_train_ready, y_train)
reg.score(X_train_ready, y_train)

0.6600773758671179

In [26]:
y_pred=reg.predict(X_test_ready)

In [27]:
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(y_test, y_pred))


Coefficient of determination: 0.46
