In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Request the 'loveall/cervical-cancer-risk-classification' dataset through kagglehub and
# keep whatever dataset_download returns (presumably the local directory that holds the
# downloaded files -- confirm against the kagglehub documentation).
loveall_cervical_cancer_risk_classification_path = kagglehub.dataset_download('loveall/cervical-cancer-risk-classification')

# Simple progress marker so the reader can see the cell finished.
print('Data source import complete.')


In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler


In [None]:
# Locate the dataset CSV. The relative ../input path is the one this notebook was written
# against (the Kaggle layout); when it is not present, fall back to the directory returned
# by kagglehub.dataset_download in the first cell so the notebook also runs elsewhere.
CSV_NAME = 'kag_risk_factors_cervical_cancer.csv'
csv_path = Path('../input/cervical-cancer-risk-classification') / CSV_NAME
if not csv_path.exists():
    csv_path = Path(loveall_cervical_cancer_risk_classification_path) / CSV_NAME

data = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows of the freshly loaded frame (5 is head()'s default).
data.head()

In [None]:
# Swap every literal '?' cell for NaN so pandas treats those entries as missing.
data = data.replace(to_replace='?', value=np.nan)

In [None]:
# Remove the two "time since diagnosis" STD columns from the frame.
# NOTE(review): presumably dropped because they are mostly missing -- confirm.
columns_to_drop = [
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
]
data = data.drop(columns=columns_to_drop)


In [None]:
# Preview the frame again after the two columns were removed.
data.head(n=5)

In [None]:
# Summary statistics of the frame, using describe()'s default settings.
data.describe()

In [None]:
# Number of missing entries in each column, before any imputation.
data.isnull().sum()

In [None]:
# Convert every column with pd.to_numeric, then fill each column's missing entries
# with that column's mean.
# NOTE(review): the means are computed over all rows, i.e. before the train/test split
# made later in the notebook -- confirm that this is acceptable for the evaluation.
numeric_data = data.apply(pd.to_numeric)
data = numeric_data.fillna(numeric_data.mean())

In [None]:
# Re-count missing entries per column after the mean fill.
data.isnull().sum()

# **Correlation between Data**

When the correlation coefficient approaches 1, the two variables have a strong positive linear relationship: they tend to increase or decrease together in proportion. Conversely, when it approaches -1, there is a strong negative linear relationship: as one variable increases, the other tends to decrease in proportion. Values near 0 indicate little or no linear relationship between the two variables.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all columns, drawn as an annotated heatmap on a
# large canvas so the per-cell numbers stay legible.
correlation = data.corr()
fig, ax = plt.subplots(figsize=(22, 22))
sns.heatmap(correlation, annot=True, ax=ax)
plt.show()

In [None]:
# One histogram per column, laid out on a single 18x18 figure.
histogram = data.hist(color='red', figsize=(18, 18))
plt.show()

# **Setting Data**

In [None]:
# Features are every column except the last; the last column is used as the target.
# NOTE(review): confirm that none of the remaining feature columns is itself a
# diagnostic outcome that would leak the target.
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

# **Selecting best features**

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2 , f_classif

# Keep the top 30% of features as ranked by their chi-squared score against Y.
# Gotcha: chi2 only accepts non-negative feature values.
FeatureSelection = SelectPercentile(percentile=30, score_func=chi2)
FeatureSelection.fit(X, Y)
X = FeatureSelection.transform(X)

# **Polynomial Features - Scaling / Preprocessing**

In [None]:
# Expand the selected features with all degree-2 terms (squares and pairwise products).
# include_bias=False: the constant bias column has zero variance, so StandardScaler
# below would turn it into an all-zero feature that carries no information.
# NOTE: this cell overwrites X, so re-running it without re-running the cells above
# expands the already-expanded matrix again.
poly = PolynomialFeatures(degree=2, include_bias=False)
X = poly.fit_transform(X)

# Standardize every expanded feature (StandardScaler defaults).
std = StandardScaler()
X = std.fit_transform(X)

In [None]:
# Dimensions of the feature matrix after selection, polynomial expansion and scaling.
X.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# **Splitting Data**

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# **Logistic Regression Model**

In [None]:
# L2-penalised logistic regression fitted with the 'sag' solver and a fixed seed.
LogisticRegressionModel = LogisticRegression(
    C=1.0,
    penalty='l2',
    solver='sag',
    random_state=33,
)
LogisticRegressionModel.fit(X_train, Y_train)


In [None]:
# Mean accuracy of the fitted model on the training rows and on the held-out rows.
logist_regress_train_score = LogisticRegressionModel.score(X_train, Y_train)
logist_regress_test_score = LogisticRegressionModel.score(X_test, Y_test)

# Report both as percentages rounded to two decimals.
lr_train_pct = round(logist_regress_train_score * 100, 2)
lr_test_pct = round(logist_regress_test_score * 100, 2)
print(f'Using Logistic Regression, the score of training data is {lr_train_pct} % and the score of test data is {lr_test_pct} %')

In [None]:
import matplotlib.pyplot as plt

# Bar heights are the two accuracies expressed as percentages.
labels = ['Training Score', 'Test Score']
scores = [logist_regress_train_score * 100, logist_regress_test_score * 100]

# Draw the comparison on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(labels, scores, color=['blue', 'green'])
ax.set_ylabel('Score (%)')
ax.set_title('Logistic Regression Training and Test Scores')
ax.set_ylim(0, 100)  # percentages live on a fixed 0-100 scale
plt.show()


In [None]:
# Predict on the held-out rows and tabulate true classes (rows) against
# predicted classes (columns).
y_pred_logisReg = LogisticRegressionModel.predict(X_test)

conf = confusion_matrix(Y_test, y_pred_logisReg)

# fmt='d' prints the integer counts as-is; the default annotation format ('.2g')
# renders counts of 100 or more in scientific notation (e.g. 2.4e+02).
fig, ax = plt.subplots()
sns.heatmap(conf, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Logistic Regression Confusion Matrix')
plt.show()

# **Decision Tree Classifier**

In [None]:
# Shallow (depth-2) Gini decision tree with a fixed seed.
DecisionTreeClassifierModel = DecisionTreeClassifier(
    max_depth=2,
    criterion='gini',
    random_state=42,
)
DecisionTreeClassifierModel.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the fitted tree on the training rows and on the held-out rows.
Tree_train_score = DecisionTreeClassifierModel.score(X_train, Y_train)
Tree_test_score = DecisionTreeClassifierModel.score(X_test, Y_test)

# Report both as percentages rounded to two decimals.
tree_train_pct = round(Tree_train_score * 100, 2)
tree_test_pct = round(Tree_test_score * 100, 2)
print(f'Using Decision Tree, the score of training data is {tree_train_pct} % and the score of test data is {tree_test_pct} %')

In [None]:
# Bar heights are the two tree accuracies expressed as percentages.
labels = ['Training Score', 'Test Score']
scores = [Tree_train_score * 100, Tree_test_score * 100]

# Draw the comparison on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(labels, scores, color=['blue', 'green'])
ax.set_ylabel('Score (%)')
ax.set_title('Decision Tree Training and Test Scores')
ax.set_ylim(0, 100)  # percentages live on a fixed 0-100 scale
plt.show()

In [None]:
# Predict on the held-out rows with the decision tree. The predictions get their own
# name so they no longer overwrite the logistic-regression predictions held in
# y_pred_logisReg.
y_pred_tree = DecisionTreeClassifierModel.predict(X_test)

# Rows are the true classes, columns the predicted classes.
conf = confusion_matrix(Y_test, y_pred_tree)

# fmt='d' prints the integer counts as-is; the default annotation format ('.2g')
# renders counts of 100 or more in scientific notation (e.g. 2.4e+02).
fig, ax = plt.subplots()
sns.heatmap(conf, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Decision Tree Confusion Matrix')
plt.show()

# **SVC Model**

In [None]:
# Polynomial-kernel support vector classifier.
# NOTE(review): max_iter=100 is a hard cap on solver iterations, so the fit may stop
# before converging -- confirm the cap is intentional.
SVCModel = SVC(
    C=0.95,
    kernel='poly',
    gamma='auto',
    max_iter=100,
)
SVCModel.fit(X_train, Y_train)

In [None]:
# Mean accuracy of the fitted SVC on the training rows and on the held-out rows.
SVC_train_score = SVCModel.score(X_train, Y_train)
SVC_test_score = SVCModel.score(X_test, Y_test)

# Report both as percentages rounded to two decimals.
svc_train_pct = round(SVC_train_score * 100, 2)
svc_test_pct = round(SVC_test_score * 100, 2)
print(f'Using SVC, the score of training data is {svc_train_pct} % and the score of test data is {svc_test_pct} %')

In [None]:
# Bar heights are the two SVC accuracies expressed as percentages.
labels = ['Training Score', 'Test Score']
scores = [SVC_train_score * 100, SVC_test_score * 100]

# Draw the comparison on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(labels, scores, color=['blue', 'green'])
ax.set_ylabel('Score (%)')
ax.set_title('SVC Training and Test Scores')
ax.set_ylim(0, 100)
plt.show()

In [None]:
# Predict on the held-out rows and tabulate true classes (rows) against
# predicted classes (columns).
y_pred_SVC = SVCModel.predict(X_test)

conf = confusion_matrix(Y_test, y_pred_SVC)

# fmt='d' prints the integer counts as-is; the default annotation format ('.2g')
# renders counts of 100 or more in scientific notation (e.g. 2.4e+02).
fig, ax = plt.subplots()
sns.heatmap(conf, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('SVC Confusion Matrix')
plt.show()