In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("cell_samples.csv")

In [3]:
df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           699 non-null    int64 
 1   Clump        699 non-null    int64 
 2   UnifSize     699 non-null    int64 
 3   UnifShape    699 non-null    int64 
 4   MargAdh      699 non-null    int64 
 5   SingEpiSize  699 non-null    int64 
 6   BareNuc      699 non-null    object
 7   BlandChrom   699 non-null    int64 
 8   NormNucl     699 non-null    int64 
 9   Mit          699 non-null    int64 
 10  Class        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [5]:
df['BareNuc'] = (pd.to_numeric(df['BareNuc'], errors='coerce')).notnull()  #Convert object to numeric

In [6]:
df['BareNuc'] = df['BareNuc'].astype(int) #Convert numeric to int

In [7]:
#Applying Variance

In [8]:
from sklearn.feature_selection import VarianceThreshold

In [9]:
vart = VarianceThreshold(threshold=0.5)

In [10]:
op = vart.fit(df)

In [11]:
op.get_support()

array([ True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True])

In [12]:
c_names = df.columns[op.get_support()==False]
c_names

Index(['BareNuc'], dtype='object')

In [13]:
df.drop(c_names, axis=1, inplace=True)

In [14]:
cdf = df['Class']

In [15]:
df.drop('Class', axis=1, inplace=True)

In [16]:
#Applying MIC

In [17]:
from sklearn.feature_selection import mutual_info_classif

In [18]:
MIC = mutual_info_classif(df, cdf)

In [19]:
MIC

array([0.05306316, 0.32884256, 0.46816718, 0.45692754, 0.31217963,
       0.35669666, 0.39134695, 0.3355688 , 0.14600154])

In [20]:
mic_series = pd.Series(MIC)
mic_series

0    0.053063
1    0.328843
2    0.468167
3    0.456928
4    0.312180
5    0.356697
6    0.391347
7    0.335569
8    0.146002
dtype: float64

In [21]:
cl_names = df.columns
cl_names

Index(['ID', 'Clump', 'UnifSize', 'UnifShape', 'MargAdh', 'SingEpiSize',
       'BlandChrom', 'NormNucl', 'Mit'],
      dtype='object')

In [22]:
mic_series.index = cl_names

In [23]:
mic_series

ID             0.053063
Clump          0.328843
UnifSize       0.468167
UnifShape      0.456928
MargAdh        0.312180
SingEpiSize    0.356697
BlandChrom     0.391347
NormNucl       0.335569
Mit            0.146002
dtype: float64

In [24]:
mic_series.sort_values(ascending=False)

UnifSize       0.468167
UnifShape      0.456928
BlandChrom     0.391347
SingEpiSize    0.356697
NormNucl       0.335569
Clump          0.328843
MargAdh        0.312180
Mit            0.146002
ID             0.053063
dtype: float64

In [25]:
removable_cl = []
for i in mic_series.index:
    if mic_series[i] < 0.2:
        print(i, mic_series[i])
        removable_cl.append(i)

ID 0.05306316222197416
Mit 0.14600153783611658


In [26]:
df.drop(removable_cl, axis=1, inplace=True)

In [27]:
# Converting into array

In [28]:
x = np.asarray(df)

In [29]:
x[0:5]

array([[5, 1, 1, 1, 2, 3, 1],
       [5, 4, 4, 5, 7, 3, 2],
       [3, 1, 1, 1, 2, 3, 1],
       [6, 8, 8, 1, 3, 3, 7],
       [4, 1, 1, 3, 2, 3, 1]], dtype=int64)

In [30]:
y = np.asarray(cdf)

In [31]:
y[0:5]

array([2, 2, 2, 2, 2], dtype=int64)

In [32]:
#Applying SD

In [33]:
from sklearn import preprocessing 

In [34]:
x = preprocessing.StandardScaler().fit_transform(x)

In [35]:
# Creating Model

In [36]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.2, random_state= 156) 

In [37]:
print('Train status:', x_train.shape, y_train.shape)
print('Test status:', x_test.shape, y_test.shape)

Train status: (559, 7) (559,)
Test status: (140, 7) (140,)


In [38]:
from sklearn.svm import SVC 

In [39]:
classifier = SVC(kernel='poly')
classifier.fit(x_train, y_train)

SVC(kernel='poly')

In [40]:
y_pred = classifier.predict(x_test)

In [41]:
from sklearn.metrics import accuracy_score as acc

In [42]:
print("Accuracy:", acc(y_test, y_pred)*100)

Accuracy: 95.71428571428572


In [None]:
import pickle
pickle.dump(classifier, open('model.pkl','wb'))