In [15]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.metrics import accuracy_score

# load the dataset
def load_dataset(filename):
    """Read a header-less CSV file and split it into inputs and target.

    Every column except the last becomes the input matrix, converted to
    strings; the last column is returned unchanged as the target.

    Returns:
        (X, y): the string-typed input array and the target array.
    """
    table = read_csv(filename, header=None).values
    inputs = table[:, :-1].astype(str)
    target = table[:, -1]
    return inputs, target
# prepare input data
def prepare_inputs(X_train,X_test):
    """Ordinal-encode the input features.

    The encoder is fitted on the training inputs only and then applied to
    both splits, so the test split never influences the learned categories.

    Returns:
        (X_train_enc, X_test_enc): the encoded training and test inputs.
    """
    # NOTE(review): with the encoder's default settings, a category that
    # appears only in X_test is expected to raise during transform - confirm.
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)
# prepare target
def prepare_targets(y_train,y_test):
    """Label-encode the target values.

    The encoder is fitted on the training targets only and then applied to
    both the training and the test targets.

    Returns:
        (y_train_enc, y_test_enc): the encoded training and test targets.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)
# Load the data and hold out 33% of the rows for testing (fixed seed).
X, y = load_dataset('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Encode inputs and targets; both encoders are fitted on the training split.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
print('Train', X_train_enc.shape, y_train_enc.shape)
print('Test', X_test_enc.shape, y_test_enc.shape)

# Baseline: logistic regression trained on all encoded features.
model = LogisticRegression(solver='lbfgs').fit(X_train_enc, y_train_enc)

# Predict the held-out rows and report accuracy as a percentage.
yhat = model.predict(X_test_enc)
accuracy = accuracy_score(y_test_enc, yhat)
print('Accuracy: %.2f' % (accuracy * 100))

Train (191, 9) (191,)
Test (95, 9) (95,)
Accuracy: 75.79


In [17]:
# feature selection
def select_features(X_train, y_train, X_test):
    """Keep the 4 features with the highest chi-squared scores.

    The selector is fitted on the training data only and then used to
    reduce both the training and the test inputs to the same columns.

    Args:
        X_train: encoded training inputs used to score the features.
        y_train: encoded training targets.
        X_test: encoded test inputs, reduced to the selected columns.

    Returns:
        (X_train_fs, X_test_fs): both splits restricted to the selected
        features.
    """
    # The import cell above does not bring in chi2, so import it here;
    # otherwise calling this function before a later cell imports chi2
    # fails with NameError.
    from sklearn.feature_selection import chi2

    fs = SelectKBest(score_func=chi2, k=4)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs

In [19]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2

# load the dataset
def load_dataset(filename):
    """Load a CSV that has no header row and return (X, y).

    X holds all columns but the last, cast to strings; y is the last
    column exactly as it was read.
    """
    raw = read_csv(filename, header=None)
    values = raw.values
    n_inputs = values.shape[1] - 1
    X = values[:, :n_inputs].astype(str)
    y = values[:, n_inputs]
    return X, y
# prepare input data
def prepare_inputs(X_train,X_test):
    """Ordinal-encode both input splits with an encoder fitted on X_train.

    Returns:
        (X_train_enc, X_test_enc): the encoded training and test inputs.
    """
    ordinal = OrdinalEncoder().fit(X_train)
    encoded_train = ordinal.transform(X_train)
    encoded_test = ordinal.transform(X_test)
    return encoded_train, encoded_test
# prepare target
def prepare_targets(y_train,y_test):
    """Label-encode both target splits with an encoder fitted on y_train.

    Returns:
        (y_train_enc, y_test_enc): the encoded training and test targets.
    """
    labels = LabelEncoder().fit(y_train)
    encoded_train = labels.transform(y_train)
    encoded_test = labels.transform(y_test)
    return encoded_train, encoded_test
# feature selection
def select_features(X_train, y_train, X_test):
    """Reduce both splits to the 4 best features by chi-squared score.

    The selector is fitted on (X_train, y_train) only; the same column
    selection is then applied to X_train and X_test.

    Returns:
        (X_train_fs, X_test_fs): both splits restricted to the selected
        features.
    """
    selector = SelectKBest(score_func=chi2, k=4).fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)
# Load the data and hold out 33% of the rows for testing (fixed seed).
X, y = load_dataset('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Encode inputs and targets, then keep only the chi-squared-selected columns.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)

# Train logistic regression on the reduced feature set.
model = LogisticRegression(solver='lbfgs').fit(X_train_fs, y_train_enc)

# Predict the held-out rows and report accuracy as a percentage.
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print('Accuracy: %.2f' % (accuracy * 100))

Accuracy: 74.74


In [21]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2

# load the dataset
def load_dataset(filename):
    """Return (X, y) read from a CSV file without a header row.

    The last column is the target y, returned as read; the remaining
    columns form X and are converted to strings.
    """
    array = read_csv(filename, header=None).values
    features, labels = array[:, :-1], array[:, -1]
    return features.astype(str), labels
# prepare input data
def prepare_inputs(X_train,X_test):
    """Fit an OrdinalEncoder on X_train and encode both input splits.

    Returns:
        (X_train_enc, X_test_enc): the encoded training and test inputs.
    """
    fitted = OrdinalEncoder().fit(X_train)
    train_enc, test_enc = (fitted.transform(part) for part in (X_train, X_test))
    return train_enc, test_enc
# prepare target
def prepare_targets(y_train,y_test):
    """Fit a LabelEncoder on y_train and encode both target splits.

    Returns:
        (y_train_enc, y_test_enc): the encoded training and test targets.
    """
    fitted = LabelEncoder().fit(y_train)
    train_enc, test_enc = (fitted.transform(part) for part in (y_train, y_test))
    return train_enc, test_enc
# feature selection using mutual information gain
def select_features(X_train, y_train, X_test, random_state=1):
    """Keep the 4 features with the highest mutual-information scores.

    The selector is fitted on the training data only and then used to
    reduce both the training and the test inputs to the same columns.

    Args:
        X_train: encoded training inputs used to score the features.
        y_train: encoded training targets.
        X_test: encoded test inputs, reduced to the selected columns.
        random_state: seed forwarded to mutual_info_classif so repeated
            runs score, and therefore select, the same features.

    Returns:
        (X_train_fs, X_test_fs): both splits restricted to the selected
        features.
    """
    # mutual_info_classif is randomized unless it is seeded, so wrap it to
    # pin the seed; SelectKBest only passes (X, y) to its score function.
    def seeded_mutual_info(X, y):
        return mutual_info_classif(X, y, random_state=random_state)

    # NOTE(review): the inputs here are ordinal-encoded categories; passing
    # discrete_features=True may be more appropriate - confirm before changing.
    fs = SelectKBest(score_func=seeded_mutual_info, k=4)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs
# Load the data and hold out 33% of the rows for testing (fixed seed).
X, y = load_dataset('breast-cancer.csv')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Encode inputs and targets, then keep only the columns chosen by
# select_features.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)

# Train logistic regression on the reduced feature set.
model = LogisticRegression(solver='lbfgs').fit(X_train_fs, y_train_enc)

# Predict the held-out rows and report accuracy as a percentage.
yhat = model.predict(X_test_fs)
accuracy = accuracy_score(y_test_enc, yhat)
print('Accuracy: %.2f' % (accuracy * 100))

Accuracy: 77.89
