In [None]:
# Feature Selection with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Load the Wisconsin breast-cancer table straight from the UCI archive.
# Column labels are passed via `names=`, so pandas reads every line of the file as data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
names = [
    "sample_code_number",
    "clump_thickness",
    "uniformity_of_cell_size",
    "uniformity_of_cell_shape",
    "marginal_adhesion",
    "single_epithelial_cell_size",
    "bare_nuclei",
    "bland_chromatin",
    "normal_nucleoli",
    "mitosis",
    "classes",
]
dataframe = pandas.read_csv(url, names=names)
# NumPy array of the whole table as loaded (taken before any clean-up cell runs).
array = dataframe.values

In [None]:
# "?" is the placeholder for a missing bare_nuclei measurement. Replace just that
# cell with 0. Indexing with the boolean mask alone (`dataframe[mask] = 0`) would
# assign 0 to *every* column of the matching rows, wiping their features and
# turning their label into a spurious class 0.
# NOTE(review): 0 lies outside the feature's observed range; consider imputing a
# real value (e.g. the column median) if bare_nuclei is ever used as a feature.
dataframe.loc[dataframe.bare_nuclei == "?", "bare_nuclei"] = 0

In [None]:
# Display (n_rows, n_columns) of the loaded table.
dataframe.shape

In [None]:
# Number of leading rows used for training: 80% of the table, truncated to an int.
# The split is positional (file order), not shuffled.
train_len = int(len(dataframe) * 0.8)

In [None]:
# Training split = the first `train_len` rows.
# Features: columns 2..5 (uniformity_of_cell_size, uniformity_of_cell_shape,
# marginal_adhesion, single_epithelial_cell_size).
X = dataframe.values[:train_len][:, 2:6]

# Target: the last column ("classes"), cast to int.
Y = dataframe.values[:train_len][:, -1].astype(int)

In [None]:
# Held-out split = every row from `train_len` onward, using the same feature
# columns (2..5) and the same target column (last) as the training split.
test_X = dataframe.values[train_len:][:, 2:6]
test_Y = dataframe.values[train_len:][:, -1].astype(int)

In [None]:
# Display (n_training_rows, n_features) of the training feature matrix.
X.shape

In [None]:
# Univariate feature scoring with the chi-squared statistic.
# k='all' keeps every column, so this step scores the features rather than
# discarding any of them.
test = SelectKBest(chi2, k='all')
fit = test.fit(X, Y)

In [None]:
# summarize scores

# Print floats with three digits of precision. This is a global NumPy print
# setting, so it also applies to every array printed by later cells.
numpy.set_printoptions(precision=3)

# One score per column of X from the fitted selector (chi-squared scores when
# `fit` comes from the SelectKBest cell).
print(fit.scores_)

In [None]:
# Apply the fitted selector to the training features. When `fit` is the
# SelectKBest(k='all') selector, no column is dropped.
features = fit.transform(X)

# Show the resulting feature matrix (NumPy abbreviates large arrays when printing).
print(features)

In [None]:
# Feature Selection with RFE (Recursive Feature Elimination)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [None]:
# Recursive Feature Elimination: repeatedly fit the estimator and discard the
# weakest feature until only the requested number remain.

model = LogisticRegression()

# `n_features_to_select` must be passed by keyword: the positional form
# `RFE(model, 3)` is deprecated and raises TypeError on scikit-learn >= 1.0.
rfe = RFE(model, n_features_to_select=3)

fit = rfe.fit(X, Y)

print("Num Features:", fit.n_features_)
# Boolean mask over the columns of X: True marks a kept feature.
print("Selected Features:", fit.support_)
# Rank 1 marks a selected feature; larger ranks were eliminated earlier.
print("Feature Ranking:", fit.ranking_)

In [None]:
# Feature Extraction with PCA
from sklearn.decomposition import PCA

# Fit a three-component PCA on the training features, then project those same
# rows onto the fitted components.
pca = PCA(n_components=3)
fit = pca.fit(X)
features = fit.transform(X)

# Fraction of the total variance explained by each retained component,
# followed by the projected data.
print("Explained Variance:", fit.explained_variance_ratio_)
print(features)

In [None]:
# Feature Importance with Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier


# Extra-trees is a randomized ensemble: without a fixed seed the importances
# printed here (and any predictions made later with this model) change on every
# run. Pin `random_state` so Restart & Run All reproduces the same numbers.
model = ExtraTreesClassifier(random_state=0)

model.fit(X, Y)

# One importance value per column of X; the values sum to 1.
print(model.feature_importances_)

In [None]:
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import matplotlib
%matplotlib inline

In [None]:
# Integer-typed copy of the whole table for plotting; `dataframe` itself is not modified.
# (bare_nuclei presumably holds digit strings because of its "?" entries, hence the cast.)
df = dataframe.astype(dtype=int)

In [None]:
# Pairwise scatter plots coloured by class.
# seaborn expects one marker per hue level, so build the marker list from the
# classes actually present instead of hard-coding three entries. A fixed list
# mismatches (warning or error, depending on the seaborn version) whenever the
# data does not contain exactly three classes.
# NOTE(review): `df` still contains sample_code_number, an identifier rather than
# a measurement; consider excluding it from the plot.
marker_cycle = ["o", "s", "D"]
class_count = len(set(df["classes"]))
sns.pairplot(
    df,
    hue="classes",
    markers=[marker_cycle[i % len(marker_cycle)] for i in range(class_count)],
)

In [None]:
# Predict a class for every held-out row. `model` is whichever estimator was
# bound to that name most recently: running the notebook top to bottom, that is
# the fitted ExtraTreesClassifier (the name was used earlier for the
# LogisticRegression handed to RFE).
preds = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Fraction of held-out rows whose predicted class equals the true class.
accuracy_score(test_Y, preds)

Task:

In [None]:
# Adult (census income) training data.
# The file has no header row, so pandas' default `header=0` would consume the
# first record as column labels, losing one sample and producing meaningless
# column names. Read every line as data and name the columns explicitly
# (attribute names follow the dataset's adult.names description; the target
# column is called "income" here).
# NOTE(review): fields are separated by ", ", so string values keep a leading
# space (e.g. " Private"); add `skipinitialspace=True` if that is unwanted.
salaries = pandas.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=[
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "sex",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income",
    ],
)

In [None]:
# Adult (census income) test data.
# Unlike a normal CSV, this file begins with a one-line non-data banner and has
# no header row. With the defaults, pandas takes the banner as the header and
# mis-parses every record, so skip that line, read the rest as data, and name
# the columns explicitly (attribute names follow the dataset's adult.names
# description; the target column is called "income" here).
# NOTE(review): labels in this file appear to end with a "." (e.g. " >50K.");
# normalise them before comparing against labels from adult.data.
salaries_test = pandas.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
    skiprows=1,
    header=None,
    names=[
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "sex",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income",
    ],
)