<a href="https://colab.research.google.com/github/owend23/ML/blob/main/data_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import samples_generator
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline



In [2]:
# generate sample data
X, y = samples_generator.make_classification(
    n_informative=4, n_features=20, n_redundant=0,
    random_state=5)

In [3]:
# Feature selector
selector_k_best = SelectKBest(f_regression, k=10)

In [8]:
# Random forest classifier
classifier = RandomForestClassifier(n_estimators=50, max_depth=4)

In [10]:
# Build the machine learning pipeline
pipeline_classifier = Pipeline([('selector', selector_k_best),
                                ('rf', classifier)])

In [11]:
# Training the classifier
pipeline_classifier.fit(X, y)

Pipeline(memory=None,
         steps=[('selector',
                 SelectKBest(k=10,
                             score_func=<function f_regression at 0x7f1a83dd2170>)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=4, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=50, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
    

In [13]:
prediction = pipeline_classifier.predict(X)
print("Predictions:\n", prediction)

Predictions:
 [1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 1
 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 0 1]


In [14]:
# Print score
print("Score:", pipeline_classifier.score(X, y))

Score: 0.99


In [15]:
# Print selected features chosen by the selector
features_status = pipeline_classifier.named_steps['selector'].get_support()
selected_features = []
for count, item in enumerate(features_status):
  if item:
    selected_features.append(count)

print("Selected features (0-indexed):", ', '.join([str(x) for x in selected_features]))

Selected features (0-indexed): 0, 5, 7, 9, 10, 11, 13, 15, 16, 18
