### Practical Coding

1. Classification and Lasso Regression

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

data = load_breast_cancer()
X,y = pd.DataFrame(data.data, columns=data.feature_names), data.target

X_scaled = StandardScaler().fit_transform(X)

In [None]:
from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=5)
lasso.fit(X, y)

selected_features = X.columns[(lasso.coef_ != 0)]
selected_features = X.columns[(lasso.coef_ != 0)]
print("Selected Features: ", list(selected_features))

Selected Features:  ['mean area', 'worst texture', 'worst perimeter', 'worst area']


In [None]:
X_selected = pd.DataFrame(X_scaled, columns=X.columns)[selected_features]

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

In [15]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.97      0.86      0.91        43
           1       0.92      0.99      0.95        71

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



In [16]:
roc_auc_score(y_test, y_proba)

0.9942679331804782

In [20]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist ={
    'n_estimators': randint(20, 100),
    'max_depth': randint(3, 10)
}

search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=10, cv=3, random_state=42)
search.fit(X_train, y_train)
print("Best params: ", search.best_params_)

Best params:  {'max_depth': 7, 'n_estimators': 40}


In [22]:
importances = search.best_estimator_.feature_importances_
for f, i in zip(selected_features, importances):
    print(f'Feature: |{f}| and importance: |{i}|')

Feature: |mean area| and importance: |0.15409946070497377|
Feature: |worst texture| and importance: |0.0864145080805049|
Feature: |worst perimeter| and importance: |0.37508351706555043|
Feature: |worst area| and importance: |0.3844025141489708|


2. Over Sampling and Under Sampling


In [None]:
from sklearn.datasets import make_classification
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=1000, n_features=2, n_redundant=0, n_clusters_per_class=1, weights=[0.9, 0.1], flip_y=0, random_state=42)
X, y = make_classification(n_samples=1000, n_features=2, n_clusters_per_class=1, weights=[0.9, 0.1],  )
Counter(y)

Counter({0: 900, 1: 100})

In [24]:
smote = SMOTE(random_state=42)
X_over, y_over = smote.fit_resample(X,y)
Counter(y_over)

Counter({0: 900, 1: 900})

In [25]:
under = RandomUnderSampler(random_state=42)
X_under, y_under = under.fit_resample(X, y)
Counter(y_under)

Counter({0: 100, 1: 100})

### Repeat Week Answer

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [6]:
X = df[['pclass', 'sex', 'age', 'sibsp', 'fare', 'embarked']]
y = df['survived']

In [7]:
num_cols = X.select_dtypes(include=['number']).columns.to_list()
cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()

In [8]:
preprocessor = ColumnTransformer([
    ('SimpleImputer1', SimpleImputer(), ['age']),
    ('cat', Pipeline([
        ('imputer2', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first')),
    ]), cat_cols)
], remainder='passthrough')

pipline = Pipeline([
    ('preprocessor', preprocessor),
    ('scale', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42))
])

In [9]:
param_dist = {
    'clf__n_estimators': [50, 100, 120],
    'clf__max_depth': [None, 5, 20],
    'clf__min_samples_split': [2, 5, 10],
}

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)

grid= GridSearchCV(pipline, param_grid=param_dist,cv=3)

grid.fit(X_train, y_train)

grid.best_estimator_

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [10]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       111
           1       0.77      0.72      0.74        68

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [11]:
accuracy_score(y_test, y_pred)

0.8100558659217877