In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### Task 1


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

# Load the data
housing_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/housing.csv')

# Display the first few rows of the dataframe to identify non-numeric columns
print(housing_data.head())

# Identify non-numeric columns
non_numeric_columns = housing_data.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_columns)

# Drop non-numeric columns
housing_data = housing_data.drop(columns=non_numeric_columns)

# Check for any remaining non-numeric columns
print("Remaining columns after dropping non-numeric ones:", housing_data.columns)

# Split the data into features and target variable
X = housing_data.drop(columns=['Price'])  # 'Price' is the target variable
y = housing_data['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the linear regression model
lr = LinearRegression()

# Initialize the Sequential Feature Selector
sfs = SFS(lr,
          k_features='best',
          forward=True,
          floating=False,
          verbose=2,
          scoring='r2',
          cv=5)

# Fit the Sequential Feature Selector
sfs.fit(X_train, y_train)

# Print the results
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Missing values in y_train:", y_train.isnull().sum())


   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0       79545.45857             5.682861                   7.009188   
1       79248.64245             6.002900                   6.730821   
2       61287.06718             5.865890                   8.512727   
3       63345.24005             7.188236                   5.586729   
4       59982.19723             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  \
0                          4.09      23086.80050  1.059034e+06   
1                          3.09      40173.07217  1.505891e+06   
2                          5.13      36882.15940  1.058988e+06   
3                          3.26      34310.24283  1.260617e+06   
4                          4.23      26354.10947  6.309435e+05   

                                             Address  
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...  
1  188 Johnson Views Suite 079\nLake Kathleen, CA...  
2  9127 Eli


[2024-05-23 18:13:18] Features: 1/5 -- score: 0.41110861934095144
[2024-05-23 18:13:19] Features: 2/5 -- score: 0.6136494236858007
[2024-05-23 18:13:19] Features: 3/5 -- score: 0.7974378075163188
[2024-05-23 18:13:19] Features: 4/5 -- score: 0.9173756833163236
[2024-05-23 18:13:19] Features: 5/5 -- score: 0.9174131909724943

### Task 2


In [8]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

lr = LogisticRegression(class_weight='balanced', solver='lbfgs', random_state=42, n_jobs=-1, max_iter=500)
lr.fit(X, y)

bfs = SFS(lr,
          k_features='best',
          forward=False,
          floating=False,
          verbose=2,
          scoring='accuracy',
          cv=5)

bfs.fit(X, y)

features = bfs.k_feature_names_
print("Selected features:", features)












Selected features: ('1', '3', '4', '5', '6', '7', '8')



[2024-05-23 18:38:54] Features: 1/1 -- score: 0.0045199182839632274

### Task 3

In [9]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
import pandas as pd

iris = load_iris()
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
y_series = iris.target

knn = KNeighborsClassifier(n_neighbors=3)

efs1 = EFS(knn, min_features=1, max_features=4, scoring='accuracy', print_progress=True, cv=5)

efs1.fit(X_df, y_series)

print(f'Best accuracy score: {efs1.best_score_:.2f}')
print(f'Best subset (indices): {efs1.best_idx_}')
print(f'Best subset (corresponding names): {efs1.best_feature_names_}')


Features: 15/15

Best accuracy score: 0.97
Best subset (indices): (0, 2, 3)
Best subset (corresponding names): ('sepal length (cm)', 'petal length (cm)', 'petal width (cm)')


### Task 4

In [10]:
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np

iris = load_iris()
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
y_series = iris.target

rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=2)

model = GradientBoostingClassifier()

pipe = Pipeline([('feature_selection', rfe), ('model', model)])

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=36851234)

n_scores = cross_val_score(pipe, X_df, y_series, scoring='accuracy', cv=cv, n_jobs=-1)

print("Mean Accuracy Score:", np.mean(n_scores))

pipe.fit(X_df, y_series)

selected_features = pipe.named_steps['feature_selection'].support_
print("Selected Features:", selected_features)

Mean Accuracy Score: 0.96
Selected Features: [False False  True  True]


### Task 5

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso())
])

param_grid = {'model__alpha': np.arange(0.1, 10, 0.1)}

search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_mean_squared_error", verbose=3)
search.fit(X_train, y_train)

coefficients = search.best_estimator_.named_steps['model'].coef_

importance = np.abs(coefficients)
print("Absolute coefficients:", importance)

selected_features_indices = np.where(importance > 0)[0]
print("Selected features indices:", selected_features_indices)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END ..............model__alpha=0.1;, score=-3051.708 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.1;, score=-3405.374 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.1;, score=-2593.337 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.1;, score=-3309.628 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.1;, score=-3512.110 total time=   0.0s
[CV 1/5] END ..............model__alpha=0.2;, score=-3057.672 total time=   0.0s
[CV 2/5] END ..............model__alpha=0.2;, score=-3413.561 total time=   0.0s
[CV 3/5] END ..............model__alpha=0.2;, score=-2606.353 total time=   0.0s
[CV 4/5] END ..............model__alpha=0.2;, score=-3317.427 total time=   0.0s
[CV 5/5] END ..............model__alpha=0.2;, score=-3502.696 total time=   0.0s
[CV 1/5] END model__alpha=0.30000000000000004;, score=-3057.759 total time=   0.0s
[CV 2/5] END model__alpha=0.30000000000000004

### Task 6

In [12]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

iris_data = load_iris()
X = iris_data.data
y = iris_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

feature_importances = rf.feature_importances_

print("Feature Importances:")
for i, importance in enumerate(feature_importances):
    print(f"Feature {i+1}: {importance:.4f}")

Feature Importances:
Feature 1: 0.0984
Feature 2: 0.0410
Feature 3: 0.4094
Feature 4: 0.4513
