In [1]:
import numpy as np
import pandas as pd

#### part1 - Cat in the Dat

In [2]:
df_train = pd.read_csv(r"C:\Users\laava\Downloads\cat-in-the-dat-ii\train.csv")
df_test = pd.read_csv(r"C:\Users\laava\Downloads\cat-in-the-dat-ii\test.csv")

In [None]:
df_train.isnull().sum()

id           0
bin_0    11901
bin_1    12038
bin_2    11972
bin_3    11951
bin_4    11951
nom_0    12062
nom_1    11947
nom_2    12179
nom_3    12176
nom_4    11993
nom_5    11912
nom_6    12012
nom_7    12003
nom_8    11956
nom_9    12060
ord_0    11893
ord_1    12167
ord_2    12105
ord_3    12053
ord_4    11933
ord_5    12047
day      12025
month    11984
dtype: int64

In [4]:
df_test.isnull().sum()

id           0
bin_0    11901
bin_1    12038
bin_2    11972
bin_3    11951
bin_4    11951
nom_0    12062
nom_1    11947
nom_2    12179
nom_3    12176
nom_4    11993
nom_5    11912
nom_6    12012
nom_7    12003
nom_8    11956
nom_9    12060
ord_0    11893
ord_1    12167
ord_2    12105
ord_3    12053
ord_4    11933
ord_5    12047
day      12025
month    11984
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Separate target variable and features
X = df_train.drop(columns=['target'])
y = df_train['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Define preprocessing for numerical columns (impute missing values and scale)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns (impute missing values and one-hot encode)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Model accuracy: {accuracy}')

# Track feature importance
feature_importances = model.named_steps['classifier'].feature_importances_
feature_names = model.named_steps['preprocessor'].transformers_[0][1].named_steps['onehot'].get_feature_names_out(categorical_cols)
feature_names = list(numerical_cols) + list(feature_names)
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

Model accuracy: 0.8138333333333333


KeyError: 'onehot'

In [6]:
df_gem_train = pd.read_csv(r'C:\Users\laava\Desktop\sem 6\AOML\train.csv')
df_gem_test = pd.read_csv(r'C:\Users\laava\Desktop\sem 6\AOML\test.csv')

In [9]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Separate target variable and features
X_gem = df_gem_train.drop(columns=['price'])
y_gem = df_gem_train['price']

# Identify categorical and numerical columns
categorical_cols_gem = X_gem.select_dtypes(include=['object']).columns
numerical_cols_gem = X_gem.select_dtypes(include=['int64', 'float64']).columns

# Define preprocessing for numerical columns (impute missing values and scale)
numerical_transformer_gem = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Define preprocessing for categorical columns (impute missing values and one-hot encode)
categorical_transformer_gem = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor_gem = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_gem, numerical_cols_gem),
        ('cat', categorical_transformer_gem, categorical_cols_gem)
    ])

# Define the model
model_gem = Pipeline(steps=[
    ('preprocessor', preprocessor_gem),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Split the data into training and validation sets
X_train_gem, X_val_gem, y_train_gem, y_val_gem = train_test_split(X_gem, y_gem, test_size=0.2, random_state=42)

# Train the model
model_gem.fit(X_train_gem, y_train_gem)

# Make predictions
y_pred_gem = model_gem.predict(X_val_gem)

# Evaluate the model
mse = mean_squared_error(y_val_gem, y_pred_gem)
print(f'Model MSE: {mse}')

# Define parameter grid for GridSearchCV
param_grid = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Define parameter distribution for RandomizedSearchCV
param_dist = {
    'regressor__n_estimators': [int(x) for x in np.linspace(start=50, stop=200, num=10)],
    'regressor__max_features': ['auto', 'sqrt', 'log2'],
    'regressor__max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=model_gem, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
grid_search.fit(X_train_gem, y_train_gem)
print(f'Best parameters from GridSearchCV: {grid_search.best_params_}')
print(f'Best score from GridSearchCV: {grid_search.best_score_}')

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model_gem, param_distributions=param_dist, n_iter=100, cv=3, n_jobs=1, verbose=2, random_state=42)
random_search.fit(X_train_gem, y_train_gem)
print(f'Best parameters from RandomizedSearchCV: {random_search.best_params_}')
print(f'Best score from RandomizedSearchCV: {random_search.best_score_}')

Model MSE: 361859.3794106909
Fitting 3 folds for each of 324 candidates, totalling 972 fits
[CV] END regressor__max_depth=None, regressor__max_features=auto, regressor__min_samples_leaf=1, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   0.0s
[CV] END regressor__max_depth=None, regressor__max_features=auto, regressor__min_samples_leaf=1, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   0.0s
[CV] END regressor__max_depth=None, regressor__max_features=auto, regressor__min_samples_leaf=1, regressor__min_samples_split=2, regressor__n_estimators=50; total time=   0.0s
[CV] END regressor__max_depth=None, regressor__max_features=auto, regressor__min_samples_leaf=1, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.0s
[CV] END regressor__max_depth=None, regressor__max_features=auto, regressor__min_samples_leaf=1, regressor__min_samples_split=2, regressor__n_estimators=100; total time=   0.0s
[CV] END regressor__max_de

324 fits failed out of a total of 972.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
324 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\base.py", li

Best parameters from GridSearchCV: {'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 200}
Best score from GridSearchCV: 0.9781359256893363
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END regressor__max_depth=90, regressor__max_features=sqrt, regressor__min_samples_leaf=4, regressor__min_samples_split=10, regressor__n_estimators=50; total time=  10.1s
[CV] END regressor__max_depth=90, regressor__max_features=sqrt, regressor__min_samples_leaf=4, regressor__min_samples_split=10, regressor__n_estimators=50; total time=  10.4s
[CV] END regressor__max_depth=90, regressor__max_features=sqrt, regressor__min_samples_leaf=4, regressor__min_samples_split=10, regressor__n_estimators=50; total time=  10.1s
[CV] END regressor__max_depth=50, regressor__max_features=log2, regressor__min_samples_leaf=1, regressor__min_samples_split=5, regressor__n_estimators=183; tota

108 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\laava\anaconda3\Lib\site-packages\sklearn\base.py", li

Best parameters from RandomizedSearchCV: {'regressor__n_estimators': 133, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': None}
Best score from RandomizedSearchCV: 0.97812415075866
