In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# data = pd.read_csv('train.csv')
# data['Col 2'] = data['Col 2'].astype('category')
# data['Col 2'] = data['Col 2'].cat.codes

# test_data = pd.read_csv('x_test.csv')
# test_data['Col 2'] = test_data['Col 2'].astype('category')
# test_data['Col 2'] = test_data['Col 2'].cat.codes

y = data['y']
X = data.drop(['y','id'], axis=1)
X_test = test_data.drop(['id'], axis=1)

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=4)
selector.fit(X, y)
X = X[X.columns[selector.get_support(indices=True)]]
X_test = X_test[X_test.columns[selector.get_support(indices=True)]]

# Feature Selection with polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_test = poly.fit_transform(X_test)



from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

models = [
    {'name': 'Lasso Regression', 'model': Lasso(), 'params': {'alpha': [0.1, 0.5, 1, 2, 5, 10, 20, 50, 100]}},
    {'name': 'Random Forest', 'model': RandomForestRegressor(), 'params': {'bootstrap': [True], 'max_depth': [4], 'max_features': ['auto'], 'min_samples_leaf': [5], 'min_samples_split': [3], 'n_estimators': [311],}},
    {'name': 'Gradient Boosting Regressor', 'model': GradientBoostingRegressor(), 'params': {'n_estimators':[350], 'min_samples_split':[15], 'max_depth':[1], 'learning_rate':[0.01],}},
]

for modell in models:
    # Cross Validation with params
    grid = GridSearchCV(modell['model'], modell['params'], cv=5, scoring='r2')
    grid.fit(x_train, y_train)
    
    # Now train with best params
    model = modell['model'].set_params(**grid.best_params_)
    model.fit(x_train, y_train)

    # Predict
    y_pred = model.predict(x_test)

    # Evaluate
    print(model)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('CV Score R2: ', cross_val_score(model, x_train, y_train, cv=5, scoring='r2').mean())
    print('CV Score MSE: ', abs(cross_val_score(model, x_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()))
    print('-------------------------------------------')

    # # Predict on test data
    # y_pred = model.predict(x_test)
    # test_data['y'] = y_pred
    # test_data[['id', 'y']].to_csv(f'{modell["name"]}.csv', index=False)







Lasso(alpha=0.1)
R2 Score:  0.4778632052932068
MSE:  2597.8243153674566
CV Score R2:  0.3825445230582888
CV Score MSE:  3669.9434656308963
-------------------------------------------
RandomForestRegressor(max_depth=4, max_features='auto', min_samples_leaf=5,
                      min_samples_split=3, n_estimators=311)
R2 Score:  0.35319620748222946
MSE:  3218.088892620788
CV Score R2:  0.40201606839922865
CV Score MSE:  3473.483656161877
-------------------------------------------
GradientBoostingRegressor(learning_rate=0.01, max_depth=1, min_samples_split=15,
                          n_estimators=350)
R2 Score:  0.4208316159632409
MSE:  2881.57763635039
CV Score R2:  0.3743878054734526
CV Score MSE:  3679.8105224847204
-------------------------------------------


In [47]:
from sklearn.ensemble import VotingRegressor

y = data['y']
X = data.drop(['y','id'], axis=1)
X_test = test_data.drop(['id'], axis=1)
# Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=4)
selector.fit(X, y)
X = X[X.columns[selector.get_support(indices=True)]]
X_test = X_test[X_test.columns[selector.get_support(indices=True)]]

# Feature Selection with polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_test = poly.fit_transform(X_test)



from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Define the individual regressor models
lasso_model = Lasso(alpha=0.1)  # You can adjust alpha as needed
random_forest_model = RandomForestRegressor(bootstrap=True, max_depth=4, max_features='auto', min_samples_leaf=5, min_samples_split=3, n_estimators=311)
gradient_boosting_model = GradientBoostingRegressor(n_estimators=350, min_samples_split=15, max_depth=1, learning_rate=0.01)

# Create the Voting Regressor model
voting_regressor = VotingRegressor(estimators=[
    ('Lasso', lasso_model),
    ('RandomForest', random_forest_model),
    ('GradientBoosting', gradient_boosting_model)
])

# Fit the Voting Regressor model
voting_regressor.fit(x_train, y_train)

# Predict
y_pred = voting_regressor.predict(x_test)

# Evaluate
print("Voting Regressor")
print('R2 Score: ', r2_score(y_test, y_pred))
print('MSE: ', mean_squared_error(y_test, y_pred))

Voting Regressor
R2 Score:  0.48926304582296964
MSE:  3298.709833626974


In [56]:
# Predict on test data
y_pred = voting_regressor.predict(X_test)
test_data['y'] = y_pred
test_data[['id', 'y']].to_csv('Brian_Voting_4.csv', index=False)

In [50]:
test_data = pd.read_csv('x_test.csv')
# test_data['Col 2'] = test_data['Col 2'].astype('category')
# test_data['Col 2'] = test_data['Col 2'].cat.codes

X_test = test_data.drop(['id'], axis=1)

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=4)
selector.fit(X, y)
X = X[X.columns[selector.get_support(indices=True)]]
X_test = X_test[X_test.columns[selector.get_support(indices=True)]]

# Feature Selection with polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_test = poly.fit_transform(X_test)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [54]:
X_test[0]

array([ 1.00000000e+00, -6.22521820e-02, -7.45280244e-02,  4.46044580e-03,
       -3.58167281e-02,  3.87533416e-03,  4.63953214e-03, -2.77672484e-04,
        2.22966948e-03,  5.55442643e-03, -3.32428214e-04,  2.66934999e-03,
        1.98955767e-05, -1.59758574e-04,  1.28283801e-03])

![Alt text](image.png)
3278