In [212]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('train.csv')
data['Col 2'] = data['Col 2'].astype('category')
data['Col 2'] = data['Col 2'].cat.codes

test_data = pd.read_csv('x_test.csv')
test_data['Col 2'] = test_data['Col 2'].astype('category')
test_data['Col 2'] = test_data['Col 2'].cat.codes

y = data['y']
X = data.drop(['y','id'], axis=1)
X_test = test_data.drop(['id'], axis=1)

# Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression
selector = SelectKBest(f_regression, k=10)
selector.fit(X, y)
X = X[X.columns[selector.get_support(indices=True)]]
X_test = X_test[X_test.columns[selector.get_support(indices=True)]]

# Feature Selection with polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_test = poly.fit_transform(X_test)



from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

models = [
    {'name': 'Lasso Regression', 'model': Lasso(), 'params': {'alpha': [0.025, 0.05, 0.075, 0.1, 0.2, 0.3],'fit_intercept': [True, False]}},
]

for modell in models:
    # Cross Validation with params
    grid = GridSearchCV(modell['model'], modell['params'], cv=5, scoring='r2')
    grid.fit(x_train, y_train)
    
    # Now train with best params
    model = modell['model'].set_params(**grid.best_params_)
    model.fit(x_train, y_train)

    # Predict
    y_pred = model.predict(x_test)

    # Evaluate
    print(model)
    print('R2 Score: ', r2_score(y_test, y_pred))
    print('MSE: ', mean_squared_error(y_test, y_pred))
    print('CV Score R2: ', cross_val_score(model, x_train, y_train, cv=5, scoring='r2').mean())
    print('CV Score MSE: ', abs(cross_val_score(model, x_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()))
    print('-------------------------------------------')

    # Predict on test data
    y_pred = model.predict(X_test)
    test_data['y'] = y_pred
    test_data[['id', 'y']].to_csv(f'{modell["name"]}.csv', index=False)







Lasso(alpha=0.025, fit_intercept=False)
R2 Score:  0.3654439527393629
MSE:  3498.7778284305787
CV Score R2:  0.4601424921091479
CV Score MSE:  3085.3431807930397
-------------------------------------------
