In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Sample DataFrame
data = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [10, 20, 30, 40, 50],
    'target': [15, 25, 35, 45, 55]
}
df = pd.DataFrame(data)

# Split features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize and fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

print("Predictions:", y_pred)

# With scaling input variables

In [None]:
from sklearn.preprocessing import StandardScaler
# Sample DataFrame
data = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [10, 20, 30, 40, 50],
    'target': [15, 25, 35, 45, 55]
}
df = pd.DataFrame(data)

# Split features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data, then transform both training and test sets
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optionally, convert back to a DataFrame to keep the column names
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

# Initialize and fit the model using the scaled data
model = LinearRegression()
model.fit(X_train_scaled_df, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled_df)

print("Predictions:", y_pred)

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Sample DataFrame
data = {
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [10, 20, 30, 40, 50],
    'target': [15, 25, 35, 45, 55]
}
df = pd.DataFrame(data)

# Split features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Create a pipeline with StandardScaler and LinearRegression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Perform cross-validation
# Use scoring='neg_mean_squared_error' for regression; you can change the scoring metric as needed
scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')

# Convert negative MSE scores to positive values for readability
mse_scores = -scores
print("Cross-validated MSE scores:", mse_scores)
print("Average MSE:", mse_scores.mean())

Cross-validated MSE scores: [1.13595970e-28 0.00000000e+00 0.00000000e+00 5.04870979e-29
 2.01948392e-28]
Average MSE: 7.320629200450989e-29


## Ridge Regression

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Step 1: Generate a synthetic time series dataset
np.random.seed(0)
n_points = 100
date_range = pd.date_range(start='2020-01-01', periods=n_points, freq='D')
data = {
    'date': date_range,
    'feature1': np.sin(np.linspace(0, 20, n_points)) + np.random.normal(scale=0.1, size=n_points),
    'feature2': np.cos(np.linspace(0, 20, n_points)) + np.random.normal(scale=0.1, size=n_points),
    'target': np.sin(np.linspace(0, 20, n_points)) + 0.5 * np.cos(np.linspace(0, 10, n_points)) + np.random.normal(scale=0.1, size=n_points)
}
df = pd.DataFrame(data).set_index('date')

# Step 2: Split features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Step 3: Initialize TimeSeriesSplit for time series cross-validation
tss = TimeSeriesSplit(n_splits=5)

# Step 4: Define a parameter grid for alpha values for Ridge regression
param_grid = {'ridge__alpha': [10**x for x in range(-4, 7)]}

# Step 5: Create a pipeline with StandardScaler and Ridge regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# Step 6: Use GridSearchCV to find the optimal alpha value
ridge_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=tss, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_cv.fit(X, y)

# Step 7: Make predictions using the best estimator from the grid search
Y_pred_ridge = ridge_cv.predict(X)

# Step 8: Print evaluation metrics
print(f"MAE: {mean_absolute_error(y, Y_pred_ridge):.4f}")
print(f"MSE: {mean_squared_error(y, Y_pred_ridge):.4f}")
print(f"R^2: {r2_score(y, Y_pred_ridge):.4f}")
print(f"Best alpha: {ridge_cv.best_estimator_.named_steps['ridge'].alpha}")

# Step 9: Display the coefficients for each feature and the intercept
best_model = ridge_cv.best_estimator_.named_steps['ridge']
print("Coefficients for each feature:")
for feature, coef in zip(X.columns, best_model.coef_):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {best_model.intercept_:.4f}")

MAE: 0.3221
MSE: 0.1418
R^2: 0.8039
Best alpha: 0.0001
Coefficients for each feature:
feature1: 0.7657
feature2: -0.0513
Intercept: 0.0013


In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Step 1: Generate a synthetic time series dataset
np.random.seed(0)
n_points = 100
date_range = pd.date_range(start='2020-01-01', periods=n_points, freq='D')
data = {
    'date': date_range,
    'feature1': np.sin(np.linspace(0, 20, n_points)) + np.random.normal(scale=0.1, size=n_points),
    'feature2': np.cos(np.linspace(0, 20, n_points)) + np.random.normal(scale=0.1, size=n_points),
    'target': np.sin(np.linspace(0, 20, n_points)) + 0.5 * np.cos(np.linspace(0, 10, n_points)) + np.random.normal(scale=0.1, size=n_points)
}
df = pd.DataFrame(data).set_index('date')

# Step 2: Split features and target
X = df[['feature1', 'feature2']]
y = df['target']

# Step 3: Initialize TimeSeriesSplit for time series cross-validation
tss = TimeSeriesSplit(n_splits=5)

# Step 4: Define a parameter grid for alpha values for Lasso regression
param_grid = {'lasso__alpha': [10**x for x in range(-4, 7)]}

# Step 5: Create a pipeline with StandardScaler and Lasso regression
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso())
])

# Step 6: Use GridSearchCV to find the optimal alpha value
lasso_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=tss, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_cv.fit(X, y)

# Step 7: Make predictions using the best estimator from the grid search
Y_pred_lasso = lasso_cv.predict(X)

# Step 8: Print evaluation metrics
print(f"MAE: {mean_absolute_error(y, Y_pred_lasso):.4f}")
print(f"MSE: {mean_squared_error(y, Y_pred_lasso):.4f}")
print(f"R^2: {r2_score(y, Y_pred_lasso):.4f}")
print(f"Best alpha: {lasso_cv.best_estimator_.named_steps['lasso'].alpha}")

# Step 9: Display the coefficients for each feature and the intercept
best_model = lasso_cv.best_estimator_.named_steps['lasso']
print("Coefficients for each feature:")
for feature, coef in zip(X.columns, best_model.coef_):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {best_model.intercept_:.4f}")

MAE: 0.3432
MSE: 0.1545
R^2: 0.7865
Best alpha: 0.1
Coefficients for each feature:
feature1: 0.6609
feature2: -0.0000
Intercept: 0.0013
