In [3]:
# ✅ Ridge Regression - House Price Prediction

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np

# ----------------------------
# 1. Load Dataset
# ----------------------------
housing = fetch_california_housing(as_frame=True)
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target, name="PRICE")

print("Dataset Shape:", X.shape)
print(X.head())

# ----------------------------
# 2. Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------------------
# 3. Ridge Regression Model
# ----------------------------
ridge = Ridge(alpha=1.0)  # alpha = λ (regularization strength)
ridge.fit(X_train, y_train)

# ----------------------------
# 4. Predictions & Evaluation
# ----------------------------
y_pred = ridge.predict(X_test)
print("\n✅ Ridge Regression Results:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))
print("Coefficients:", ridge.coef_)


Dataset Shape: (20640, 8)
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  
0    -122.23  
1    -122.22  
2    -122.24  
3    -122.25  
4    -122.25  

✅ Ridge Regression Results:
Mean Squared Error: 0.5558034669932205
R2 Score: 0.5758549611440131
Coefficients: [ 4.48510924e-01  9.72596535e-03 -1.23014157e-01  7.81416761e-01
 -2.02581346e-06 -3.52585878e-03 -4.19786908e-01 -4.33680793e-01]


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ----------------------------
# 1. Load Stock Data from CSV
# ----------------------------
data = pd.read_csv("AAPL.csv")  # Ensure AAPL.csv is in the same folder
data.dropna(inplace=True)

# Rename columns if needed (Yahoo CSV may have spaces or different names)
data.columns = [col.strip() for col in data.columns]

# Convert 'Date' to datetime if needed and sort
data["Date"] = pd.to_datetime(data["Date"])
data.sort_values("Date", inplace=True)

print("Dataset Shape:", data.shape)
print(data.head())

# ----------------------------
# 2. Feature Engineering
# ----------------------------
data["Open_Close"] = data["Open"] - data["Close"]
data["High_Low"] = data["High"] - data["Low"]
data["Price_Change"] = data["Close"].pct_change()
data["Volume_Change"] = data["Volume"].pct_change()

data.dropna(inplace=True)

# Define Features & Target
X = data[["Open", "High", "Low", "Volume", "Open_Close", "High_Low", "Price_Change", "Volume_Change"]]
y = data["Close"]

# ----------------------------
# 3. Train-Test Split
# ----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ----------------------------
# 4. Ridge Regression Model
# ----------------------------
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# ----------------------------
# 5. Predictions & Evaluation
# ----------------------------
y_pred = ridge.predict(X_test)
print("\n✅ Ridge Regression - Stock Price Prediction")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))
print("Coefficients:", ridge.coef_)

# ----------------------------
# 6. Next Day Closing Price Prediction (Example)
# ----------------------------
last_row = X.iloc[-1:].values
predicted_price = ridge.predict(last_row)
print("\nPredicted Next Closing Price:", predicted_price[0])


Dataset Shape: (782, 7)
        Date    Open    High    Low   Close  Adj Close    Volume
0 2022-01-03  129.96  185.04  98.23  154.29     176.12  44443130
1 2022-01-04  176.06  186.91  94.24  128.95     167.08  78668521
2 2022-01-05  158.56  183.63  96.68  147.49     164.41  37397770
3 2022-01-06  147.89  198.17  90.96  100.81     165.84  51252240
4 2022-01-07  112.48  191.67  96.24  150.89     174.62  49492703

✅ Ridge Regression - Stock Price Prediction
Mean Squared Error: 3.291322435848789e-08
R2 Score: 0.9999999999384154
Coefficients: [ 9.99985977e-01  4.35753838e-08 -9.92147735e-07 -2.83033122e-13
 -9.99988962e-01  1.03572678e-06  6.80467640e-04 -9.44056649e-06]

Predicted Next Closing Price: 106.49011467733104


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [19]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Sample car dataset (you can use any CSV with similar columns)
# url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/car%20price%20prediction.csv"
df = pd.read_csv("Car_prediction.csv")

# Feature selection
X = df[["year", "selling_price", "km_driven", "engine", "max_power"]].fillna(0)
y = df["selling_price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print("\n✅ Car Price Estimation")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))



✅ Car Price Estimation
MSE: 3.703033227817933e-19
R2: 1.0


In [23]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/Advertising.csv"
data = pd.read_csv("Advertising_5000_rows.csv")

X = data[["TV", "Radio", "Newspaper"]]  # features
y = data["Sales"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge = Ridge(alpha=0.5)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print("\n✅ Sales Forecasting")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))



✅ Sales Forecasting
MSE: 1.0075477678519802
R2: 0.9676379845280252


In [26]:
import seaborn as sns

data = pd.read_csv("insurance_5000_rows.csv")  # built-in dataset

# Encode categorical
data = pd.get_dummies(data, drop_first=True)

X = data.drop("charges", axis=1)
y = data["charges"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print("\n✅ Insurance Premium Prediction")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))



✅ Insurance Premium Prediction
MSE: 934555.4492739735
R2: 0.967203890737032


# #️⃣ LASSO REGRESSION PROJECTS (1 – 10)

# ✅ 1. Feature Selection for House Price Prediction
__Formula Explanation__
__Lasso modifies the cost function of linear regression by adding absolute value penalties:__

In [60]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

housing = fetch_california_housing(as_frame=True)
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

print("✅ House Price Feature Selection")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R2:", r2_score(y_test, y_pred))

# Important selected features
selected_features = pd.Series(lasso.coef_, index=X.columns)
print("\nSelected Features (Non-zero Coefficients):")
print(selected_features[selected_features != 0])


✅ House Price Feature Selection
MSE: 0.6135115198058131
R2: 0.5318167610318159

Selected Features (Non-zero Coefficients):
MedInc        0.392693
HouseAge      0.015081
Population    0.000016
AveOccup     -0.003149
Latitude     -0.114291
Longitude    -0.099308
dtype: float64


# ✅ 2. Gene Selection in Medical Research
__(Selects important genes influencing disease risk)__

In [51]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

# Simulated dataset (100 samples, 10 gene features)
np.random.seed(42)
X = pd.DataFrame(np.random.rand(100, 10), columns=[f"Gene_{i}" for i in range(1, 11)])
y = 3*X["Gene_3"] + 2*X["Gene_7"] + np.random.randn(100)  # only Gene_3 & Gene_7 matter

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

selected_genes = pd.Series(lasso.coef_, index=X.columns)
print("\n✅ Important Genes:")
print(selected_genes[selected_genes != 0])


✅ Important Genes:
Gene_3    1.777842
Gene_7    1.094487
dtype: float64


# ✅ 3. Credit Risk Scoring
__(Selects important financial factors predicting loan default)__

In [52]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
# url = "https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv"
df = pd.read_csv("Credit_Risk_Scoring_Important_Features.csv").dropna()

X = df.drop("Balance", axis=1)
X = pd.get_dummies(X, drop_first=True)
y = df["Balance"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

selected_features = pd.Series(lasso.coef_, index=X.columns)
print("\n✅ Important Credit Risk Factors:")
print(selected_features[selected_features != 0])

KeyError: "['Balance'] not found in axis"

# ✅ 4. Customer Churn Prediction
__(Selects key features causing customer churn)__

In [53]:
url = "https://raw.githubusercontent.com/blastchar/telco-customer-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(url)

# Encode data
df = df.dropna()
df = pd.get_dummies(df, drop_first=True)
X = df.drop("Churn_Yes", axis=1)
y = df["Churn_Yes"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

selected_features = pd.Series(lasso.coef_, index=X.columns)
print("\n✅ Important Features for Customer Churn:")
print(selected_features[selected_features != 0])


URLError: <urlopen error [Errno 11001] getaddrinfo failed>

# ✅ 5. Loan Default Prediction
__(Identifies important borrower features)__

In [54]:
url = "https://raw.githubusercontent.com/selva86/datasets/master/GermanCredit.csv"
df = pd.read_csv(url)

df = pd.get_dummies(df, drop_first=True)
X = df.drop("Class_Good", axis=1)
y = df["Class_Good"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

selected_features = pd.Series(lasso.coef_, index=X.columns)
print("\n✅ Important Loan Default Predictors:")
print(selected_features[selected_features != 0])


URLError: <urlopen error [Errno 11001] getaddrinfo failed>

# ✅ 6. Startup Success Prediction
__(Identifies key features affecting funding success)__

In [55]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

np.random.seed(42)
X = pd.DataFrame({
    "Team_Size": np.random.randint(1,50,100),
    "Funding_Rounds": np.random.randint(1,10,100),
    "Market_Size": np.random.rand(100)*100,
    "Competition_Level": np.random.rand(100)*10
})
y = 0.8*X["Team_Size"] + 1.5*X["Market_Size"] + np.random.randn(100)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

print("\n✅ Important Startup Success Predictors:")
print(pd.Series(lasso.coef_, index=X.columns))



✅ Important Startup Success Predictors:
Team_Size            0.800333
Funding_Rounds      -0.000000
Market_Size          1.499549
Competition_Level   -0.028452
dtype: float64


# ✅ 7. Feature Selection for Sentiment Analysis
__(Selects important words affecting sentiment score)__

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
texts = ["good product", "bad quality", "excellent service", "worst ever", "happy customer"]
y = [1, 0, 1, 0, 1]

cv = CountVectorizer()
X = cv.fit_transform(texts).toarray()
X = pd.DataFrame(X, columns=cv.get_feature_names_out())

lasso = Lasso(alpha=0.01)
lasso.fit(X, y)

print("\n✅ Important Words for Sentiment:")
print(pd.Series(lasso.coef_, index=X.columns))



✅ Important Words for Sentiment:
bad         -0.716987
customer     0.000000
ever        -0.798719
excellent    0.000000
good         0.000000
happy        0.000000
product      0.000000
quality     -0.199680
service      0.000000
worst       -0.117947
dtype: float64


# ✅ 8. Disease Risk Prediction
__(Selects important symptoms predicting disease)__

In [57]:
np.random.seed(42)
X = pd.DataFrame(np.random.rand(100,8), columns=[f"Symptom_{i}" for i in range(1,9)])
y = 4*X["Symptom_2"] + 3*X["Symptom_5"] + np.random.randn(100)

lasso = Lasso(alpha=0.1)
lasso.fit(X, y)

print("\n✅ Important Symptoms:")
print(pd.Series(lasso.coef_, index=X.columns))



✅ Important Symptoms:
Symptom_1    0.000000
Symptom_2    3.340880
Symptom_3    0.000000
Symptom_4    0.000000
Symptom_5    2.160574
Symptom_6   -0.000000
Symptom_7   -0.000000
Symptom_8   -0.000000
dtype: float64


# ✅ 9. Salary Prediction based on Multiple Skills
__(Selects most impactful skills)__

In [58]:
np.random.seed(42)
X = pd.DataFrame({
    "Python": np.random.randint(0,10,50),
    "Java": np.random.randint(0,10,50),
    "SQL": np.random.randint(0,10,50),
    "ML": np.random.randint(0,10,50)
})
y = 5000*X["Python"] + 7000*X["ML"] + np.random.randn(50)*1000

lasso = Lasso(alpha=1000)
lasso.fit(X, y)

print("\n✅ Important Skills for Salary:")
print(pd.Series(lasso.coef_, index=X.columns))



✅ Important Skills for Salary:
Python    4910.737672
Java        -0.000000
SQL         -0.000000
ML        6872.264979
dtype: float64


# ✅ 10. Customer Lifetime Value (CLV) Prediction
__(Selects important behavioral factors)__

In [59]:
np.random.seed(42)
X = pd.DataFrame({
    "Purchase_Frequency": np.random.randint(1,20,100),
    "Average_Order_Value": np.random.rand(100)*100,
    "Website_Visits": np.random.randint(1,50,100),
    "Discount_Usage": np.random.rand(100)*10
})
y = 10*X["Purchase_Frequency"] + 5*X["Average_Order_Value"] + np.random.randn(100)

lasso = Lasso(alpha=0.5)
lasso.fit(X, y)

print("\n✅ Important CLV Factors:")
print(pd.Series(lasso.coef_, index=X.columns))



✅ Important CLV Factors:
Purchase_Frequency     9.967019
Average_Order_Value    5.003885
Website_Visits        -0.004077
Discount_Usage         0.000000
dtype: float64
