In [13]:
#1
import numpy as np
import pandas as pd


In [3]:
# Step 1: Seed NumPy's global RNG so the synthetic data is the same on every run
np.random.seed(seed=42)

# Step 2: Dataset dimensions used by the generation cells below
# (n_samples = number of rows, n_features = number of feature columns)
n_samples, n_features = 500, 7

In [5]:
# Step 3: Build the covariance matrix: 1.0 on the diagonal (unit variance) and
# base_corr everywhere else, so every pair of features is strongly correlated
base_corr = 0.9
on_diagonal = np.eye(n_features, dtype=bool)
cov = np.where(on_diagonal, 1.0, base_corr)

# Step 4: Draw n_samples rows from a zero-mean multivariate normal with that covariance
mean = np.zeros(n_features)
X = np.random.multivariate_normal(mean=mean, cov=cov, size=n_samples)

In [7]:
# Step 5: Ground-truth coefficients (one per feature) and intercept
true_weights = np.array([2.5, -1.8, 1.2, 0.8, 0.5, 1.5, -0.7])
bias = 3.0

# Step 6: Generate the target as a linear function of the features plus noise:
#   y = X . true_weights + bias + noise
# where the noise is Gaussian with mean 0 and standard deviation 1.5.
noise = np.random.normal(loc=0, scale=1.5, size=n_samples)
y = X @ true_weights + bias + noise

In [9]:
# Step 7: Assemble the features and the target into a single DataFrame
columns = [f'Feature_{i+1}' for i in range(n_features)]
df = pd.DataFrame(X, columns=columns).assign(Target=y)

# Step 8: Show the pairwise correlations between all features and the target
print("Feature Correlation Matrix:")
print(df.corr())

Feature Correlation Matrix:
           Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Feature_6  \
Feature_1   1.000000   0.897950   0.889118   0.903796   0.896104   0.902124   
Feature_2   0.897950   1.000000   0.891180   0.906176   0.906239   0.898228   
Feature_3   0.889118   0.891180   1.000000   0.904157   0.905161   0.898464   
Feature_4   0.903796   0.906176   0.904157   1.000000   0.892401   0.911539   
Feature_5   0.896104   0.906239   0.905161   0.892401   1.000000   0.901148   
Feature_6   0.902124   0.898228   0.898464   0.911539   0.901148   1.000000   
Feature_7   0.895656   0.906879   0.899301   0.911495   0.894585   0.904232   
Target      0.897466   0.788642   0.864259   0.863332   0.846861   0.877392   

           Feature_7    Target  
Feature_1   0.895656  0.897466  
Feature_2   0.906879  0.788642  
Feature_3   0.899301  0.864259  
Feature_4   0.911495  0.863332  
Feature_5   0.894585  0.846861  
Feature_6   0.904232  0.877392  
Feature_7   1.000000  0.81478

In [11]:
# Step 9: Write the dataset to a CSV file in the working directory.
# index=False keeps the row index out of the file.
df.to_csv(path_or_buf="highly_correlated_dataset.csv", index=False)
print("\nDataset saved as 'highly_correlated_dataset.csv'")


Dataset saved as 'highly_correlated_dataset.csv'


In [15]:
#2
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Read the Hitters CSV from the working directory and report its size
df = pd.read_csv(filepath_or_buffer="Hitters (1).csv")
print("Initial shape:", df.shape)
df.head()


Initial shape: (322, 20)


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,293,66,1,30,29,14,1,293,66,1,30,29,14,A,E,446,33,20,,A
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N


In [17]:
# Step 3: Count the missing entries in each column
print("\nMissing values per column:\n")
missing_per_column = df.isna().sum()
print(missing_per_column)



Missing values per column:

AtBat         0
Hits          0
HmRun         0
Runs          0
RBI           0
Walks         0
Years         0
CAtBat        0
CHits         0
CHmRun        0
CRuns         0
CRBI          0
CWalks        0
League        0
Division      0
PutOuts       0
Assists       0
Errors        0
Salary       59
NewLeague     0
dtype: int64


In [19]:
# Step 4: Handle missing values
# Rows without a Salary (the target) are discarded rather than imputed
df = df.dropna(subset=['Salary'])

# Any gaps left in numeric columns are filled with that column's median
column_medians = df.median(numeric_only=True)
df = df.fillna(value=column_medians)
print("\nShape after handling missing values:", df.shape)



Shape after handling missing values: (263, 20)


In [21]:
# Step 5: Encode categorical columns as integers
# Categorical columns are the ones pandas stores with object dtype
cat_cols = df.select_dtypes(include=['object']).columns
print("\nCategorical columns:", list(cat_cols))

# One LabelEncoder instance is refit on each column in turn, so after this cell
# `encoder` only remembers the mapping of the last column it processed.
encoder = LabelEncoder()
df = df.assign(**{col: encoder.fit_transform(df[col]) for col in cat_cols})

df.head()



Categorical columns: ['League', 'Division', 'NewLeague']


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,1,1,632,43,10,475.0,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,0,1,880,82,14,480.0,0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,1,0,200,11,3,500.0,1
4,321,87,10,39,42,30,2,396,101,12,48,46,33,1,0,805,40,4,91.5,1
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,0,1,282,421,25,750.0,0


In [23]:
# Step 6: Handle noise / outliers (optional but useful)
# Keep only the rows whose Salary is strictly below the 99th percentile
q_high = df['Salary'].quantile(0.99)
below_cutoff = df['Salary'] < q_high
df = df[below_cutoff]
print("\nShape after removing outliers:", df.shape)



Shape after removing outliers: (260, 20)


In [25]:
# Step 7: Normalize or scale numeric columns
# Work on an explicit copy: df is the result of a boolean row filter, and
# assigning columns to such a slice can trigger pandas' SettingWithCopyWarning.
df = df.copy()

# Select every numeric column and exclude the label-encoded categoricals by name.
# Filtering on the literal dtypes 'int64'/'float64' only skipped those columns
# because the encoder produced int32 in the recorded run; where the encoded
# columns come out as int64 they would have been standardized as well.
num_cols = df.select_dtypes(include='number').columns.difference(cat_cols, sort=False)

# NOTE(review): 'Salary' (the target) is numeric, so it is standardized here too;
# downstream error metrics are therefore in standardized units, not raw salary.
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

print("\nScaled dataset preview:\n")
df.head()



Scaled dataset preview:



Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,-0.61243,-0.602266,-0.527676,-1.217607,-0.523989,-0.095386,1.400258,0.357574,0.186328,0.01732,-0.112325,0.280914,0.444448,1,1,1.241257,-0.522082,0.213087,-0.098305,1
2,0.516825,0.502221,0.731001,0.444634,0.812054,1.626222,-0.898673,-0.448079,-0.404087,-0.058152,-0.409049,-0.188027,0.017689,0,1,2.137358,-0.253764,0.816929,-0.086074,0
3,0.633882,0.750167,0.959851,0.405057,1.047827,-0.188446,0.773277,1.319502,1.342168,1.979586,1.438594,1.624367,0.364431,1,0,-0.319695,-0.742241,-0.843637,-0.03715,1
4,-0.571116,-0.467023,-0.184401,-0.623949,-0.366807,-0.514156,-1.107667,-0.990184,-0.96014,-0.699662,-0.947435,-0.885102,-0.858692,1,0,1.86636,-0.542722,-0.692677,-1.03641,1
5,1.30868,1.381302,-0.870951,0.761252,-0.013149,-0.281506,0.773277,0.780928,0.651788,-0.611612,0.438298,0.033769,-0.245225,0,1,-0.023403,2.078539,2.477495,0.574391,0


In [27]:
# Step 8: Verify final dataset
# Print the per-column dtype / non-null summary followed by the overall
# (rows, columns) shape as a last check on the preprocessing steps above.
print("\nFinal dataset info:\n")
df.info()
print("\nFinal dataset shape:", df.shape)



Final dataset info:

<class 'pandas.core.frame.DataFrame'>
Index: 260 entries, 1 to 321
Data columns (total 20 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AtBat      260 non-null    float64
 1   Hits       260 non-null    float64
 2   HmRun      260 non-null    float64
 3   Runs       260 non-null    float64
 4   RBI        260 non-null    float64
 5   Walks      260 non-null    float64
 6   Years      260 non-null    float64
 7   CAtBat     260 non-null    float64
 8   CHits      260 non-null    float64
 9   CHmRun     260 non-null    float64
 10  CRuns      260 non-null    float64
 11  CRBI       260 non-null    float64
 12  CWalks     260 non-null    float64
 13  League     260 non-null    int32  
 14  Division   260 non-null    int32  
 15  PutOuts    260 non-null    float64
 16  Assists    260 non-null    float64
 17  Errors     260 non-null    float64
 18  Salary     260 non-null    float64
 19  NewLeague  260 non-null    int32 

In [29]:
# Step 1: Split the frame into predictors (X) and the target (y)
target_col = 'Salary'
X = df.drop(columns=target_col)   # every column except the target
y = df[target_col]                # the target column on its own

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (260, 19)
Shape of y: (260,)


In [31]:
# Step 2: Perform feature scaling using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Wrap the scaled array back into a DataFrame, keeping BOTH the column names and
# the original row index. X and y still carry the filtered index of df (labels
# 1..321 with gaps); rebuilding on a fresh 0..n-1 index would leave X_scaled and
# y misaligned for any later index-based operation (concat, arithmetic, .loc).
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# NOTE(review): the scaler is fit on all rows before the train/test split, so the
# test rows contribute to the scaling statistics.
print("\nScaled feature sample:")
print(X_scaled.head())



Scaled feature sample:
      AtBat      Hits     HmRun      Runs       RBI     Walks     Years  \
0 -0.612430 -0.602266 -0.527676 -1.217607 -0.523989 -0.095386  1.400258   
1  0.516825  0.502221  0.731001  0.444634  0.812054  1.626222 -0.898673   
2  0.633882  0.750167  0.959851  0.405057  1.047827 -0.188446  0.773277   
3 -0.571116 -0.467023 -0.184401 -0.623949 -0.366807 -0.514156 -1.107667   
4  1.308680  1.381302 -0.870951  0.761252 -0.013149 -0.281506  0.773277   

     CAtBat     CHits    CHmRun     CRuns      CRBI    CWalks    League  \
0  0.357574  0.186328  0.017320 -0.112325  0.280914  0.444448  1.055377   
1 -0.448079 -0.404087 -0.058152 -0.409049 -0.188027  0.017689 -0.947528   
2  1.319502  1.342168  1.979586  1.438594  1.624367  0.364431  1.055377   
3 -0.990184 -0.960140 -0.699662 -0.947435 -0.885102 -0.858692  1.055377   
4  0.780928  0.651788 -0.611612  0.438298  0.033769 -0.245225 -0.947528   

   Division   PutOuts   Assists    Errors  NewLeague  
0  0.969690  1.2412

In [33]:
# Step 3: Final verification
# Print the shapes of the scaled feature matrix and the target so their row
# counts can be compared before modelling.
print("\nFeature matrix (X) shape:", X_scaled.shape)
print("Target vector (y) shape:", y.shape)
print("\nScaled data summary:\n")
# Per-column summary statistics of the scaled features, rounded to 2 decimals.
print(X_scaled.describe().round(2))



Feature matrix (X) shape: (260, 19)
Target vector (y) shape: (260,)

Scaled data summary:

        AtBat    Hits   HmRun    Runs     RBI   Walks   Years  CAtBat   CHits  \
count  260.00  260.00  260.00  260.00  260.00  260.00  260.00  260.00  260.00   
mean     0.00   -0.00   -0.00    0.00    0.00   -0.00    0.00    0.00    0.00   
std      1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00    1.00   
min     -2.65   -2.34   -1.33   -2.09   -1.90   -1.86   -1.32   -1.16   -1.11   
25%     -0.83   -0.81   -0.76   -0.83   -0.84   -0.84   -0.69   -0.79   -0.79   
50%      0.05   -0.11   -0.30   -0.13   -0.17   -0.19   -0.27   -0.31   -0.32   
75%      0.84    0.75    0.73    0.72    0.77    0.74    0.56    0.53    0.46   
max      1.95    2.94    3.25    2.98    2.74    2.98    3.49    5.04    5.53   

       CHmRun   CRuns    CRBI  CWalks  League  Division  PutOuts  Assists  \
count  260.00  260.00  260.00  260.00  260.00    260.00   260.00   260.00   
mean     0.00    0.00   

In [35]:
# Step 1: Import required libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Step 2: Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)


In [37]:
# Step 3: Linear Regression (unregularized baseline)
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Evaluate on the held-out rows. y is the standardized Salary, so the MSE is in
# standardized units rather than raw salary.
y_pred_lin = lin_reg.predict(X_test)
r2_lin = r2_score(y_test, y_pred_lin)
mse_lin = mean_squared_error(y_test, y_pred_lin)

print("🔹 Linear Regression Results:")
print(f"R² Score: {r2_lin:.4f}")
print(f"Mean Squared Error: {mse_lin:.4f}")


🔹 Linear Regression Results:
R² Score: 0.2849
Mean Squared Error: 0.6784


In [39]:
# Step 4: Ridge Regression (α = 0.5748)
# Same train/test protocol as the linear baseline, with an L2 penalty of fixed strength.
ridge_reg = Ridge(alpha=0.5748)
ridge_reg.fit(X_train, y_train)

y_pred_ridge = ridge_reg.predict(X_test)
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print("\n🔹 Ridge Regression Results (α = 0.5748):")
print(f"R² Score: {r2_ridge:.4f}")
print(f"Mean Squared Error: {mse_ridge:.4f}")



🔹 Ridge Regression Results (α = 0.5748):
R² Score: 0.3409
Mean Squared Error: 0.6253


In [41]:
# Step 5: Lasso Regression (α = 0.5748)
# Same train/test protocol again, this time with an L1 penalty of the same strength.
lasso_reg = Lasso(alpha=0.5748)
lasso_reg.fit(X_train, y_train)

y_pred_lasso = lasso_reg.predict(X_test)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

print("\n🔹 LASSO Regression Results (α = 0.5748):")
print(f"R² Score: {r2_lasso:.4f}")
print(f"Mean Squared Error: {mse_lasso:.4f}")



🔹 LASSO Regression Results (α = 0.5748):
R² Score: 0.0323
Mean Squared Error: 0.9180


In [43]:
# Step 6: Compare all three models
# One row per model, using the test-set metrics computed in the cells above.
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Ridge Regression', 'Lasso Regression'],
    'R2 Score': [r2_lin, r2_ridge, r2_lasso],
    'MSE': [mse_lin, mse_ridge, mse_lasso]
})

print("\n📊 Model Comparison:")
print(results)



📊 Model Comparison:
               Model  R2 Score       MSE
0  Linear Regression  0.284868  0.678413
1   Ridge Regression  0.340907  0.625252
2   Lasso Regression  0.032306  0.918007


In [45]:
#3
# Step 1: Import required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

In [47]:
# Step 2: Fetch the Boston Housing dataset from OpenML as pandas objects
boston = fetch_openml(name="boston", version=1, as_frame=True)
X, y = boston.data, boston.target   # feature table and target column

print("Dataset shape:", X.shape)
X.head()


  warn(


Dataset shape: (506, 13)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33


In [49]:
# Step 3: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Feature scaling (important for regularization).
# The scaling statistics are learned from the training rows only and then
# applied unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [51]:
# Step 5: Define list of alphas to test (50 values, log-spaced from 1e-4 to 1e4)
alphas = np.logspace(-4, 4, 50)

# Step 6: RidgeCV automatically performs cross-validation to select best alpha.
# The per-sample CV values are never read in this notebook, so they are not
# stored; the `store_cv_values` flag is also deprecated in recent scikit-learn
# releases (renamed `store_cv_results`) and fails once removed.
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train_scaled, y_train)

# Step 7: Evaluate performance on the held-out test rows
y_pred_ridge = ridge_cv.predict(X_test_scaled)
r2_ridge = r2_score(y_test, y_pred_ridge)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)

print("🔹 RidgeCV Results:")
print(f"Best Alpha (λ): {ridge_cv.alpha_}")
print(f"R² Score: {r2_ridge:.4f}")
print(f"Mean Squared Error: {mse_ridge:.4f}")


🔹 RidgeCV Results:
Best Alpha (λ): 7.9060432109076855
R² Score: 0.6665
Mean Squared Error: 24.4565


In [53]:
# Step 8: LassoCV automatically selects best alpha via cross-validation
# (5 folds over the same alpha grid used for RidgeCV)
lasso_cv = LassoCV(alphas=alphas, cv=5, max_iter=10000, random_state=42)
lasso_cv.fit(X_train_scaled, y_train)

# Step 9: Evaluate performance on the held-out test rows
y_pred_lasso = lasso_cv.predict(X_test_scaled)
r2_lasso = r2_score(y_test, y_pred_lasso)
mse_lasso = mean_squared_error(y_test, y_pred_lasso)

# NOTE(review): in the recorded run the selected alpha (0.0001) is the smallest
# value in the grid, so the search may be bounded by the grid's lower edge.
print("\n🔹 LassoCV Results:")
print(f"Best Alpha (λ): {lasso_cv.alpha_}")
print(f"R² Score: {r2_lasso:.4f}")
print(f"Mean Squared Error: {mse_lasso:.4f}")



🔹 LassoCV Results:
Best Alpha (λ): 0.0001
R² Score: 0.6688
Mean Squared Error: 24.2915


In [55]:
# Step 10: Compare the two cross-validated models side by side
# (selected alpha plus test-set R² and MSE for each)
results = pd.DataFrame({
    'Model': ['RidgeCV', 'LassoCV'],
    'Best Alpha': [ridge_cv.alpha_, lasso_cv.alpha_],
    'R2 Score': [r2_ridge, r2_lasso],
    'MSE': [mse_ridge, mse_lasso]
})

print("\n📊 Cross-Validation Model Comparison:")
print(results)



📊 Cross-Validation Model Comparison:
     Model  Best Alpha  R2 Score        MSE
0  RidgeCV    7.906043  0.666504  24.456509
1  LassoCV    0.000100  0.668755  24.291456


In [57]:
#4
# Cell 1 — imports & load dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
%matplotlib inline

# Fix NumPy's global RNG seed for reproducibility
np.random.seed(seed=42)


In [59]:
# Cell 2 — prepare data
iris = load_iris()
X = iris.data                # shape (150, 4)
y = iris.target              # 0,1,2

# Train-test split FIRST (stratified so each class keeps its share in both parts)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Standardize features (important for gradient descent).
# The scaler is fit on the training rows only and then applied to the test rows,
# so no test-set statistics leak into training; this matches the split-then-scale
# order used for the Boston data earlier in this notebook.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so existing references to X_scaled still work: the full feature matrix
# transformed with the training-set statistics.
X_scaled = scaler.transform(X)

print("X_train:", X_train.shape, "X_test:", X_test.shape)


X_train: (112, 4) X_test: (38, 4)


In [61]:
# Cell 3 — helper functions for binary logistic regression
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    z: scalar or array-like of real numbers
    returns: value(s) in [0, 1], same shape as z (a NumPy scalar for scalar input)

    exp() is only ever evaluated at -|z|, which cannot overflow, so large
    negative inputs no longer raise an overflow RuntimeWarning.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))            # always in (0, 1]
    out = np.where(z >= 0,
                   1.0 / (1.0 + decay),   # z >= 0: same expression as the textbook form
                   decay / (1.0 + decay)) # z < 0: algebraically equal, overflow-free
    # Indexing with () turns a 0-d result back into a scalar and leaves arrays as-is
    return out[()]

def add_bias(X):
    """Return X with a leading column of ones (the intercept term)."""
    n_rows = X.shape[0]
    intercept = np.ones(n_rows)
    # column_stack treats the 1-D vector of ones as a single column placed before X
    return np.column_stack((intercept, X))


In [63]:
# Cell 4 — binary logistic regression trainer (vectorized gradient descent)
def train_logistic_regression(X, y, lr=0.1, n_iters=2000, reg_lambda=0.0, verbose=False):
    """
    Fit a binary logistic regression with full-batch gradient descent.

    X: (m, n) feature matrix without bias column
    y: (m,) binary labels {0,1} (any array-like)
    lr: gradient-descent step size
    n_iters: number of gradient steps
    reg_lambda: L2 penalty strength; the bias weight is not penalized
    verbose: if True, print the regularized cost roughly five times during training
    returns: weight vector w of shape (n+1,), with w[0] the bias
    """
    X_b = add_bias(X)                # (m, n+1)
    y = np.asarray(y, dtype=float)   # accept lists / Series; .dot below needs an array
    m, n_plus1 = X_b.shape
    w = np.zeros(n_plus1)            # initialize weights

    # Logging interval; clamped to 1 so n_iters < 5 cannot cause a modulo-by-zero
    log_every = max(1, n_iters // 5)

    for i in range(n_iters):
        h = sigmoid(X_b.dot(w))      # (m,) predicted probabilities for current w

        if verbose and i % log_every == 0:
            # Cost is reported BEFORE the update so that the data term (from h)
            # and the penalty term (from w) describe the same weights.
            cost = -(1/m) * (y.dot(np.log(h + 1e-15)) + (1 - y).dot(np.log(1 - h + 1e-15)))
            # add regularization term to cost (exclude bias)
            cost += (reg_lambda / (2 * m)) * np.sum(w[1:]**2)
            print(f"iter {i:5d} cost={cost:.6f}")

        # gradient: (1/m) * X_b.T . (h - y)  +  (reg_lambda/m) * w, bias excluded
        grad = X_b.T.dot(h - y) / m
        reg_term = (reg_lambda / m) * np.r_[0, w[1:]]
        w -= lr * (grad + reg_term)
    return w


In [65]:
# Cell 5 — One-vs-Rest training
classes = np.unique(y_train)

# hyperparameters
learning_rate = 0.1
n_iters = 3000
reg_lambda = 0.01   # small L2 regularization helps numerical stability

# One binary classifier per class: samples of class c are labelled 1, all others 0.
# The resulting weight vectors are stored keyed by class label.
ovr_weights = {
    c: train_logistic_regression(
        X_train,
        (y_train == c).astype(int),
        lr=learning_rate,
        n_iters=n_iters,
        reg_lambda=reg_lambda,
    )
    for c in classes
}

print("Trained OvR classifiers for classes:", list(ovr_weights.keys()))


Trained OvR classifiers for classes: [0, 1, 2]


In [67]:
# Cell 6 — prediction using OvR (choose class with highest probability)
def predict_ovr(X, ovr_weights):
    """
    Predict with a set of one-vs-rest logistic classifiers.

    X: (m, n) feature matrix without bias column
    ovr_weights: dict mapping class label -> weight vector of shape (n+1,)
    returns: (preds, probs)
        preds: (m,) predicted class LABELS (not column positions)
        probs: (m, k) per-class sigmoid scores, columns ordered by sorted label
    raises: ValueError if ovr_weights is empty
    """
    if not ovr_weights:
        raise ValueError("ovr_weights must contain at least one class")

    X_b = add_bias(X)   # (m, n+1)
    class_list = sorted(ovr_weights.keys())
    probs = np.zeros((X_b.shape[0], len(class_list)))
    for idx, c in enumerate(class_list):
        probs[:, idx] = sigmoid(X_b.dot(ovr_weights[c]))

    # argmax yields a column position; map it back to the class label so the
    # result is also correct when labels are not exactly 0..k-1.
    preds = np.asarray(class_list)[np.argmax(probs, axis=1)]
    return preds, probs

y_pred, y_probs = predict_ovr(X_test, ovr_weights)


In [73]:
# Cell 7 — evaluation
# Overall accuracy, confusion matrix, and a per-class precision/recall/F1
# report labelled with the iris species names.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=iris.target_names,
)

print(f"Accuracy (OvR logistic): {acc:.4f}\n")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Accuracy (OvR logistic): 0.8684

Confusion Matrix:
 [[11  1  0]
 [ 0 10  3]
 [ 0  1 12]]

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      0.92      0.96        12
  versicolor       0.83      0.77      0.80        13
   virginica       0.80      0.92      0.86        13

    accuracy                           0.87        38
   macro avg       0.88      0.87      0.87        38
weighted avg       0.87      0.87      0.87        38



In [75]:
# Cell 8 — compare with sklearn's LogisticRegression (for sanity check)
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# The one-vs-rest scheme is expressed with OneVsRestClassifier because the
# `multi_class='ovr'` argument of LogisticRegression is deprecated in recent
# scikit-learn releases. Each class still gets its own binary logistic model
# with the same solver, C and max_iter as before.
# NOTE(review): C=1.0 is a much stronger penalty than the reg_lambda=0.01 used by
# the from-scratch trainer, so the two accuracies are not a like-for-like check.
sk_lr = OneVsRestClassifier(LogisticRegression(solver='lbfgs', C=1.0, max_iter=500))
sk_lr.fit(X_train, y_train)
sk_pred = sk_lr.predict(X_test)
print("Sklearn Logistic (OvR) accuracy:", accuracy_score(y_test, sk_pred))


Sklearn Logistic (OvR) accuracy: 0.8157894736842105
