In [1]:
import pandas as pd
import numpy as np

class StandardScaler2:
    """
    Standardize features column by column: subtract the mean, divide by the
    standard deviation.

    `mean_` and `scale_` are None until `fit` has been called. Features whose
    standard deviation is exactly zero (constant columns) are given a scale of
    1.0 so that transforming them yields zeros instead of NaN/inf.
    """

    def __init__(self):
        self.mean_ = None   # per-feature mean, set by fit()
        self.scale_ = None  # per-feature divisor, set by fit()

    def fit(self, X):
        """
        Compute the mean and standard deviation for scaling.

        Parameters:
        - X: 2-D NumPy array or pandas DataFrame; statistics are taken along axis 0.

        Returns:
        - self, so calls can be chained.
        """
        self.mean_ = np.mean(X, axis=0)  # Mean of each feature
        scale = np.std(X, axis=0)        # Standard deviation of each feature

        # A constant feature has zero spread; dividing by it would turn the
        # whole column into NaN (0/0). Use a neutral divisor of 1.0 instead,
        # so the centred column simply becomes all zeros.
        if isinstance(scale, pd.Series):
            # Keep the Series (and its column labels) so DataFrame inputs
            # still align by column name in transform().
            scale = scale.where(scale != 0, 1.0)
        else:
            scale = np.where(scale == 0, 1.0, scale)

        self.scale_ = scale
        return self

    def transform(self, X):
        """
        Perform standardization by centering and scaling.

        Raises:
        - ValueError: if the scaler has not been fitted.
        """
        if self.mean_ is None or self.scale_ is None:
            raise ValueError("The scaler has not been fitted yet. Call `fit` first.")
        return (X - self.mean_) / self.scale_

    def fit_transform(self, X):
        """
        Fit to data, then transform it.
        """
        return self.fit(X).transform(X)

In [2]:
class DecisionTreeRegressor:
    """
    Regression tree grown by exhaustive search over (feature, threshold) splits.

    Internal nodes are dicts with keys 'feature', 'threshold', 'left', 'right';
    leaves are the mean target of the samples that reached them. Samples with
    `value < threshold` go left, all others go right.

    Parameters:
    - max_depth: Maximum depth of the tree; None means no limit. A value of 0
      produces a single leaf (the mean of y).
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """
        Train the decision tree regressor.

        Parameters:
        - X: Training features (n_samples, n_features); DataFrame or array-like.
        - y: Target values (n_samples,); Series or array-like.

        Returns:
        - self, so calls can be chained.

        Raises:
        - ValueError: if X is not 2-D or X and y disagree on the sample count.
        """
        # Work on plain float arrays: this accepts both pandas and NumPy
        # inputs, avoids any dependence on the pandas index, and makes the
        # repeated boolean masking in the split search much cheaper.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array-like of shape (n_samples, n_features).")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        self.tree = self._build_tree(X, y)
        return self

    def _build_tree(self, X, y, depth=0):
        """
        Recursively build the decision tree from NumPy arrays X and y.
        """
        n_samples = X.shape[0]
        # Compare against None explicitly so that max_depth=0 is honoured
        # instead of being treated as "no limit".
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if n_samples == 0 or depth_exhausted:
            return np.mean(y)

        best_split = self._find_best_split(X, y)
        if best_split is None:
            # No threshold separates the samples (every feature is constant here).
            return np.mean(y)

        column = X[:, best_split['feature']]
        goes_left = column < best_split['threshold']
        goes_right = column >= best_split['threshold']

        return {
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': self._build_tree(X[goes_left], y[goes_left], depth + 1),
            'right': self._build_tree(X[goes_right], y[goes_right], depth + 1)
        }

    def _find_best_split(self, X, y):
        """
        Find the best feature and threshold to split the data.

        Returns a dict {'feature': int, 'threshold': value} minimising the
        summed squared error of the two children, or None when no candidate
        leaves both children non-empty.
        """
        n_features = X.shape[1]
        best_mse = float('inf')
        best_split = None

        for feature in range(n_features):
            column = X[:, feature]
            for threshold in np.unique(column):
                left_indices = column < threshold
                right_indices = column >= threshold

                # A split must put at least one sample on each side.
                if not left_indices.any() or not right_indices.any():
                    continue

                mse = self._calculate_mse(y, left_indices, right_indices)
                # Strict '<' keeps the first-found split on ties.
                if mse < best_mse:
                    best_mse = mse
                    best_split = {'feature': feature, 'threshold': threshold}

        return best_split

    def _calculate_mse(self, y, left_indices, right_indices):
        """
        Score a split: each child's MSE weighted by its sample count
        (i.e. the total squared error around the two child means).
        """
        y_left = y[left_indices]
        y_right = y[right_indices]
        left_mse = np.mean((y_left - np.mean(y_left)) ** 2)
        right_mse = np.mean((y_right - np.mean(y_right)) ** 2)
        return left_mse * len(y_left) + right_mse * len(y_right)

    def predict(self, X):
        """
        Predict target values for samples in X.

        Parameters:
        - X: DataFrame or iterable of samples indexable by feature position.

        Returns:
        - NumPy array with one prediction per sample.

        Raises:
        - ValueError: if the model has not been fitted.
        """
        if self.tree is None:
            raise ValueError("Model is not trained. Call `fit` first.")

        # DataFrame rows must be converted so features can be indexed by position.
        rows = X.to_numpy() if isinstance(X, pd.DataFrame) else X
        return np.array([self._predict_sample(row, self.tree) for row in rows])

    def _predict_sample(self, sample, node):
        """
        Predict the target value for a single sample by walking from `node`
        down to a leaf.
        """
        while isinstance(node, dict):
            if sample[node['feature']] < node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

In [3]:
def train_test_split_2(X, y, test_size=0.2, random_state=None):
    """
    Shuffle the samples and split them into train and test subsets.

    Parameters:
    - X: Features. pandas objects are sliced by position with `.iloc`;
      anything else is converted with `np.asarray` and indexed by row.
    - y: Targets aligned row-for-row with X (pandas or array-like).
    - test_size: Fraction of samples placed in the test split; the test
      count is `int(n_samples * test_size)`.
    - random_state: Optional integer seed. Any non-None value (including 0)
      makes the split reproducible. The global NumPy random state is not
      reseeded.

    Returns:
    - X_train, X_test, y_train, y_test

    Raises:
    - ValueError: if X and y contain different numbers of samples.
    """
    import numpy as np

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples.")

    # `is not None` so that a seed of 0 is honoured. A private RandomState
    # yields the same permutation as seeding the global generator would,
    # without changing random state for unrelated code.
    if random_state is not None:
        rng = np.random.RandomState(random_state)
    else:
        rng = np.random

    # Generate shuffled indices
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # The first n_test shuffled positions form the test set, the rest train.
    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    def _take(data, rows):
        # Positional row selection for pandas objects and plain arrays alike.
        if hasattr(data, "iloc"):
            return data.iloc[rows]
        return np.asarray(data)[rows]

    return (
        _take(X, train_indices),
        _take(X, test_indices),
        _take(y, train_indices),
        _take(y, test_indices),
    )

In [4]:

import pandas as pd

# Load data
data = pd.read_csv("house.csv")

# Prepare features and target
X = data.drop(columns=['date', 'price', 'country'])
y = data['price']

# Encode categorical columns: each object-typed column is replaced by integer labels
from sklearn.preprocessing import LabelEncoder
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
X = pd.get_dummies(X, drop_first=True)

# Split data BEFORE scaling so the scaler never sees the test rows.
# (The split depends only on the row count and seed, so the same rows land
# in train/test as when splitting after scaling.)
X_train, X_test, y_train, y_test = train_test_split_2(X, y, random_state=50)

# Scale features: fit mean/std on the training rows only, then apply the same
# statistics to the test rows. Fitting on the full dataset would leak test-set
# information into the features the models are trained on.
sc = StandardScaler2()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)



In [5]:
# Train model: fit the custom DecisionTreeRegressor, capped at depth 2, on the training split
DT_reg = DecisionTreeRegressor(max_depth=2)
DT_reg.fit(X_train, y_train)

In [None]:
# Predict: run every row of the test features through the fitted tree
ypred = DT_reg.predict(X_test)

In [None]:
# Evaluate the decision tree's test predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply each metric to (y_test, ypred), in the order they are reported below.
r2, mae, mse = (
    metric(y_test, ypred)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print("R-squared Score:", r2)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)

In [None]:
class LinearRegression:
    """
    Linear regression trained with batch gradient descent on the mean squared error.

    Parameters:
    - learning_rate: Step size applied to each gradient update.
    - n_iterations: Number of full-batch gradient steps performed by `fit`.
    """

    def __init__(self, learning_rate=0.01, n_iterations=10):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Fit weights and bias by gradient descent, starting from zeros.

        Parameters:
        - X: Training features (n_samples, n_features); DataFrame or array-like.
        - y: Target values; Series or array-like. A column vector of shape
          (n_samples, 1) is flattened to (n_samples,).

        Returns:
        - self, so calls can be chained.

        Raises:
        - ValueError: if X is not 2-D or X and y disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: a (n, 1) target would otherwise broadcast against the
        # (n,) predictions into an (n, n) residual matrix and break the update.
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array-like of shape (n_samples, n_features).")

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples.")

        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.n_iterations):
            y_pred = np.dot(X, self.weights) + self.bias
            residual = y_pred - y  # computed once, reused for both gradients and the log

            # Gradient terms as used by the update below.
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Print progress (optional): MSE is from before this step's
            # update, weights/bias are the freshly updated values.
            if i % 100 == 0:
                mse = np.mean(residual ** 2)
                print(f"Iteration {i}, MSE: {mse}, Weights: {self.weights}, Bias: {self.bias}")

        return self

    def predict(self, X):
        """
        Return X · weights + bias for each row of X.

        Raises:
        - ValueError: if the model has not been fitted.
        """
        if self.weights is None or self.bias is None:
            raise ValueError("Model is not trained. Call `fit` first.")
        return np.dot(X, self.weights) + self.bias

In [None]:
# Instantiate the gradient-descent linear model with its defaults
# (learning_rate=0.01, n_iterations=10).
# NOTE(review): 10 gradient steps at this step size is unlikely to get close to
# convergence on unscaled targets — confirm the defaults are intended here.
LR=LinearRegression()

In [None]:
# Fit the linear model on the training split (prints progress for iteration 0 and every 100th after).
LR.fit(X_train,y_train)

In [None]:
# Linear-model predictions for the test features.
ypred_LR=LR.predict(X_test)

In [None]:
# Evaluate the linear regression's test predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply each metric to (y_test, ypred_LR), in the order they are reported below.
r2_LR, mae_LR, mse_LR = (
    metric(y_test, ypred_LR)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print("R-squared Score:", r2_LR)
print("Mean Absolute Error:", mae_LR)
print("Mean Squared Error:", mse_LR)

In [None]:
import numpy as np
import pandas as pd

class KNNRegressor:
    """
    k-nearest-neighbours regressor using Euclidean distance.

    A prediction is the unweighted mean target of the k training points
    closest to the query point.

    Parameters:
    - k: Number of neighbours to average over.
    """

    def __init__(self, k=5):
        self.k = k
        self.X_train = None
        self.y_train = None

    @staticmethod
    def _as_numeric_array(data, frame_type, arg_name, type_label, data_label):
        """
        Validate that `data` is numeric and return it as a NumPy array.

        `frame_type` is the accepted pandas container (DataFrame or Series);
        the remaining arguments only fill in the error messages.
        """
        if isinstance(data, frame_type):
            # A DataFrame has one dtype per column, a Series a single dtype.
            dtypes = data.dtypes if isinstance(data, pd.DataFrame) else [data.dtype]
            if not all(np.issubdtype(dtype, np.number) for dtype in dtypes):
                raise ValueError(f"{data_label} data contains non-numeric values.")
            return data.values
        if isinstance(data, np.ndarray):
            if not np.issubdtype(data.dtype, np.number):
                raise ValueError(f"{data_label} data contains non-numeric values.")
            return data
        raise TypeError(f"{arg_name} must be either a Pandas {type_label} or a NumPy array.")

    def fit(self, X_train, y_train):
        """
        Store the training data.

        Parameters:
        - X_train: Numeric features; pandas DataFrame or NumPy array.
        - y_train: Numeric targets; pandas Series or NumPy array.

        Returns:
        - self, so calls can be chained.

        Raises:
        - TypeError: if an input is not one of the accepted container types.
        - ValueError: if an input is non-numeric, k is smaller than 1, or
          X_train and y_train disagree on the sample count.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1.")

        X_train = self._as_numeric_array(X_train, pd.DataFrame, "X_train", "DataFrame", "Training")
        y_train = self._as_numeric_array(y_train, pd.Series, "y_train", "Series", "Target")
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must contain the same number of samples.")

        self.X_train = X_train
        self.y_train = y_train
        return self

    def predict(self, X_test):
        """
        Predict the target values for the test data.

        Parameters:
        - X_test: Numeric features; pandas DataFrame or NumPy array.

        Returns:
        - NumPy array with one prediction per row of X_test.

        Raises:
        - ValueError: if the model has not been fitted or X_test is non-numeric.
        - TypeError: if X_test is not a DataFrame or NumPy array.
        """
        if self.X_train is None or self.y_train is None:
            raise ValueError("Model is not trained. Call `fit` first.")

        X_test = self._as_numeric_array(X_test, pd.DataFrame, "X_test", "DataFrame", "Test")
        return np.array([self._predict(x) for x in X_test])

    def _predict(self, x):
        """
        Predict the target value for a single query point.
        """
        # One vectorised call gives the distance from x to every training row.
        distances = self._euclidean_distance(self.X_train, x)

        # Indices of the k smallest distances (fewer if the training set is smaller than k).
        k_indices = np.argsort(distances)[:self.k]

        # Average the neighbours' targets.
        return np.mean(self.y_train[k_indices])

    def _euclidean_distance(self, x1, x2):
        """
        Calculate the Euclidean distance between two points.

        Also accepts a 2-D `x1` (one point per row), in which case the result
        is the vector of distances from each row to `x2`.
        """
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=-1))

In [None]:
# Instantiate the KNN regressor with its default k=5.
knn=KNNRegressor()

In [None]:
# Replace missing feature values in the training set with 0.
# NOTE(review): DT_reg and LR were fitted in earlier cells, i.e. on X_train
# before this fill — confirm whether X_train contained NaN at that point.
X_train = X_train.fillna(0)

In [None]:
# Give X_test exactly the columns (and column order) of X_train; columns absent from X_test are created filled with 0.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Store the (NaN-filled) training split inside the KNN regressor.
knn.fit(X_train,y_train)

In [None]:
# Display X_test before its missing values are filled (presumably to inspect for NaN — confirm).
X_test

In [None]:
# Replace missing feature values in the test set with 0, mirroring the fill applied to X_train.
X_test = X_test.fillna(0)

In [None]:
# Display X_test again, now after the NaN fill.
X_test

In [None]:
# KNN predictions for the test features (one neighbour search per test row).
ypred_knn=knn.predict(X_test)

In [None]:
# Evaluate the KNN regressor's test predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply each metric to (y_test, ypred_knn), in the order they are reported below.
r2_knn, mae_knn, mse_knn = (
    metric(y_test, ypred_knn)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print("R-squared Score:", r2_knn)
print("Mean Absolute Error:", mae_knn)
print("Mean Squared Error:", mse_knn)

In [23]:
import numpy as np
import pandas as pd
from scipy.stats import norm

class GaussianNaiveBayesRegressor:
    def __init__(self, epsilon=1e-9, min_prob=1e-9):
        self.mean_ = None  # Mean of each feature for each target value
        self.var_ = None   # Variance of each feature for each target value
        self.target_values_ = None  # Unique target values
        self.epsilon = epsilon  # Small constant to avoid division by zero
        self.min_prob = min_prob  # Minimum probability to avoid log(0)

    def fit(self, X, y):
        """
        Fit the Gaussian Naive Bayes model to the training data.
        """
        # Convert inputs to NumPy arrays if they are pandas DataFrames/Series
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, pd.Series):
            y = y.values

        # Get unique target values
        self.target_values_ = np.unique(y)

        # Initialize arrays to store mean and variance for each feature and target value
        n_targets = len(self.target_values_)
        n_features = X.shape[1]
        self.mean_ = np.zeros((n_targets, n_features))
        self.var_ = np.zeros((n_targets, n_features))

        # Calculate mean and variance for each target value
        for i, target in enumerate(self.target_values_):
            X_target = X[y == target]  # Filter data for the current target value
            self.mean_[i, :] = X_target.mean(axis=0)  # Mean of each feature
            self.var_[i, :] = X_target.var(axis=0)    # Variance of each feature

            # Add epsilon to avoid zero variance
            self.var_[i, :] += self.epsilon

    def predict(self, X):
        """
        Predict the target values for the test data.
        """
        # Convert input to NumPy array if it is a pandas DataFrame
        if isinstance(X, pd.DataFrame):
            X = X.values

        # Initialize array to store predictions
        y_pred = np.zeros(X.shape[0])

        # Predict for each sample in X
        for i, x in enumerate(X):
            likelihoods = []

            # Calculate likelihood for each target value
            for j, target in enumerate(self.target_values_):
                # Calculate Gaussian PDF and clip values to avoid log(0)
                pdf_values = norm.pdf(x, self.mean_[j, :], np.sqrt(self.var_[j, :]))
                pdf_values = np.clip(pdf_values, self.min_prob, None)  # Clip to avoid extremely small values
                likelihood = np.sum(np.log(pdf_values))  # Log likelihood
                likelihoods.append(likelihood)

            # Assign the target value with the highest likelihood
            y_pred[i] = self.target_values_[np.argmax(likelihoods)]

        return y_pred

In [24]:
# Instantiate the Gaussian Naive Bayes model with its defaults (epsilon=1e-9, min_prob=1e-9).
NB=GaussianNaiveBayesRegressor()

In [25]:
# Estimate per-target feature means and variances from the training split.
NB.fit(X_train,y_train)

In [None]:
# Naive Bayes predictions: for each test row, the training target value with the highest log-likelihood.
ypred_NB=NB.predict(X_test)

In [None]:
# Evaluate the Naive Bayes model's test predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply each metric to (y_test, ypred_NB), in the order they are reported below.
r2_NB, mae_NB, mse_NB = (
    metric(y_test, ypred_NB)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print("R-squared Score:", r2_NB)
print("Mean Absolute Error:", mae_NB)
print("Mean Squared Error:", mse_NB)

In [None]:
import numpy as np

class SVMRegressor:
    """
    Linear support-vector regressor trained by per-sample subgradient descent
    on an L2-regularised epsilon-insensitive loss. Predictions are
    `X · weights + bias`.
    """

    def __init__(self, learning_rate=0.01, lambda_param=0.01, epsilon=0.1, n_iters=1000):
        """
        Initialize the SVM Regressor.

        Parameters:
        - learning_rate: Step size for gradient descent.
        - lambda_param: Regularization parameter.
        - epsilon: Threshold for epsilon-insensitive loss.
        - n_iters: Number of iterations (full passes over the data) for training.
        """
        self.learning_rate = learning_rate
        self.lambda_param = lambda_param
        self.epsilon = epsilon
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """
        Train the SVM Regressor using Stochastic Gradient Descent (SGD).

        Samples are visited in their given order on every pass.

        Parameters:
        - X: Training features (n_samples, n_features). Can be a NumPy array or pandas DataFrame.
        - y: Target values (n_samples,). Can be a NumPy array or pandas Series.

        Returns:
        - self, so calls can be chained.
        """
        # np.asarray handles pandas objects as well as arrays and sequences.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        n_samples, n_features = X.shape

        # Initialize weights and bias
        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iters):
            for i in range(n_samples):
                # Residual of the current model on sample i.
                y_pred = np.dot(X[i], self.weights) + self.bias
                error = y[i] - y_pred

                # The regularisation term always shrinks the weights.
                reg_grad = 2 * self.lambda_param * self.weights

                if abs(error) <= self.epsilon:
                    # Inside the epsilon tube: no loss contribution.
                    self.weights -= self.learning_rate * reg_grad
                else:
                    # Outside the tube the loss is |error| - epsilon, whose
                    # gradient is -sign(error) * x for the weights and
                    # -sign(error) for the bias.
                    direction = np.sign(error)
                    self.weights -= self.learning_rate * (reg_grad - direction * X[i])
                    # Descent step on the bias: move it TOWARDS the target.
                    # (Subtracting here would push predictions away from y.)
                    self.bias += self.learning_rate * direction

        return self

    def predict(self, X):
        """
        Predict the target values for the test data.

        Parameters:
        - X: Test features (n_samples, n_features). Can be a NumPy array or pandas DataFrame.

        Returns:
        - Predicted values (n_samples,).

        Raises:
        - ValueError: if the model has not been fitted.
        """
        if self.weights is None or self.bias is None:
            raise ValueError("Model is not trained. Call `fit` first.")

        X = np.asarray(X, dtype=float)
        return np.dot(X, self.weights) + self.bias

In [None]:
# Instantiate the SVM regressor with its defaults
# (learning_rate=0.01, lambda_param=0.01, epsilon=0.1, n_iters=1000).
svm=SVMRegressor()

In [None]:
# Train on the training split. fit() performs n_iters passes with one Python-level
# update per sample, so this cell may take a while on larger data.
svm.fit(X_train,y_train)

In [None]:
# SVM predictions for the test features.
ypred_svm=svm.predict(X_test)

In [None]:
# Evaluate the SVM regressor's test predictions
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Apply each metric to (y_test, ypred_svm), in the order they are reported below.
r2_svm, mae_svm, mse_svm = (
    metric(y_test, ypred_svm)
    for metric in (r2_score, mean_absolute_error, mean_squared_error)
)

print("R-squared Score:", r2_svm)
print("Mean Absolute Error:", mae_svm)
print("Mean Squared Error:", mse_svm)