In [2]:
import numpy as np
import csv
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# For Regression

file_path = "HousingData.csv"

# Initialize an empty list to store the data
data = []

# Open the CSV file and read its contents
with open(file_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    
    # Skip the header row if it exists
    header = next(reader, None)
    
    # Read the data row by row and append to the list
    for row in reader:
        data.append(row)
# data.pop(0)
# print(data[0])

dataset = np.array(data)
print(dataset.shape)

y = dataset[:, -1]  # Extract the last column
X = dataset[:, :-1]  # Remove the last column

print(X.shape)
print(y.shape)

(506, 14)
(506, 13)
(506,)


In [4]:
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        if X[i][j] == 'NA':
            X[i][j] = 0
X = X.astype(float)

In [5]:
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()
print(X[1])
# Fit and transform the imputer on your data
X = scaler1.fit_transform(X)
X = scaler2.fit_transform(X)
print(X[1])

y = y.astype(float)
print(y[0])

[2.7310e-02 0.0000e+00 7.0700e+00 0.0000e+00 4.6900e-01 6.4210e+00
 7.8900e+01 4.9671e+00 2.0000e+00 2.4200e+02 1.7800e+01 3.9690e+02
 9.1400e+00]
[-0.40178501 -0.46815955 -0.50849056 -0.26839132 -0.74026221  0.19427445
  0.42930593  0.55715988 -0.8678825  -0.98732948 -0.30309415  0.44105193
 -0.41351877]
24.0


In [46]:
# Split the data into train, validation, and test sets
X_train_r, X_temp, y_train_r, y_temp = train_test_split(X, y, test_size=0.2)

# Further split the remaining data into validation and test sets
X_valid_r, X_test_r, y_valid_r, y_test_r = train_test_split(X_temp, y_temp, test_size=0.5)

In [70]:
## training and testing accuracy of the model

decisiontreeclassifier = DecisionTreeRegressor()
decisiontreeclassifier.fit(X_train_r,y_train_r)

y_pred = decisiontreeclassifier.predict(X_test_r)

mse_DT = mean_squared_error(y_test_r, y_pred)
print(f"Mean Squared Error Test: {mse_DT:.2f}")

Mean Squared Error Test: 16.01


In [96]:
### Random Forest Regressor class

class RandomForestRegressor():
    def __init__(self,n_estimators,sample_fraction,bootstrap,voting):
        self.n_estimators = n_estimators
        self.sample_fraction = sample_fraction
        self.bootstrap = bootstrap
        self.voting = voting
    
    def fit(self,X,y):
        self.forest = []
        num_samples = X.shape[0]
        self.selected_features = []

        if self.bootstrap:
            max_num_of_estimators = int(1/self.sample_fraction)
            if self.n_estimators > max_num_of_estimators:
                self.n_estimators = max_num_of_estimators

        for i in range(self.n_estimators):
            if self.bootstrap:
                sample_indices = np.random.choice(num_samples, size=int(self.sample_fraction * num_samples), replace=True)
            else:
                sample_indices = np.random.choice(num_samples, size=int(self.sample_fraction * num_samples), replace=False)

            X_sampled = X[sample_indices]
            y_sampled = y[sample_indices]

            # Randomly select K features
            selected_features = np.random.choice(X.shape[1], size=13, replace=False)
            X_sampled = X_sampled[:,selected_features]
            self.selected_features.append(selected_features)

            model = DecisionTreeRegressor()
            model.fit(X_sampled, y_sampled)
            self.forest.append(model)
    
    def get_selected_features(self):
        return [tree.tree_.feature for tree in self.forest]

    def predict(self, X, X_val, y_val):
        # predictions = np.array([tree.predict(X[:, selected_features]) for tree, selected_features in zip(self.forest, self.get_selected_features())])
        
        predictions = []
        for i in range(self.n_estimators):
            predictions.append(self.forest[i].predict(X[:,self.selected_features[i]]))
        predictions = np.array(predictions)


        # Finding confidence value for each model using X_val and y_val
        y_preds_val = []
        for i in range(self.n_estimators):
            y_preds_val.append(self.forest[i].predict(X_val[:,self.selected_features[i]]))
        y_preds_val = np.array(y_preds_val)
        y_preds_val = np.reshape(y_preds_val,(len(self.forest),X_val.shape[0]))
        mses = np.mean((y_preds_val - y_val)**2, axis=1)
        self.confidence = 1 / mses
        normalizer = np.sum(self.confidence)

        if self.voting == 'hard':
            predictions_final = np.round(predictions.mean(axis=0))
        elif self.voting == 'soft':
            predictions = np.reshape(predictions,(X.shape[0],len(self.forest)))
            predictions_final = np.sum(predictions * self.confidence, axis=1) / normalizer

        return predictions_final


In [105]:
import time
n_estimators = [5,10,15,20]
sampling_fraction = [0.05,0.10,0.15, 0.25, 0.5, 0.75, 1.0]
bootstrap = [True,False]
voting = ['hard','soft']

mse_results = []

for i in range(1):
    for n_est in n_estimators:
        for s_frac in sampling_fraction:
            for bootstrp in bootstrap:
                for votin_mech in voting:
                    ensemble_model = RandomForestRegressor(n_est,s_frac,bootstrap,votin_mech)
                    start_time = time.time()
                    ensemble_model.fit(X_train_r,y_train_r)
                    end_time = time.time()
                    predictions = ensemble_model.predict(X_test_r,X_valid_r,y_valid_r)
                    mse = np.mean((y_test_r-predictions)**2)
                    mse_results.append({
                        'Number of Estimators': n_est,
                        'Sampling Fraction': s_frac,
                        'Bootstrap': bootstrp,
                        'Voting Mechanism': votin_mech,
                        'MSE': mse,
                        'Training Time':end_time - start_time
                    })

In [106]:
# Sort the results based on MSE and print the top three
sorted_results = sorted(mse_results, key=lambda x: x['MSE'])
top_three_results = sorted_results[:3]

print("Top Three Best-Performing Models:")
for i, result in enumerate(top_three_results, start=1):
    print(f"{i}. MSE: {result['MSE']:.4f} - {result}")

Top Three Best-Performing Models:
1. MSE: 6.9731 - {'Number of Estimators': 5, 'Sampling Fraction': 0.25, 'Bootstrap': False, 'Voting Mechanism': 'hard', 'MSE': 6.9731372549019595, 'Training Time': 0.002171039581298828}
2. MSE: 7.5627 - {'Number of Estimators': 10, 'Sampling Fraction': 1.0, 'Bootstrap': False, 'Voting Mechanism': 'soft', 'MSE': 7.562745098039212, 'Training Time': 0.0014896392822265625}
3. MSE: 8.0673 - {'Number of Estimators': 5, 'Sampling Fraction': 0.5, 'Bootstrap': True, 'Voting Mechanism': 'hard', 'MSE': 8.067254901960785, 'Training Time': 0.0014524459838867188}


In [107]:
# For Classification

file_path = 'WineQT.csv'

# Initialize an empty list to store the data
data = []

# Open the CSV file and read its contents
with open(file_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    
    # Skip the header row if it exists
    header = next(reader, None)
    
    # Read the data row by row and append to the list
    for row in reader:
        data.append(row)

dataset = np.array(data)
print(dataset.shape)
dataset = dataset[:,:-1]
print(dataset[0])
# print(dataset[0])

X = dataset[:,:-1]
y = dataset[:,-1]
y = np.reshape(y,(1143,1))
X = X.astype('float64')
y = y.astype('float64')
print(X.shape)
print(y.shape)
print("==============")
print("Input: ",X[0])
print("Output: ",y[0])
print("==============")

(1143, 13)
['7.4' '0.7' '0.0' '1.9' '0.076' '11.0' '34.0' '0.9978' '3.51' '0.56'
 '9.4' '5']
(1143, 11)
(1143, 1)
Input:  [ 7.4     0.7     0.      1.9     0.076  11.     34.      0.9978  3.51
  0.56    9.4   ]
Output:  [5.]


In [108]:
for i in range(y.shape[0]):
    if (y[i] == 3 or y[i] == 4 or y[i] == 5):
        y[i] = 0
    else:
        y[i] = 1

In [109]:
unique_labels = np.unique(y)
print(unique_labels)

label_binarizer = LabelBinarizer()

# Fit and transform the labels
one_hot_encoded_labels = label_binarizer.fit_transform(y)
print(y[0])
print(one_hot_encoded_labels[0])

[0. 1.]
[0.]
[0]


In [111]:
imputer = SimpleImputer(strategy="mean")
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()
print(X[1])
# Fit and transform the imputer on your data
X= imputer.fit_transform(X)
X = scaler1.fit_transform(X)
X = scaler2.fit_transform(X)
print(X[1])

[ 7.8     0.88    0.      2.6     0.098  25.     67.      0.9968  3.2
  0.68    9.8   ]
[-0.29259344  1.94181282 -1.36502663  0.05006018  0.23424656  0.91591972
  0.64347653  0.03616459 -0.70892755  0.1308811  -0.59360107]


In [113]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2)

# Further split the remaining data into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5)
print(y_train[0])

[0.]


In [159]:
## training and testing accuracy of the model

decisiontreeclassifier = DecisionTreeClassifier()
decisiontreeclassifier.fit(X_train,y_train)

y_pred = decisiontreeclassifier.predict(X_test)

accuracy_DT = accuracy_score(y_test, y_pred)
print(f"Accuracy Test: {accuracy_DT:.2f}")

y_pred = decisiontreeclassifier.predict(X_valid)

accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy Valid: {accuracy:.2f}")

pred_prob = decisiontreeclassifier.predict_proba(X_valid)
print(pred_prob.shape)

Accuracy Test: 0.75
Accuracy Valid: 0.73
(114, 2)


In [177]:
# Random Forest Classifier class

### Random Forest Regressor class

class RandomForestClassifier():
    def __init__(self,n_estimators,sample_fraction,bootstrap,voting):
        self.n_estimators = n_estimators
        self.sample_fraction = sample_fraction
        self.bootstrap = bootstrap
        self.voting = voting

    def fit(self,X,y):
        self.forest = []
        num_samples = X.shape[0]
        self.selected_features = []

        if self.bootstrap:
            max_num_of_estimators = int(1/self.sample_fraction)
            if self.n_estimators > max_num_of_estimators:
                self.n_estimators = max_num_of_estimators

        for i in range(self.n_estimators):
            if self.bootstrap:
                sample_indices = np.random.choice(num_samples, size=int(self.sample_fraction * num_samples), replace=True)
            else:
                sample_indices = np.random.choice(num_samples, size=int(self.sample_fraction * num_samples), replace=False)

            X_sampled = X[sample_indices]
            y_sampled = y[sample_indices]

            # Randomly select K features
            selected_features = np.random.choice(X.shape[1], size=11, replace=False)
            X_sampled = X_sampled[:,selected_features]
            self.selected_features.append(selected_features)

            model = DecisionTreeClassifier()
            model.fit(X_sampled, y_sampled)
            self.forest.append(model)
    
    def get_selected_features(self):
        return [tree.tree_.feature for tree in self.forest]
    
    def most_frequent_number(self,row):
        return np.argmax(np.bincount(row))

    def predict(self, X):
        # predictions = np.array([tree.predict(X[:, selected_features]) for tree, selected_features in zip(self.forest, self.get_selected_features())])
        
        predictions = []
        for i in range(self.n_estimators):
            predictions.append(self.forest[i].predict(X[:,self.selected_features[i]]))
        predictions = np.array(predictions)
        predictions = predictions.astype(int)

        if self.voting == 'hard':
            predictions = np.reshape(predictions,(X.shape[0],len(self.forest)))
            predictions_final = np.apply_along_axis(self.most_frequent_number, axis=1, arr=predictions)
        elif self.voting == 'soft':
            predictions_prob = []
            for model in self.forest:
                predictions_prob.append(model.predict_proba(X))
            predictions_prob = np.array(predictions_prob)
            predictions_prob = np.reshape(predictions_prob,(predictions_prob.shape[1],predictions_prob.shape[0],predictions_prob.shape[2]))
            predictions = np.argmax(np.sum(predictions_prob,axis=1),axis=1)
            return predictions

        return predictions_final

In [180]:
n_estimators = [5,10,15,20]
sampling_fraction = [0.05,0.10,0.15, 0.25, 0.5, 0.75, 1.0]
bootstrap = [True,False]
voting = ['hard','soft']

acc_results = []

for i in range(1):
    for n_est in n_estimators:
        for s_frac in sampling_fraction:
            for bootstrp in bootstrap:
                for votin_mech in voting:
                    ensemble_model = RandomForestClassifier(n_est,s_frac,bootstrap,votin_mech)
                    start_time = time.time()
                    ensemble_model.fit(X_train,y_train)
                    end_time = time.time()
                    predictions = ensemble_model.predict(X_test)
                    accuracy = accuracy_score(predictions,y_test)
                    acc_results.append({
                        'Number of Estimators': n_est,
                        'Sampling Fraction': s_frac,
                        'Bootstrap': bootstrp,
                        'Voting Mechanism': votin_mech,
                        'Accuracy': accuracy,
                        'Training time': end_time - start_time
                    })

In [181]:
# Sort the results based on MSE and print the top three
sorted_results = sorted(acc_results, key=lambda x: x['Accuracy'],reverse=True)
top_three_results = sorted_results[:3]

print("Top Three Best-Performing Models:")
for i, result in enumerate(top_three_results, start=1):
    print(f"{i}. Accuracy: {result['Accuracy']:.4f} - {result}")

Top Three Best-Performing Models:
1. Accuracy: 0.8000 - {'Number of Estimators': 15, 'Sampling Fraction': 0.75, 'Bootstrap': False, 'Voting Mechanism': 'hard', 'Accuracy': 0.8, 'Training time': 0.002920389175415039}
2. Accuracy: 0.7565 - {'Number of Estimators': 5, 'Sampling Fraction': 1.0, 'Bootstrap': False, 'Voting Mechanism': 'hard', 'Accuracy': 0.7565217391304347, 'Training time': 0.002920866012573242}
3. Accuracy: 0.7217 - {'Number of Estimators': 10, 'Sampling Fraction': 1.0, 'Bootstrap': True, 'Voting Mechanism': 'hard', 'Accuracy': 0.7217391304347827, 'Training time': 0.003142833709716797}


# Boosted Trees