## Question 1

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import mode


iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
n_trees = 10
predictions = []

for i in range(n_trees):
    n_samples = int(0.5 * X_train.shape[0]) # bootstrap sample size , column
    sample_indices = np.random.choice(X_train.shape[0], n_samples, replace=True)
    X_sample = X_train[sample_indices]
    y_sample = y_train[sample_indices]

    n_features = int(0.5 * X_train.shape[1])# number of features to select
    feature_indices = np.random.choice(X_train.shape[1], n_features, replace=False)
    X_sample_subset = X_sample[:, feature_indices]
    X_test_subset = X_test[:, feature_indices]

    tree = DecisionTreeClassifier()
    tree.fit(X_sample_subset, y_sample)

    y_pred = tree.predict(X_test_subset)
    predictions.append(y_pred)

    tree_acc = accuracy_score(y_test, y_pred)
    print("Tree,", i+1, "Accuracy:", tree_acc)

predictions = np.array(predictions)
final_predictions, _ = mode(predictions, axis=0)
final_predictions = final_predictions.flatten()

custom_accuracy = accuracy_score(y_test, final_predictions)
print('Custom Random Forest Accuracy:', custom_accuracy)


# Comparing with sklearn's RandomForestClassifier
rf = RandomForestClassifier(n_estimators=n_trees, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f'Sklearn Random Forest Accuracy: ',rf_accuracy)


Tree, 1 Accuracy: 0.9777777777777777
Tree, 2 Accuracy: 0.9777777777777777
Tree, 3 Accuracy: 0.9333333333333333
Tree, 4 Accuracy: 0.9777777777777777
Tree, 5 Accuracy: 1.0
Tree, 6 Accuracy: 1.0
Tree, 7 Accuracy: 1.0
Tree, 8 Accuracy: 0.9777777777777777
Tree, 9 Accuracy: 0.9111111111111111
Tree, 10 Accuracy: 0.9333333333333333
Custom Random Forest Accuracy: 1.0
Sklearn Random Forest Accuracy:  1.0


In [7]:
print(X_train.shape)

(105, 4)


## Question 2

In [24]:
import seaborn as sns
import pandas as pd

titanic = sns.load_dataset("titanic")
titanic.head()


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [25]:
# check missing values and data types
titanic.info()
# print("\n\nMissing Values per Column:\n", titanic.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [26]:
# Handle missing values (for simplicity, we'll drop rows with missing values)
titanic = titanic.drop((["deck"]),axis=1)
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alive        891 non-null    object  
 13  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 79.4+ KB


In [None]:
# encoding
titanic["sex"] = titanic["sex"].map({"male": 0, "female": 1})
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,0,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,1,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,1,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,1,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,0,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


In [43]:
X = titanic[["pclass", "sex", "age", "fare"]]
y= titanic["survived"]
X = X.dropna()
y = y.loc[X.index]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape)
print(X_test.shape)
X

(499, 4)
(215, 4)


Unnamed: 0,pclass,sex,age,fare
0,3,0,22.0,7.2500
1,1,1,38.0,71.2833
2,3,1,26.0,7.9250
3,1,1,35.0,53.1000
4,3,0,35.0,8.0500
...,...,...,...,...
885,3,1,39.0,29.1250
886,2,0,27.0,13.0000
887,1,1,19.0,30.0000
889,1,0,26.0,30.0000


In [52]:
# 5. Train a Decision Tree
# Train a DecisionTreeClassifier using the training set (default parameters).
# Print only the accuracy on the test set.
# Note the accuracy result.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_predictions = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print(f'Decision Tree Accuracy: ',dt_accuracy)

Decision Tree Accuracy:  0.7209302325581395


In [53]:
# 6. Train a Random Forest
# Train a RandomForestClassifier using the training set (default parameters).
# Print only the accuracy on the test set.
rf = RandomForestClassifier(n_estimators=100,random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f'Random Forest Accuracy: ',rf_accuracy)


Random Forest Accuracy:  0.786046511627907


In [None]:
seeds = [0, 10, 20, 30, 40]
dt_accuracies = []
rf_accuracies = []
for s in seeds:
    dt = DecisionTreeClassifier(random_state=s)
    dt.fit(X_train, y_train)
    dt_predictions = dt.predict(X_test)
    dt_accuracy = accuracy_score(y_test, dt_predictions)
    dt_accuracies.append(dt_accuracy)

    rf = RandomForestClassifier(n_estimators=100, random_state=s)
    rf.fit(X_train, y_train)
    rf_predictions = rf.predict(X_test)
    rf_accuracy = accuracy_score(y_test, rf_predictions)
    rf_accuracies.append(rf_accuracy)
print("Decision Tree Accuracies:", dt_accuracies)
print("Random Forest Accuracies:", rf_accuracies)


Decision Tree Accuracies: [0.7441860465116279, 0.7302325581395349, 0.7302325581395349, 0.7209302325581395, 0.7348837209302326]
Random Forest Accuracies: [0.786046511627907, 0.7767441860465116, 0.7813953488372093, 0.7906976744186046, 0.7906976744186046]


## Question 3

In [1]:
import pandas as pd
import numpy as np

# Given dataset
data = {
    'Experience': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11, 12, 13, 14, 15, 16, 17, 18, 19, 20,21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
    'Test_Score': [50, 60, 65, 70, 75, 80, 82, np.nan, 87, 90,92, 94, 95, 96, 98, 99, 100, 102, 103, 105,107, 108, 110, 111, 112, np.nan, 114, 115, 116, 118],
    'Salary': [40, 0, 45, 47, np.nan, 106, 61, 6, 65, 72,37, 75, 99, np.nan, 82, 90, 85, 88, 89, 11,94, 98, 99, 10, 102, 133, 117, 105, 16, 108]
}

df = pd.DataFrame(data)

print("Original dataset with noise & missing values:")
print(df.head(10))


Original dataset with noise & missing values:
   Experience  Test_Score  Salary
0           1        50.0    40.0
1           2        60.0     0.0
2           3        65.0    45.0
3           4        70.0    47.0
4           5        75.0     NaN
5           6        80.0   106.0
6           7        82.0    61.0
7           8         NaN     6.0
8           9        87.0    65.0
9          10        90.0    72.0


In [3]:
df.columns.tolist()

['Experience', 'Test_Score', 'Salary']

In [62]:
print("Column Names:", df.columns.tolist())
print("\nMissing Values per Column:")
print(df.isnull().sum())

Column Names: ['Experience', 'Test_Score', 'Salary']

Missing Values per Column:
Experience    0
Test_Score    2
Salary        2
dtype: int64


In [76]:
df = df.dropna(subset=["Test_Score", "Salary"], how="all")

# Fill remaining missing numeric values with column mean
df["Test_Score"].fillna(df["Test_Score"].mean(), inplace=True)
df["Salary"].fillna(df["Salary"].mean(), inplace=True)

print("After partial cleaning:")
print(df.head(10))


After partial cleaning:
   Experience  Test_Score      Salary
0           1   50.000000   40.000000
1           2   60.000000    0.000000
2           3   65.000000   45.000000
3           4   70.000000   47.000000
4           5   75.000000   70.714286
5           6   80.000000  106.000000
6           7   82.000000   61.000000
7           8   94.785714    6.000000
8           9   87.000000   65.000000
9          10   90.000000   72.000000


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Test_Score"].fillna(df["Test_Score"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(df["Salary"].mean(), inplace=True)


In [77]:
from sklearn.model_selection import train_test_split

# Select features and target
X = df[["Experience", "Test_Score"]]
y = df["Salary"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)

y_pred_dt = dt_regressor.predict(X_test)

mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

print("Decision Tree Regressor Results:")
print("  MSE:",mse_dt)
print("  R²:",  r2_dt)


Decision Tree Regressor Results:
  MSE: 2730.4444444444443
  R²: -1.379613092035893


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Regressor Results:")
print("  MSE:",mse_rf)
print("  R²:",  r2_rf)


Random Forest Regressor Results:
  MSE: 2267.1841310657596
  R²: -0.9758765102572193
