In [None]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold


In [None]:
# Read the diabetes table from the CSV file in the working directory.
diabetes_data = pd.read_csv('diabetes.csv')

# Pull out the target column first; every remaining column is a feature.
y = diabetes_data['Outcome']
X = diabetes_data.drop('Outcome', axis=1)




In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: a Random Forest classifier fitted on every feature (no variance filtering).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = model.predict(X_test)
initial_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Initial accuracy without low variance filter: {initial_accuracy:.4f}")


Initial accuracy without low variance filter: 0.7208


In [None]:
# Low variance filter: learn the per-feature variances from the training split
# only, then apply the same column selection to both splits.
# The threshold (0.3) is a tunable choice.
low_variance_filter = VarianceThreshold(threshold=0.3).fit(X_train)
X_train_filtered = low_variance_filter.transform(X_train)
X_test_filtered = low_variance_filter.transform(X_test)


In [None]:
# Same classifier configuration as the baseline, fitted on the filtered features.
model_filtered = RandomForestClassifier(random_state=42).fit(X_train_filtered, y_train)

# Score the filtered model on the (equally filtered) held-out rows.
y_pred_filtered = model_filtered.predict(X_test_filtered)
filtered_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_filtered)

print(f"Accuracy after applying low variance filter: {filtered_accuracy:.4f}")


Accuracy after applying low variance filter: 0.7208


In [None]:
# Compare accuracies: the printed value is filtered accuracy minus baseline
# accuracy, so a positive number means the low variance filter helped and a
# negative number means it hurt.
print(f"Accuracy Improvement: {filtered_accuracy - initial_accuracy:.4f}")


Accuracy Improvement: 0.0000


Melbourne housing prices: low variance filter with a Random Forest regressor


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [None]:
# Read the Melbourne housing table from the CSV file in the working directory.
melbourne_data = pd.read_csv('melbourne.csv')

# Preview the first five rows as plain text.
print(melbourne_data.head(5))


       Suburb  Rooms Type      Price Method SellerG     Date  Distance  \
0  Abbotsford      2    h        NaN     SS  Jellis   3/9/16       2.5   
1  Abbotsford      2    h  1480000.0      S  Biggin  3/12/16       2.5   
2  Abbotsford      2    h  1035000.0      S  Biggin   4/2/16       2.5   
3  Abbotsford      3    u        NaN     VB  Rounds   4/2/16       2.5   
4  Abbotsford      3    h  1465000.0     SP  Biggin   4/3/17       2.5   

   Postcode  Bedroom2  Bathroom  Car  Landsize  BuildingArea  YearBuilt  \
0    3067.0       2.0       1.0  1.0     126.0           NaN        NaN   
1    3067.0       2.0       1.0  1.0     202.0           NaN        NaN   
2    3067.0       2.0       1.0  0.0     156.0          79.0     1900.0   
3    3067.0       3.0       2.0  1.0       0.0           NaN        NaN   
4    3067.0       3.0       2.0  0.0     134.0         150.0     1900.0   

          CouncilArea  Lattitude  Longtitude             Regionname  \
0  Yarra City Council   -37.8014 

In [None]:
# Encode categorical features using label encoding.
# LabelEncoder is imported in the Melbourne import cell with the other
# scikit-learn names, so this cell has no import of its own.
#
# One encoder instance is re-fit for every column, so after the loop it only
# holds the classes of the last column it encoded.
# NOTE(review): how missing values in object columns are treated depends on the
# installed scikit-learn version (encoded as an extra label vs. an error) --
# TODO confirm this is the intended handling.
label_encoder = LabelEncoder()
for column in melbourne_data.columns:
    if melbourne_data[column].dtype == object:  # only string-valued (object) columns are encoded
        melbourne_data[column] = label_encoder.fit_transform(melbourne_data[column])

In [None]:
# Rows without a sale price cannot be used for supervised training or scoring.
# The preview above shows Price is NaN for some rows; mean-imputing the target
# would fabricate labels in both the training and the test split, so those rows
# are dropped instead.
melbourne_data = melbourne_data.dropna(subset=['Price']).reset_index(drop=True)

# Impute missing values with the column mean for numerical *feature* columns.
# The target column is excluded so it is never altered by the imputer.
imputer = SimpleImputer(strategy='mean')
numerical_columns = melbourne_data.select_dtypes(include=['float64', 'int64']).columns
columns_to_impute = numerical_columns.drop('Price', errors='ignore')
melbourne_data[columns_to_impute] = imputer.fit_transform(melbourne_data[columns_to_impute])
# NOTE(review): the imputer is fit on all remaining rows, i.e. before the
# train/test split done later in the notebook, so test rows contribute to the
# column means. Fitting on the training split only would avoid that.

# Verify missing values have been handled
print(melbourne_data.isnull().sum())


Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64


In [None]:
# Separate the regression target from the predictors.
y = melbourne_data['Price']  # the value the regressor is trained to predict
X = melbourne_data.drop('Price', axis=1)  # every other column is used as a feature


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Low variance filter: learn the per-feature variances from the training split
# only, then apply the same column selection to both splits.
# The threshold (0.1) is a tunable choice.
low_variance_filter = VarianceThreshold(threshold=0.1).fit(X_train)
X_train_filtered = low_variance_filter.transform(X_train)
X_test_filtered = low_variance_filter.transform(X_test)


In [None]:
# Baseline: a Random Forest regressor fitted on the unfiltered features.
# The comparison cell at the end of the notebook reads initial_mse, so this
# cell has to run before it.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Evaluate the baseline on the held-out split.
y_pred = model.predict(X_test)
initial_r2 = r2_score(y_test, y_pred)
initial_mse = mean_squared_error(y_test, y_pred)

print(f"Initial Mean Squared Error without low variance filter: {initial_mse:.4f}")
print(f"R² Score before applying low variance filter: {initial_r2:.4f}")


Initial Mean Squared Error without low variance filter: 142803471996.8225
R² Score before applying low variance filter: 0.5178


In [None]:
# Same regressor configuration as the baseline, fitted on the filtered features.
model_filtered = RandomForestRegressor(random_state=42).fit(X_train_filtered, y_train)

# Evaluate the filtered model on the (equally filtered) held-out split.
y_pred_filtered = model_filtered.predict(X_test_filtered)
filtered_r2 = r2_score(y_test, y_pred_filtered)
filtered_mse = mean_squared_error(y_test, y_pred_filtered)

print(f"Mean Squared Error after applying low variance filter (Regression): {filtered_mse:.4f}")
print(f"R² Score after applying low variance filter: {filtered_r2:.4f}")


Mean Squared Error after applying low variance filter (Regression): 146113090625.3388
R² Score after applying low variance filter: 0.5066


In [None]:
# Compare performances: the printed value is baseline MSE minus filtered MSE,
# so a positive number means the low variance filter reduced the error and a
# negative number means the filtered model is worse.
print(f"MSE Improvement: {initial_mse - filtered_mse:.4f}")


MSE Improvement: -3309618628.5163
