<a href="https://colab.research.google.com/github/mohanasamanya/MachineLearning/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data
housing_data = pd.read_csv('/content/Melbourne_housing_FULL.csv')

# Calculate missing percentage and remove columns with more than 20% missing values, except 'Price'
missing_percentage = housing_data.isnull().mean() * 100
columns_to_remove = [column for column in missing_percentage.index if missing_percentage[column] > 20 and column != 'Price']
cleaned_data = housing_data.drop(columns=columns_to_remove)

# Drop rows with missing 'Price' values
cleaned_data = cleaned_data.dropna(subset=['Price'])

# Select features and target
features = cleaned_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Address'])
target = cleaned_data['Price']

# Select numeric features and fill missing values with the mean
numeric_features = features.select_dtypes(include=[float, int])
features[numeric_features.columns] = numeric_features.fillna(numeric_features.mean())

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Define the model evaluation function
def evaluate_model(train_features, test_features, train_target, test_target):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    mse = mean_squared_error(test_target, predictions)
    return mse

# Evaluate the model and print the MSE
mse_value = evaluate_model(X_train, X_test, y_train, y_test)
print(f"Model performance after filtering columns with 20% missing values: MSE = {mse_value:.2f}")

Model performance after filtering columns with 20% missing values: MSE = 143875373039.63


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the data
housing_data = pd.read_csv('/content/Melbourne_housing_FULL.csv')

# Calculate missing percentage and remove columns with more than 20% missing values, except 'Price'
missing_percentage = housing_data.isnull().mean() * 100
columns_to_remove = [column for column in missing_percentage.index if missing_percentage[column] > 20 and column != 'Price']
cleaned_data = housing_data.drop(columns=columns_to_remove)

# Drop rows with missing 'Price'
cleaned_data = cleaned_data.dropna(subset=['Price'])

# Define features and target
# Exclude 'Address' from features as it's not numerical and causes the error
X = cleaned_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Address'])
y = cleaned_data['Price']

# Fill missing values in numeric columns with mean
numeric_features = X.select_dtypes(include=[float, int])
X[numeric_features.columns] = numeric_features.fillna(numeric_features.mean())

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model and fit it to the training data
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Use the model to select important features
selector = SelectFromModel(rf_model, threshold='mean', prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Convert the selected features back to a DataFrame
X_train_rf_selected = pd.DataFrame(X_train_selected, columns=X.columns[selector.get_support()])
X_test_rf_selected = pd.DataFrame(X_test_selected, columns=X.columns[selector.get_support()])

# Define a function to evaluate model performance
def evaluate_model(train_features, test_features, train_target, test_target):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    mse = mean_squared_error(test_target, predictions)
    return mse

# Evaluate the model after feature selection
mse_rf_selection = evaluate_model(X_train_rf_selected, X_test_rf_selected, y_train, y_test)

print(f"Model performance after random forest feature selection: MSE = {mse_rf_selection:.2f}")



Model performance after random forest feature selection: MSE = 147811757361.14


In [11]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
housing_data = pd.read_csv('/content/Melbourne_housing_FULL.csv')

# Handle missing values
missing_percentage = housing_data.isnull().mean() * 100
columns_to_remove = [column for column in missing_percentage.index if missing_percentage[column] > 20 and column != 'Price']
housing_data = housing_data.drop(columns=columns_to_remove)
housing_data = housing_data.dropna(subset=['Price'])

# Select features and target
features = housing_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Address'])
target = housing_data['Price']

# Remove highly correlated features
correlation_matrix = features.corr().abs()
upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
high_correlation_features = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.85)]
features_filtered = features.drop(columns=high_correlation_features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_filtered, target, test_size=0.2, random_state=42)

# Function to evaluate the model
def evaluate_model(train_features, test_features, train_target, test_target):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    mse = mean_squared_error(test_target, predictions)
    return mse

# Evaluate the model after filtering
mse_after_correlation_filter = evaluate_model(X_train, X_test, y_train, y_test)

print(f"Model performance after removing highly correlated features: MSE = {mse_after_correlation_filter:.2f}")


Model performance after removing highly correlated features: MSE = 143811304257.53


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import VarianceThreshold

# Load the dataset
housing_data = pd.read_csv('/content/Melbourne_housing_FULL.csv')

# Handle missing values
missing_percentage = housing_data.isnull().mean() * 100
columns_to_remove = [column for column in missing_percentage.index if missing_percentage[column] > 20 and column != 'Price']
housing_data = housing_data.drop(columns=columns_to_remove)
housing_data = housing_data.dropna(subset=['Price'])

# Select features and target
features = housing_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Address'])
target = housing_data['Price']

# Remove features with low variance
selector = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
features_filtered = selector.fit_transform(features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_filtered, target, test_size=0.2, random_state=42)

# Function to evaluate the model
def evaluate_model(train_features, test_features, train_target, test_target):
    model = RandomForestRegressor(random_state=42)
    model.fit(train_features, train_target)
    predictions = model.predict(test_features)
    mse = mean_squared_error(test_target, predictions)
    return mse

# Evaluate the model after filtering
mse_after_variance_filter = evaluate_model(X_train, X_test, y_train, y_test)

print(f"Model performance after removing features with low variance: MSE = {mse_after_variance_filter:.2f}")

Model performance after removing features with low variance: MSE = 143811304257.53
