In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Load the diabetes dataset from the local CSV and preview its first rows.
diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data.head())

# Separate the target column ('Outcome') from the predictor columns.
target_column = 'Outcome'
y = diabetes_data[target_column]
X = diabetes_data.drop(target_column, axis=1)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
# Pairwise correlations between the feature columns (the target is not in X).
corr_matrix = X.corr()
print("Correlation Matrix:", corr_matrix, sep="\n")

Correlation Matrix:
                          Pregnancies   Glucose  BloodPressure  SkinThickness  \
Pregnancies                  1.000000  0.129459       0.141282      -0.081672   
Glucose                      0.129459  1.000000       0.152590       0.057328   
BloodPressure                0.141282  0.152590       1.000000       0.207371   
SkinThickness               -0.081672  0.057328       0.207371       1.000000   
Insulin                     -0.073535  0.331357       0.088933       0.436783   
BMI                          0.017683  0.221071       0.281805       0.392573   
DiabetesPedigreeFunction    -0.033523  0.137337       0.041265       0.183928   
Age                          0.544341  0.263514       0.239528      -0.113970   

                           Insulin       BMI  DiabetesPedigreeFunction  \
Pregnancies              -0.073535  0.017683                 -0.033523   
Glucose                   0.331357  0.221071                  0.137337   
BloodPressure             0.

In [None]:
# Collect every strongly correlated feature pair exactly once.
# Only the upper triangle of the (symmetric) correlation matrix is scanned, so
# a pair is reported as (a, b) but never again as (b, a). With mirrored
# entries, any consumer that drops the second element of every pair would
# remove BOTH features of a correlated pair instead of just one.
CORR_THRESHOLD = 0.8
corr_feature_names = list(corr_matrix.columns)
high_corr_pairs = [
    (first, second)
    for pos, first in enumerate(corr_feature_names)
    for second in corr_feature_names[pos + 1:]
    if abs(corr_matrix.loc[first, second]) > CORR_THRESHOLD
]
print("Highly Correlated Pairs:", high_corr_pairs)


Highly Correlated Pairs: []


In [None]:
# Choose one feature to drop per highly correlated pair. A pair is skipped
# when one of its members is already scheduled for removal, so a mirrored
# listing such as (a, b) and (b, a) cannot cause both features to be dropped.
to_remove = set()
for kept_feature, dropped_feature in high_corr_pairs:
    if kept_feature not in to_remove and dropped_feature not in to_remove:
        to_remove.add(dropped_feature)
# sorted() gives drop() a deterministic, ordered list of labels.
X_reduced = X.drop(columns=sorted(to_remove))

# Split the full and the reduced feature sets in a single call so both use
# exactly the same row partition and stay aligned with y_train / y_test.
(X_train, X_test,
 X_reduced_train, X_reduced_test,
 y_train, y_test) = train_test_split(
    X, X_reduced, y, test_size=0.3, random_state=42)


In [None]:
# Baseline: random forest trained on all original features.
model = RandomForestClassifier(random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions and report them.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Original Model Performance:")
print("Accuracy:", baseline_accuracy)
print("Classification Report:\n", baseline_report)

Original Model Performance:
Accuracy: 0.7532467532467533
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231



In [None]:
# Same classifier configuration, trained on the feature set that remains after
# removing highly correlated columns.
model_reduced = RandomForestClassifier(random_state=42)
y_reduced_pred = model_reduced.fit(X_reduced_train, y_train).predict(X_reduced_test)

# Evaluate against the same held-out labels as the baseline model.
reduced_accuracy = accuracy_score(y_test, y_reduced_pred)
reduced_report = classification_report(y_test, y_reduced_pred)
print("\nModel Performance After Removing Highly Correlated Features:")
print("Accuracy:", reduced_accuracy)
print("Classification Report:\n", reduced_report)


Model Performance After Removing Highly Correlated Features:
Accuracy: 0.7532467532467533
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.80      0.81       151
           1       0.64      0.66      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.76      0.75      0.75       231



Melbourne Dataset:

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv('melbourne.csv')

# Encode categorical features using Label Encoding.
# NOTE(review): df is modified in place, and how missing strings are encoded
# depends on the installed scikit-learn version — confirm the object columns
# are handled as intended.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in df.columns:
    if df[column].dtype == object:  # Check if the column is of type object (string)
        df[column] = label_encoder.fit_transform(df[column])

# Percentage of missing values per column. The target ('Price') is excluded
# from the drop candidates: if it crossed the threshold it would be removed
# here and the feature/target separation below would fail.
miss_percent2 = (df.isnull().sum() / len(df)) * 100
drop_mask = (miss_percent2 > 22).to_numpy() & (df.columns != 'Price')
drop_percent2 = df.columns[drop_mask]
print("Columns with more than 22% missing values:", drop_percent2)
data2_dropped = df.drop(columns=drop_percent2)
print("Dataset after dropping columns with more than 22% missing values:")
print(data2_dropped.head())

# Rows without a recorded price have no label to train on or to score against.
# They are removed instead of mean-imputing the target, which would fabricate
# labels and distort both the fitted models and the reported metrics.
data2_labeled = data2_dropped.dropna(subset=['Price'])

# Impute missing feature values with the column mean.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test-set statistics influence the imputed training values.
imputer = SimpleImputer(strategy='mean')
X2 = data2_labeled.drop(columns=['Price'])
X2_imputed = imputer.fit_transform(X2)

# Target variable. It is complete after the row filter above; the y2_imputed
# name is kept so code that refers to it keeps working.
y2 = data2_labeled['Price']
y2_imputed = y2.to_numpy(dtype=float)

# Step 2: Calculate correlation matrix for X2_imputed
X2_df = pd.DataFrame(X2_imputed, columns=X2.columns)
corr_matrix = X2_df.corr()

# Step 3: Identify highly correlated features (correlation > 0.85)
threshold = 0.85
high_corr_var = np.where(np.abs(corr_matrix) > threshold)
# Keeping only positions above the diagonal (x < y) removes self-pairs and
# mirrored duplicates in one condition.
high_corr_pairs = [(corr_matrix.index[x], corr_matrix.columns[y])
                   for x, y in zip(*high_corr_var)
                   if x < y]

print("Correlation Matrix:")
print(corr_matrix)

print("Highly correlated pairs (correlation > 0.85):", high_corr_pairs)

# Drop one feature (the second member) from each pair of highly correlated features
features_to_drop = {pair[1] for pair in high_corr_pairs}
X2_changed = X2_df.drop(columns=sorted(features_to_drop))

# Step 4: Train-test split for original and reduced feature sets.
# A single call partitions both feature sets and the target identically, so
# the two models are trained and evaluated on exactly the same rows.
(X2_train, X2_test,
 X2_changed_train, X2_changed_test,
 y2_train, y2_test) = train_test_split(
    X2_imputed, X2_changed, y2_imputed, test_size=0.3, random_state=42)
y2_changed_train, y2_changed_test = y2_train, y2_test

# Step 5: Train and evaluate model on original feature set (X2)
model_original = RandomForestRegressor(random_state=42)
model_original.fit(X2_train, y2_train)
y2_pred = model_original.predict(X2_test)

# Evaluate performance (Original).
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
original_rmse = np.sqrt(mean_squared_error(y2_test, y2_pred))
original_r2 = r2_score(y2_test, y2_pred)
print("\nPerformance with original features:")
print("RMSE:", original_rmse)
print("R²:", original_r2)

# Step 6: Train and evaluate model on reduced feature set (X2_changed)
model_reduced = RandomForestRegressor(random_state=42)
model_reduced.fit(X2_changed_train, y2_changed_train)
y2_changed_pred = model_reduced.predict(X2_changed_test)

# Evaluate performance (Reduced)
reduced_rmse = np.sqrt(mean_squared_error(y2_changed_test, y2_changed_pred))
reduced_r2 = r2_score(y2_changed_test, y2_changed_pred)
print("\nPerformance with reduced features:")
print("RMSE:", reduced_rmse)
print("R²:", reduced_r2)

# Step 7: Compare the difference in RMSE and R²
rmse_difference = original_rmse - reduced_rmse
r2_difference = original_r2 - reduced_r2
print("\nRMSE difference (Original - Reduced):", rmse_difference)
print("R² difference (Original - Reduced):", r2_difference)


Columns with more than 22% missing values: Index(['Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt',
       'Lattitude', 'Longtitude'],
      dtype='object')
Dataset after dropping columns with more than 22% missing values:
   Suburb  Rooms  Type      Price  Method  SellerG  Date  Distance  Postcode  \
0       0      2     0        NaN       6      155    59       2.5    3067.0   
1       0      2     0  1480000.0       2       33    55       2.5    3067.0   
2       0      2     0  1035000.0       2       33    64       2.5    3067.0   
3       0      3     2        NaN       7      296    64       2.5    3067.0   
4       0      3     0  1465000.0       5       33    65       2.5    3067.0   

   CouncilArea  Regionname  Propertycount  
0           31           2         4019.0  
1           31           2         4019.0  
2           31           2         4019.0  
3           31           2         4019.0  
4           31           2         4019.0  
Correlatio




Performance with original features:
RMSE: 424937.253474199
R²: 0.4594153634117968

Performance with reduced features:
RMSE: 424937.253474199
R²: 0.4594153634117968

RMSE difference (Original - Reduced): 0.0
R² difference (Original - Reduced): 0.0




In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Read the Melbourne CSV into a DataFrame
melbourne_data = pd.read_csv('melbourne.csv')

# Preview the first five rows of the loaded data
print(melbourne_data.head(5))


       Suburb  Rooms Type      Price Method SellerG     Date  Distance  \
0  Abbotsford      2    h        NaN     SS  Jellis   3/9/16       2.5   
1  Abbotsford      2    h  1480000.0      S  Biggin  3/12/16       2.5   
2  Abbotsford      2    h  1035000.0      S  Biggin   4/2/16       2.5   
3  Abbotsford      3    u        NaN     VB  Rounds   4/2/16       2.5   
4  Abbotsford      3    h  1465000.0     SP  Biggin   4/3/17       2.5   

   Postcode  Bedroom2  Bathroom  Car  Landsize  BuildingArea  YearBuilt  \
0    3067.0       2.0       1.0  1.0     126.0           NaN        NaN   
1    3067.0       2.0       1.0  1.0     202.0           NaN        NaN   
2    3067.0       2.0       1.0  0.0     156.0          79.0     1900.0   
3    3067.0       3.0       2.0  1.0       0.0           NaN        NaN   
4    3067.0       3.0       2.0  0.0     134.0         150.0     1900.0   

          CouncilArea  Lattitude  Longtitude             Regionname  \
0  Yarra City Council   -37.8014 

In [None]:
# Label-encode every object-dtype (string) column of melbourne_data in place
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in melbourne_data.columns:
    holds_strings = melbourne_data[column].dtype == object
    if not holds_strings:
        continue  # non-object columns are left untouched
    # The shared encoder is refitted on each column by fit_transform
    encoded_values = label_encoder.fit_transform(melbourne_data[column])
    melbourne_data[column] = encoded_values

In [None]:
# Rows without a recorded 'Price' have no label to train on or evaluate
# against. Drop them rather than letting the mean imputation below invent a
# target value for them, which would distort the model and its metrics.
melbourne_data = melbourne_data.dropna(subset=['Price']).reset_index(drop=True)

# Impute missing values with mean for numerical feature columns only; the
# target column is deliberately excluded from imputation.
imputer = SimpleImputer(strategy='mean')
numerical_columns = (
    melbourne_data
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .drop('Price', errors='ignore')
)
melbourne_data[numerical_columns] = imputer.fit_transform(melbourne_data[numerical_columns])

# Verify missing values have been handled
print(melbourne_data.isnull().sum())


Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64


In [None]:
# Separate the target ('Price') from the predictor columns
y = melbourne_data['Price']
X = melbourne_data.drop('Price', axis=1)


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Step 2: Calculate the correlation matrix of the feature columns in X
feature_columns = X.columns
X_df = pd.DataFrame(X, columns=feature_columns)
corr_matrix = X_df.corr()

In [None]:
# Step 3: Find feature pairs whose absolute correlation exceeds the threshold
threshold = 0.85
high_corr_var = np.where(np.abs(corr_matrix) > threshold)

# Walk the matching (row, column) positions and keep only those above the
# diagonal, which skips self-pairs and mirrored duplicates.
high_corr_pairs = []
for row_pos, col_pos in zip(*high_corr_var):
    if row_pos < col_pos:
        high_corr_pairs.append((corr_matrix.index[row_pos], corr_matrix.columns[col_pos]))

print("Correlation Matrix:", corr_matrix, sep="\n")



Correlation Matrix:
                 Suburb     Rooms      Type    Method   SellerG      Date  \
Suburb         1.000000 -0.064271  0.022750  0.001405  0.034903 -0.002352   
Rooms         -0.064271  1.000000 -0.550992 -0.029778 -0.040063 -0.011284   
Type           0.022750 -0.550992  1.000000  0.043113  0.030186  0.005767   
Method         0.001405 -0.029778  0.043113  1.000000  0.002896  0.000023   
SellerG        0.034903 -0.040063  0.030186  0.002896  1.000000 -0.000102   
Date          -0.002352 -0.011284  0.005767  0.000023 -0.000102  1.000000   
Distance      -0.007840  0.271511 -0.235796 -0.041119 -0.025242  0.000767   
Postcode      -0.014178  0.085890 -0.030485 -0.009798 -0.005505  0.001195   
Bedroom2      -0.048451  0.819099 -0.404734 -0.014759 -0.032397 -0.010533   
Bathroom      -0.047342  0.529191 -0.179315  0.002761 -0.026332 -0.012018   
Car           -0.026869  0.337780 -0.210571 -0.009085  0.002093 -0.000565   
Landsize       0.000577  0.030136 -0.016931  0.006994 -0

In [None]:
print("Highly correlated pairs (correlation > 0.85):", high_corr_pairs)

# From every highly correlated pair, mark the second feature for removal and
# build the reduced feature frame without those columns.
features_to_drop = {second_feature for first_feature, second_feature in high_corr_pairs}
X_changed = X_df.drop(columns=features_to_drop)

Highly correlated pairs (correlation > 0.85): []


In [None]:
# Baseline: fit a Random Forest Regressor on all features, before the
# high-correlation filter is applied.
model = RandomForestRegressor(random_state=42)
y_pred = model.fit(X_train, y_train).predict(X_test)

# Score the held-out predictions
initial_r2 = r2_score(y_test, y_pred)
initial_mse = mean_squared_error(y_test, y_pred)

print(f"Initial Mean Squared Error without high correlation filter: {initial_mse:.4f}")
print(f"R² Score before applying high correlation filter: {initial_r2:.4f}")


Initial Mean Squared Error without high correlation filter: 142803471996.8225
R² Score before applying high correlation filter: 0.5178


In [None]:
# Re-split using the filtered feature frame (same test size and seed as the
# earlier split), then fit a fresh regressor on it.
filtered_split = train_test_split(X_changed, y, test_size=0.2, random_state=42)
X_train_filtered, X_test_filtered, y_train, y_test = filtered_split
model_filtered = RandomForestRegressor(random_state=42)
model_filtered.fit(X_train_filtered, y_train)

# Predict on the held-out rows and score the result
y_pred_filtered = model_filtered.predict(X_test_filtered)
filtered_r2 = r2_score(y_test, y_pred_filtered)
filtered_mse = mean_squared_error(y_test, y_pred_filtered)

print(f"Mean Squared Error after applying high correlation filter (Regression): {filtered_mse:.4f}")
print(f"R² Score after applying high correlation filter: {filtered_r2:.4f}")


Mean Squared Error after applying high correlation filter (Regression): 142803471996.8225
R² Score after applying high correlation filter: 0.5178
