In [43]:
import os

import pandas as pd
from sklearn.impute import SimpleImputer

# Location of the Nashville housing CSV. The NASHVILLE_HOUSING_CSV environment
# variable overrides it, so the notebook can run on machines where the file is
# not stored at the absolute path used as the default below.
DATA_PATH = os.environ.get(
    "NASHVILLE_HOUSING_CSV",
    "/Users/bhaveshpatidar/Desktop/week 4 - Nashville_housing_data.csv",
)

# Load the dataset
Database = pd.read_csv(DATA_PATH)

# Show every column and every row whenever a pandas object is printed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Count the missing (NaN) entries in each column
missing_values = Database.isnull().sum()
print("Missing Values:")
print(missing_values)


Missing Values:
Unnamed: 0                               0
Parcel ID                                0
Land Use                                 0
Property Address                         2
Suite/ Condo   #                     22651
Property City                            2
Sale Date                                0
Legal Reference                          0
Sold As Vacant                           0
Multiple Parcels Involved in Sale        0
City                                     0
State                                    0
Acreage                                  0
Tax District                             0
Neighborhood                             0
Land Value                               0
Building Value                           0
Finished Area                            1
Foundation Type                          1
Year Built                               0
Exterior Wall                            0
Grade                                    0
Bedrooms                              

In [44]:
import pandas as pd

# Columns to remove from the working frame
columns_to_drop = ['Unnamed: 0', 'Suite/ Condo   #', 'Property City']

# errors="ignore" skips labels that are no longer present, so re-running this
# cell after the columns have already been dropped does not raise a KeyError.
Database = Database.drop(columns=columns_to_drop, errors="ignore")

# Display remaining columns
remaining_columns = Database.columns
print("Remaining columns after dropping specified columns:")
print(remaining_columns)


Remaining columns after dropping specified columns:
Index(['Parcel ID', 'Land Use', 'Property Address', 'Sale Date',
       'Legal Reference', 'Sold As Vacant',
       'Multiple Parcels Involved in Sale', 'City', 'State', 'Acreage',
       'Tax District', 'Neighborhood', 'Land Value', 'Building Value',
       'Finished Area', 'Foundation Type', 'Year Built', 'Exterior Wall',
       'Grade', 'Bedrooms', 'Full Bath', 'Half Bath',
       'Sale Price Compared To Value'],
      dtype='object')


In [45]:
# Discard, in place, every row that contains at least one missing value
# (axis=0 and how="any" spell out the defaults of dropna).
Database.dropna(axis=0, how="any", inplace=True)

# Per-column null counts, to confirm that nothing is left
print("Missing Values after dropping rows:")
leftover_nulls = Database.isna().sum()
print(leftover_nulls)


Missing Values after dropping rows:
Parcel ID                            0
Land Use                             0
Property Address                     0
Sale Date                            0
Legal Reference                      0
Sold As Vacant                       0
Multiple Parcels Involved in Sale    0
City                                 0
State                                0
Acreage                              0
Tax District                         0
Neighborhood                         0
Land Value                           0
Building Value                       0
Finished Area                        0
Foundation Type                      0
Year Built                           0
Exterior Wall                        0
Grade                                0
Bedrooms                             0
Full Bath                            0
Half Bath                            0
Sale Price Compared To Value         0
dtype: int64


In [46]:
# Keep the first occurrence of each fully identical row and drop the repeats,
# modifying the frame in place (keep="first" is the default).
Database.drop_duplicates(keep="first", inplace=True)

# Report the (rows, columns) shape that remains after de-duplication
deduplicated_shape = Database.shape
print("Shape of the dataset after removing duplicates:", deduplicated_shape)

Shape of the dataset after removing duplicates: (22495, 23)


In [48]:
import pandas as pd
import numpy as np

# Assuming DataFrame is already defined as "Database"

# Age of the property in years, measured from the calendar year in which this
# cell is executed (so the values shift when the notebook is re-run later).
current_year = pd.Timestamp.now().year
Database['Age'] = current_year - Database['Year Built']

# Total number of bathrooms, counting each half bath as 0.5
Database['Total Bathrooms'] = Database['Full Bath'] + 0.5 * Database['Half Bath']

# Binary flag: 1 when 'Foundation Type' contains the text 'BSMT', otherwise 0.
# na=False makes a missing foundation type count as "no basement"; without it
# str.contains returns NaN for such rows and np.where treats NaN as truthy,
# which would flag them as 1. regex=False matches 'BSMT' as a literal substring.
has_bsmt = Database['Foundation Type'].str.contains('BSMT', regex=False, na=False)
Database['Has Basement'] = np.where(has_bsmt, 1, 0)

# Print first few rows to verify the new features
print(Database.head())


         Parcel ID       Land Use   Property Address Sale Date  \
0  105 11 0 080.00  SINGLE FAMILY   1802  STEWART PL   1/11/13   
1  118 03 0 130.00  SINGLE FAMILY  2761  ROSEDALE PL   1/18/13   
2  119 01 0 479.00  SINGLE FAMILY  224  PEACHTREE ST   1/18/13   
3  119 05 0 186.00  SINGLE FAMILY      316  LUTIE ST   1/23/13   
4  119 05 0 387.00  SINGLE FAMILY   2626  FOSTER AVE    1/4/13   

    Legal Reference Sold As Vacant Multiple Parcels Involved in Sale  \
0  20130118-0006337             No                                No   
1  20130124-0008033             No                                No   
2  20130128-0008863             No                                No   
3  20130131-0009929             No                                No   
4  20130118-0006110             No                                No   

        City State  Acreage             Tax District  Neighborhood  \
0  NASHVILLE    TN     0.17  URBAN SERVICES DISTRICT          3127   
1  NASHVILLE    TN     0.11   

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Predictors: the three engineered columns
features = ['Age', 'Total Bathrooms', 'Has Basement']  # Add other relevant features
X = Database[features]

# Binary target: 1 where the label equals 'Over', 0 for every other value
is_over = Database['Sale Price Compared To Value'] == 'Over'
y = is_over.astype(int)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression with scikit-learn's default settings
log_reg_model = LogisticRegression().fit(X_train, y_train)

# Predicted class for each held-out row
y_pred = log_reg_model.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1130
           1       0.75      1.00      0.86      3369

    accuracy                           0.75      4499
   macro avg       0.87      0.50      0.43      4499
weighted avg       0.81      0.75      0.64      4499



In [33]:
from sklearn.tree import DecisionTreeClassifier

# Define features and target variable
features = ['Age', 'Total Bathrooms', 'Has Basement']  # Add other relevant features
X = Database[features]
y = (Database['Sale Price Compared To Value'] == 'Over').astype(int)  # 1 = 'Over', 0 = any other label

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seed the tree so that refitting reproduces the same model. An unseeded
# DecisionTreeClassifier permutes features randomly at each split, so its
# report can differ between runs. The seed matches the one used for the
# decision tree in the model-comparison cell of this notebook.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Predictions on the testing set
y_pred_tree = decision_tree_model.predict(X_test)

# Model evaluation
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_tree))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.08      0.13      1130
           1       0.76      0.95      0.84      3369

    accuracy                           0.73      4499
   macro avg       0.56      0.52      0.49      4499
weighted avg       0.66      0.73      0.66      4499



In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Two numeric predictors; the target is the raw label column (not binarised)
predictor_columns = ['Finished Area', 'Year Built']
X = Database[predictor_columns]
y = Database['Sale Price Compared To Value']

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest of 100 trees, seeded for repeatable fits
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicted label for each held-out row
y_pred = rf_classifier.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred))

Random Forest Classification Report:
              precision    recall  f1-score   support

        Over       0.76      0.82      0.79      3369
       Under       0.29      0.22      0.25      1130

    accuracy                           0.67      4499
   macro avg       0.52      0.52      0.52      4499
weighted avg       0.64      0.67      0.65      4499



In [35]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Inputs: finished area and construction year; target: the raw label column
X = Database[['Finished Area', 'Year Built']]
y = Database['Sale Price Compared To Value']

# Reserve 20% of the rows for evaluation, using a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Gradient boosting with 100 boosting stages, seeded for repeatable fits
gb_classifier = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_classifier.fit(X_train, y_train)

# Predicted label for each held-out row
y_pred = gb_classifier.predict(X_test)

# Per-class precision / recall / F1 on the held-out rows
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred))

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

        Over       0.75      0.99      0.86      3369
       Under       0.53      0.03      0.06      1130

    accuracy                           0.75      4499
   macro avg       0.64      0.51      0.46      4499
weighted avg       0.70      0.75      0.65      4499



In [36]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Shared inputs for every model: two numeric predictors and the raw label column
X = Database[['Finished Area', 'Year Built']]
y = Database['Sale Price Compared To Value']

# One 80/20 split (fixed seed) reused by all models so their reports are comparable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate classifiers, keyed by display name; insertion order is the order
# in which they are fitted and reported below.
models = {}
models["Logistic Regression"] = LogisticRegression()
models["Gradient Boosting"] = GradientBoostingClassifier(n_estimators=100, random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit each model on the training rows and print its report for the held-out rows
for name, model in models.items():
    print(f"--- {name} ---")
    # fit() returns the estimator itself, so predict() can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, y_pred))

--- Logistic Regression ---
              precision    recall  f1-score   support

        Over       0.75      1.00      0.86      3369
       Under       0.67      0.00      0.01      1130

    accuracy                           0.75      4499
   macro avg       0.71      0.50      0.43      4499
weighted avg       0.73      0.75      0.64      4499

--- Gradient Boosting ---
              precision    recall  f1-score   support

        Over       0.75      0.99      0.86      3369
       Under       0.53      0.03      0.06      1130

    accuracy                           0.75      4499
   macro avg       0.64      0.51      0.46      4499
weighted avg       0.70      0.75      0.65      4499

--- Decision Tree ---
              precision    recall  f1-score   support

        Over       0.76      0.80      0.78      3369
       Under       0.28      0.24      0.26      1130

    accuracy                           0.66      4499
   macro avg       0.52      0.52      0.52      449