In [16]:
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this presumably also hides library deprecation/convergence warnings - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [17]:
# Import dependencies
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

In [18]:
# Read the cleaned master CSV into a DataFrame and preview it
file_path = Path('./Resources/Clean_MC_Master.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,MLSNumber,Address,SoldPrice,CurrentPrice,ListDate,SettledDate,#ofStories,City,Zip Code,Subdivision,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,1002388281,9701 Fields Rd #1806,"$127,000","$129,900",11/9/2015,1/4/2016,Main,Gaithersburg,20878,WASHINGTON TOWER CODM,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,1002388133,2211 Washington Ave #W-102,"$202,000","$207,000",11/9/2015,1/4/2016,Main,Silver Spring,20910,ROCK CREEK APTS CODM 2,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,1002384775,3117 University Blvd W #B4,"$139,900","$139,900",10/28/2015,1/4/2016,Main,Kensington,20895,MONTGOMERY CENTURY,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,1002382327,10201 Grosvenor Pl #210,"$195,000","$199,900",10/15/2015,1/4/2016,Main,Rockville,20852,GROSVENOR PARK,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,1002382267,10301 Rossmore Ct,"$840,000","$850,000",10/22/2015,1/4/2016,"Lower1,Lower2,Main,Upper1",Bethesda,20814,WILDWOOD KNOLLS,No,1963,3060.0,4.0,4.0,Yes,Detached
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61797,MDMC2005770,1 Paca Pl,"$625,000","$625,000",7/22/2021,9/16/2021,"Main,Upper1",Rockville,20852,HUNGERFORD,No,1955,2237.0,4.0,3.0,Yes,Detached
61798,MDMC753990,1108 Clagett Dr,"$499,500","$509,000",7/15/2021,9/16/2021,Main,Rockville,20851,ROCKCREST,No,1951,1457.0,3.0,3.0,Yes,Detached
61799,MDMC2003756,11307 Galt Ave,"$410,000","$445,000",7/26/2021,9/16/2021,"Lower1,Main,Upper1",Silver Spring,20902,WHEATON HILLS,No,1950,1872.0,4.0,2.0,No,Detached
61800,MDMC763464,8809 Thomas Lea Ter,"$400,000","$374,900",6/24/2021,9/16/2021,"Lower1,Main,Upper1",Montgomery Village,20886,THE REACH,No,1986,2160.0,4.0,4.0,No,Interior Row/Townhouse


In [19]:
# Remove the identifier, address, date, subdivision and city columns;
# the remaining columns are the ones carried forward into encoding.
housing_df = df.drop(
    columns=[
        'MLSNumber',
        'Address',
        'ListDate',
        'SettledDate',
        'Subdivision',
        'City',
    ]
)
housing_df.head()

Unnamed: 0,SoldPrice,CurrentPrice,#ofStories,Zip Code,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,"$127,000","$129,900",Main,20878,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,"$202,000","$207,000",Main,20910,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,"$139,900","$139,900",Main,20895,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,"$195,000","$199,900",Main,20852,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,"$840,000","$850,000","Lower1,Lower2,Main,Upper1",20814,No,1963,3060.0,4.0,4.0,Yes,Detached


In [20]:
# Convert SoldPrice and CurrentPrice from strings like "$127,000" to integers.
#
# regex=False makes both replacements literal: '$' is a regex metacharacter
# (end-of-string anchor), so relying on the version-dependent default of
# `regex` can leave the dollar sign in place and make astype(int) fail.
#
# The raw strings are read from `df` (housing_df is `df` with columns dropped,
# so the row index is the same). That keeps this cell idempotent: re-running
# it no longer fails on columns that were already converted to int.
for price_col in ('SoldPrice', 'CurrentPrice'):
    housing_df[price_col] = (
        df[price_col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype(int)
    )
housing_df.head()

Unnamed: 0,SoldPrice,CurrentPrice,#ofStories,Zip Code,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,127000,129900,Main,20878,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,202000,207000,Main,20910,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,139900,139900,Main,20895,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,195000,199900,Main,20852,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,840000,850000,"Lower1,Lower2,Main,Upper1",20814,No,1963,3060.0,4.0,4.0,Yes,Detached


In [21]:
# Count the distinct values held by every column of housing_df.
application_cat = list(housing_df.columns)
housing_df[application_cat].nunique()

SoldPrice              7368
CurrentPrice           5961
#ofStories               46
Zip Code                 50
New Construction YN       2
Age                     175
InteriorSqFt           6448
Bedrooms                 13
Baths                    20
Garage YN                 2
Structure Type            8
dtype: int64

In [7]:
# List the distinct '#ofStories' labels (each is a comma-separated combination of levels)
housing_df.loc[:, '#ofStories'].unique()

array(['Main', 'Lower1,Lower2,Main,Upper1', 'Lower1,Main,Upper1',
       'Lower1,Main', 'Main,Upper1', 'Lower1,Main,Upper1,Upper2',
       'Main,Upper2', 'Main,Upper1,Upper2', 'Upper2', 'Lower1,Upper1',
       'Lower1,Lower2,Main,Upper1,Upper2', 'Lower1,Main,Upper2', 'Upper1',
       'Lower1', 'Lower2,Main,Upper1', 'Lower1,Upper1,Upper2',
       'Upper1,Upper2', 'Lower2,Main', 'Lower1,Lower2,Main',
       'Lower1,Upper2', 'Lower2', 'Lower2,Upper1',
       'Lower1,Lower2,Main,Upper1,Upper2,Upper3', 'Lower1,Lower2,Upper1',
       'Lower2,Main,Upper2', 'Lower2,Main,Upper1,Upper2',
       'Lower1,Lower2,Upper1,Upper2', 'Lower1,Lower2',
       'Lower1,Main,Upper1,Upper2,Upper3', 'Upper1,Upper2,Upper3',
       'Lower1,Main,Upper1,Upper3', 'Main,Upper1,Upper2,Upper3',
       'Lower1,Lower2,Main,Upper2,Upper3', 'Lower1,Lower3,Main,Upper1',
       'Lower1,Main,Upper2,Upper3', 'Lower1,Lower2,Lower3,Main,Upper1',
       'Lower1,Lower2,Lower3,Upper1', 'Lower1,Upper1,Upper2,Upper3',
       'Upper1,

In [22]:
# One-hot encode the text columns; numeric columns pass through unchanged.
# Each distinct '#ofStories' combination becomes its own indicator column.
clean_housing_df = pd.get_dummies(data=housing_df)
clean_housing_df.head()

Unnamed: 0,SoldPrice,CurrentPrice,Zip Code,Age,InteriorSqFt,Bedrooms,Baths,#ofStories_Lower1,"#ofStories_Lower1,Lower2","#ofStories_Lower1,Lower2,Lower3,Main",...,Garage YN_No,Garage YN_Yes,Structure Type _Detached,Structure Type _End of Row/Townhouse,Structure Type _Garage/Parking Space,Structure Type _Interior Row/Townhouse,Structure Type _Other,Structure Type _Penthouse Unit/Flat/Apartment,Structure Type _Twin/Semi-Detached,Structure Type _Unit/Flat/Apartment
0,127000,129900,20878,1966,446.0,0.0,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,202000,207000,20910,1948,671.0,1.0,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,139900,139900,20895,1973,754.0,1.0,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,195000,199900,20852,1972,851.0,1.0,1.0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,840000,850000,20814,1963,3060.0,4.0,4.0,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [23]:
# Target: the sold price column
y = clean_housing_df['SoldPrice']

# Features: every encoded column except the target
X = clean_housing_df.drop(columns='SoldPrice')

In [24]:
# Summary statistics for every feature column.
# NOTE(review): the recorded output shows InteriorSqFt with max ~1.1e9 and mean ~20,309 against a
# median of 1,949, Age with min 0 (other values look like build years - confirm), and CurrentPrice
# with min 398. These look like outliers or placeholder values; confirm and clean before modelling.
X.describe()

Unnamed: 0,CurrentPrice,Zip Code,Age,InteriorSqFt,Bedrooms,Baths,#ofStories_Lower1,"#ofStories_Lower1,Lower2","#ofStories_Lower1,Lower2,Lower3,Main","#ofStories_Lower1,Lower2,Lower3,Main,Upper1",...,Garage YN_No,Garage YN_Yes,Structure Type _Detached,Structure Type _End of Row/Townhouse,Structure Type _Garage/Parking Space,Structure Type _Interior Row/Townhouse,Structure Type _Other,Structure Type _Penthouse Unit/Flat/Apartment,Structure Type _Twin/Semi-Detached,Structure Type _Unit/Flat/Apartment
count,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,...,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0,61802.0
mean,565197.8,20868.874244,1979.323744,20309.29,3.358047,3.100951,0.001246,1.6e-05,3.2e-05,0.000243,...,0.531585,0.468415,0.506505,0.082441,0.000275,0.180463,0.005615,0.006537,0.007233,0.210932
std,446638.2,32.004314,24.438319,4469463.0,1.240169,1.336368,0.035276,0.004023,0.005689,0.015577,...,0.499005,0.499005,0.499962,0.275037,0.016583,0.384576,0.074721,0.080588,0.084738,0.407973
min,398.0,20705.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,309900.0,20852.0,1966.0,1311.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,450000.0,20874.0,1982.0,1949.0,3.0,3.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,675000.0,20895.0,1994.0,2815.0,4.0,4.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,16950000.0,21771.0,2022.0,1111111000.0,12.0,23.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Partition features and target into train/test subsets.
# No test_size is passed, so the library default proportions apply; the seed fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [26]:
# Build the (unfitted) logistic-regression estimator with the lbfgs solver and a fixed seed.
# NOTE(review): the target y is SoldPrice, which the nunique output lists with 7368 distinct
# values, so this estimator treats the task as classification over thousands of price labels,
# and the recorded fit run ended in KeyboardInterrupt. Consider a regression model or a binned
# price target - confirm the intended problem framing.
classifier = LogisticRegression(
    random_state=1,
    solver='lbfgs',
)
classifier

LogisticRegression(random_state=1)

In [27]:
# Fit the estimator on the training partition.
# NOTE(review): the recorded run of this cell was interrupted (KeyboardInterrupt), so the
# cells below have no recorded output - presumably the fit was too slow to finish.
classifier.fit(X=X_train, y=y_train)

KeyboardInterrupt: 

In [None]:
# Score the held-out rows and show predicted vs. actual values side by side
predictions = classifier.predict(X=X_test)
pd.DataFrame(
    data={
        "Prediction": predictions,
        "Actual": y_test,
    }
)

In [None]:
# Display the confusion matrix for the test partition.
# The fitted estimator in this notebook is named `classifier`; the previous
# reference to `model` was never defined and raised NameError.
y_pred = classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Generate accuracy score: compares the test-set predictions against the actual SoldPrice labels.
# accuracy_score is sklearn.metrics.accuracy_score, imported in the dependencies cell.
accuracy_score(y_test, predictions)

In [None]:
# Emit the imbalanced-learn classification report for the test partition.
# Uses y_pred, which is assigned in the confusion-matrix cell.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=y_pred,
    )
)