<a href="https://colab.research.google.com/github/mirasmitty/Project_Zillow/blob/molleigh/Project_Zillow_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Split the Data into Training and Testing Sets**<br></br>

### **Step 1:** Read the Zillow_data_Detroit.csv data from the project's GitHub Resources folder into a Pandas DataFrame.

In [14]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report


In [16]:
# Load the Zillow Detroit CSV from the project's GitHub repository into a DataFrame
df = pd.read_csv("https://raw.githubusercontent.com/mirasmitty/Project_Zillow/main/Resources/Zillow_data_Detroit.csv")

# Parse 'Week of pending' as datetimes and store the result as int64 values,
# so the column becomes a plain numeric feature
df['Week of pending'] = pd.to_datetime(df['Week of pending']).values.astype("int64")

# Preview the first rows of the DataFrame
df.head()

Unnamed: 0,Week of pending,Mean days listing to pending,Mean price reduction percentage
0,1517011200000000000,49,0.045619
1,1517616000000000000,48,0.046532
2,1518220800000000000,47,0.048536
3,1518825600000000000,46,0.049169
4,1519430400000000000,43,0.048168



### **Step 2:** Create the labels set (y) from the “Mean price reduction percentage” column, and then create the features (X) DataFrame from the remaining columns.

In [17]:
# Split the DataFrame into the target and the predictors

# Labels (y): the 'Mean price reduction percentage' column
y = df.loc[:, "Mean price reduction percentage"]

# Features (x): every column except the target
x = df.drop('Mean price reduction percentage', axis=1)


In [18]:
# Preview the first five entries of the labels Series
y.head(5)


0    0.045619
1    0.046532
2    0.048536
3    0.049169
4    0.048168
Name: Mean price reduction percentage, dtype: float64

In [19]:
# Preview the first five rows of the features DataFrame
x.head(5)

Unnamed: 0,Week of pending,Mean days listing to pending
0,1517011200000000000,49
1,1517616000000000000,48
2,1518220800000000000,47
3,1518825600000000000,46
4,1519430400000000000,43



### **Step 3:** Check the balance of the labels variable (y) by using the value_counts function.

In [20]:
# Check the balance of our target values
# NOTE(review): every value shown in the output below occurs exactly once, i.e. the
# target behaves as a continuous quantity rather than a small set of classes.
y.value_counts()

0.045619    1
0.051982    1
0.053724    1
0.053300    1
0.051753    1
0.052866    1
0.055280    1
0.055631    1
0.053722    1
0.050454    1
0.056315    1
0.049650    1
0.050125    1
0.050378    1
0.047866    1
0.047123    1
0.045601    1
0.045761    1
0.054597    1
0.055683    1
0.043061    1
0.049253    1
0.046162    1
0.046104    1
0.046359    1
0.047222    1
0.047829    1
0.048121    1
0.048495    1
0.050773    1
0.056189    1
0.052223    1
0.053757    1
0.053530    1
0.053945    1
0.054528    1
0.055790    1
0.056798    1
0.046843    1
0.045798    1
0.046940    1
0.046210    1
0.047247    1
0.047235    1
0.047233    1
0.046522    1
0.045901    1
0.046424    1
0.045528    1
0.045870    1
0.045268    1
0.043907    1
0.044843    1
0.044800    1
0.046553    1
0.047386    1
0.047171    1
0.045198    1
0.046643    1
0.045805    1
0.045522    1
0.044850    1
0.044360    1
0.044613    1
0.043009    1
0.043563    1
0.043809    1
0.044134    1
0.045389    1
0.045845    1
0.045695    1
0.0457


### **Step 4:** Split the data into training and testing datasets by using train_test_split.


---



---



---

## **Cannot run code from here down: y has NaN Values**


---



---



---



In [None]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Keep only the rows whose target is present; a NaN label cannot be used for fitting
rows_with_target = y.notna()

# Split the data using train_test_split
# Assign a random_state of 1 to the function
# `stratify` is deliberately not passed: the target is a float column whose values are
# all distinct in the output above, and stratified splitting needs at least two
# samples per class, so `stratify=y` raises a ValueError on this data.
x_train, x_test, y_train, y_test = train_test_split(
    x[rows_with_target], y[rows_with_target], random_state=1
)

# Create a Logistic Regression Model with the Original Data




### **Step 1:** Fit a logistic regression model by using the training data (X_train and y_train).

In [None]:
# Import the LogisticRegression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression

# Build the classifier with random_state=1 so repeated runs give the same result
logistic_regression_model = LogisticRegression(random_state=1)

# Train on the training split. fit() returns the estimator itself, so lr_model and
# logistic_regression_model refer to the same fitted object.
# NOTE(review): y_train comes from the float64 'Mean price reduction percentage'
# column; LogisticRegression expects discrete class labels — confirm the intended target.
logistic_regression_model.fit(x_train, y_train)
lr_model = logistic_regression_model



### **Step 2:** Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.

In [None]:

# Make a prediction using the testing data
# (the predicted labels for x_test are kept for the evaluation cells that follow)
testing_predictions = lr_model.predict(x_test)

### **Step 3:** Evaluate the model’s performance by doing the following:
*   Calculate the accuracy score of the model.
*   Generate a confusion matrix.
*   Print the classification report.



In [None]:
# Print the balanced_accuracy score of the model
# The score is computed once and reused; `testing_accuracy` was previously referenced
# without ever being assigned, which raised a NameError.
testing_accuracy = balanced_accuracy_score(y_test, testing_predictions)

print(f"Training Data Score: {lr_model.score(x_train,y_train)}")
print(f"Testing Data Score: {lr_model.score(x_test,y_test)}")
print(f"Balanced Accuracy Score: {testing_accuracy}")
print(f"The balanced accuracy with the original data is {testing_accuracy* 100:.2f}%")

In [None]:
# Generate a confusion matrix for the model
# Uses `testing_predictions`, the name assigned when the predictions were made
# (the singular `testing_prediction` does not exist and raised a NameError).
testing_confusion_matrix = confusion_matrix(y_test, testing_predictions)

print(f'The confusion matrix with the original data is:\n{testing_confusion_matrix}')

In [None]:
# Print the classification report for the model
# Uses `testing_predictions`, the name assigned when the predictions were made
# (the singular `testing_prediction` does not exist and raised a NameError).
testing_report = classification_report(y_test, testing_predictions)

# View the results
print(f'The classification report with the original data shows:\n{testing_report}')

**Step 4:** Answer the following question.

**Question:** How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

**Answer:** HEALTHY LOANS: For healthy loans, the precision is 1.00, the recall is 1.00 and the f1-score is 1.00, meaning the model is performing exceptionally well in identifying healthy loans without missing any. This scenario is ideal, indicating a high level of confidence in the model's ability to identify healthy loans.

**HIGH-RISK LOANS:** The precision is 0.87 for high-risk loans, so the remaining 13% are false-positives. The recall is 0.89, so the remaining 11% are false negatives. The f1-score is 0.88, which suggests a good balance between precision and recall for high-risk loans. Therefore, the model performs moderately well for high-risk loans, but there is still room for improvement; it is highly likely that the financial field would require 95% or higher in order to retain confidence in the model.

# Predict a Logistic Regression Model with Resampled Training Data


### **Step 1:** Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.

In [None]:

# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Create the oversampler with random_state=1 so the resampling is repeatable
ros_model = RandomOverSampler(random_state=1)

# Resample the original training split; returns the resampled features and labels
x_res, y_res = ros_model.fit_resample(X=x_train, y=y_train)

In [None]:
# Count the distinct values of the resampled labels data

print(y_res.value_counts())


# Cross-check with NumPy that every label has the same count after resampling.
# np.unique with return_counts=True returns the sorted distinct labels and their counts.
unique_values, counts = np.unique(y_res, return_counts=True)
print("Resampled Labels: ", unique_values)
print("Label Counts: ", counts)


### **Step 2:** Use the LogisticRegression classifier and the resampled data to fit the model and make predictions.

In [None]:
# Create a logistic regression classifier (random_state=1) and train it on the
# resampled training data; fit() returns the fitted estimator itself
lr_ros_model = LogisticRegression(random_state=1).fit(x_res, y_res)

# Predict labels for the untouched testing features
testing_ros_predictions = lr_ros_model.predict(x_test)


### **Step 3:** Evaluate the model’s performance by doing the following:

*   Calculate the accuracy score of the model.
*   Generate a confusion matrix.
*   Print the classification report.





In [None]:
# Print the balanced_accuracy score of the model trained on the oversampled data
print(f"Training Data Score: {lr_ros_model.score(x_res,y_res)}")
print(f"Testing Data Score: {lr_ros_model.score(x_test,y_test)}")

ros_balanced_accuracy_score = balanced_accuracy_score(y_test, testing_ros_predictions)
print(f"Balanced Accuracy Score: {ros_balanced_accuracy_score}")
# The message names the oversampled data: this score belongs to lr_ros_model, not to
# the model fit on the original data.
print(f"The balanced accuracy with the oversampled data is {ros_balanced_accuracy_score* 100:.2f}%")

In [None]:
# Confusion matrix of the true test labels against the oversampled model's predictions
ros_confusion_matrix = confusion_matrix(y_true=y_test, y_pred=testing_ros_predictions)
print(f'The confusion matrix with the oversampled data is:\n{ros_confusion_matrix}')

In [None]:
# Build the per-class precision / recall / f1 report for the oversampled model
ros_classification_report = classification_report(
    y_true=y_test,
    y_pred=testing_ros_predictions,
)

# Show the report
print(f'The classification report with the oversampled data shows:\n{ros_classification_report}')

### **Step 4:** Answer the following question
**Question:** How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

**Answer:** The logistic regression model, fit with the oversampled data, predicts the healthy and high-risk loan labels better than our first model. The healthy-loan precision, recall, and f1-score remain the same as before; likewise, the high-risk loan label has the same precision measurement. However, the recall has gone up 0.01 to a perfect "1.00", while the oversampled f1-score has increased the most, by 0.05, bringing its score to "0.93". These changes mean that the logistic regression model, fit with the oversampled data, does a better job of catching high-risk loans that would otherwise be incorrectly labelled as healthy.





---





---


# Import necessary libraries

#### ATTEMPT ADVANCED REGRESSION TECHNIQUES WITH OUR DATA
(from here down↓)





In [None]:
# Libraries used by the plotting and regression cells below
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Remove the row limit on displayed pandas objects (None = show every row)
pd.set_option('display.max_rows', None)

In [None]:
# Heatmap of df_combined.isnull(): one column per feature, highlighting missing cells
fig, ax = plt.subplots(figsize=(18,6))
ax.set_title('Heatmap of missing values')
sns.heatmap(
    df_combined.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
    ax=ax,
)

In [None]:
# Number of distinct values per categorical column.
# NOTE(review): `object_cols` is assigned in a later cell of this notebook, so this
# cell only runs after that cell has been executed.
unique_values = [df_combined[col].unique().size for col in object_cols]

# Bar chart of those counts, with the column names rotated to stay readable
plt.figure(figsize=(18,6))
plt.title('No. Unique values of Categorical Features')
plt.xticks(rotation=90)
sns.barplot(x=object_cols, y=unique_values)

In [None]:
# Grid of bar charts, one per categorical column, showing how often each category occurs.
# - The per-column counts use their own name so the notebook-level target `y` is no
#   longer overwritten by this loop.
# - The overall title is a figure-level suptitle rather than the title of a throwaway
#   axes created before the subplots.
# - The grid keeps the original 11 x 4 layout and only grows if there are more than
#   44 categorical columns.
fig = plt.figure(figsize=(18,36))
fig.suptitle('Categorical Features: Distribution')

n_grid_cols = 4
n_grid_rows = max(11, -(-len(object_cols) // n_grid_cols))  # ceiling division

for position, col in enumerate(object_cols, start=1):
    category_counts = df_combined[col].value_counts()
    plt.subplot(n_grid_rows, n_grid_cols, position)
    plt.xticks(rotation=90)
    sns.barplot(x=list(category_counts.index), y=category_counts)


### **Fill up missing values:**
* Drop the features 'Alley', 'Fence', and 'MiscFeature'.

* Drop 'Utilities' feature, as all but one have the value 'AllPub'

* All entries with missing 'FireplaceQu' have 'Fireplaces' = 0. Hence fill missing values with 'NA'.

* All but one entries with missing 'PoolQC' value have 'PoolArea' = 0. Use mode for missing value with non-zero PoolArea. Use 'NA' for the rest of the entries.

* **Basement features:** Fill missing values with 'NA' or '0'.

* **Garage features:** Fill missing values with 'NA' or '0'.

* **Remaining Integer and Real features:** fill up missing values with mean of the corresponding feature.

* **Remaining Categorical features:** fill up missing values with mode of the corresponding feature.

In [None]:
# Clean df_combined: drop the columns that are not used and fill the missing values of
# the remaining ones. Every fill is assigned back to the frame, so the result does not
# rely on chained in-place modification of a column selection.

df_combined.drop(columns='Id',inplace=True); print('Drop Id \n')
df_combined['MSZoning'] = df_combined['MSZoning'].fillna(df_combined['MSZoning'].mode()[0])
df_combined['LotFrontage'] = df_combined['LotFrontage'].fillna(df_combined['LotFrontage'].mean())
df_combined.drop(columns='Alley',inplace=True); print('Drop Alley \n')

# Show the value distribution of 'Utilities' before the column is dropped
print(df_combined['Utilities'].value_counts())
df_combined.drop(columns='Utilities',inplace=True); print('Drop Utilities \n')

# Categorical features: fill gaps with the most frequent value (mode)
for col in ['Exterior1st', 'Exterior2nd', 'MasVnrType', 'Electrical', 'KitchenQual', 'Functional']:
    df_combined[col] = df_combined[col].fillna(df_combined[col].mode()[0])

# Numeric feature: fill gaps with the column mean
df_combined['MasVnrArea'] = df_combined['MasVnrArea'].fillna(df_combined['MasVnrArea'].mean())

df_combined['FireplaceQu'] = df_combined['FireplaceQu'].fillna('NA'); print('FirePlaceQu: Fill NA values for missing values \n')

# PoolQC: rows with a non-zero PoolArea but no recorded quality get the most common
# quality; all remaining gaps become 'NA'.
# The comparison is parenthesised because `&` binds tighter than `>`; without the
# parentheses the expression evaluates `(isnull & PoolArea) > 0` instead of the intended mask.
# The mask also replaces the hard-coded row label 2599, so every matching row is filled.
pool_without_quality = df_combined['PoolQC'].isnull() & (df_combined['PoolArea'] > 0)
df_combined.loc[pool_without_quality, 'PoolQC'] = df_combined['PoolQC'].mode()[0]; print('PoolQC: Use mode for missing value with non-zero PoolAre \n')
df_combined['PoolQC'] = df_combined['PoolQC'].fillna('NA'); print('PoolQC: Use NA for remaining missing values \n')

# Assign the filled column back: calling fillna(..., inplace=True) on a column
# selection acts on an intermediate object and does not reliably update df_combined.
df_combined['SaleType'] = df_combined['SaleType'].fillna(df_combined['SaleType'].mode()[0])
df_combined.drop(columns=['Fence','MiscFeature','SalePrice'],inplace=True); print('Drop Fence, MiscFeature and SalePrice\n')

# Basement features: categorical gaps become 'NA', numeric gaps become 0
print('Fill missing values of Basement features with NA or 0 \n')
for col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    df_combined[col] = df_combined[col].fillna('NA')
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']:
    df_combined[col] = df_combined[col].fillna(0)

# Garage features: categorical gaps become 'NA', numeric gaps become 0
print('Fill missing values of Garage features with NA or 0 \n')
for col in ['GarageType', 'GarageFinish', 'GarageCond', 'GarageQual']:
    df_combined[col] = df_combined[col].fillna('NA')
for col in ['GarageCars', 'GarageArea', 'GarageYrBlt']:
    df_combined[col] = df_combined[col].fillna(0)

In [None]:
# Preview the first rows of df_combined after the cleaning cell above
df_combined.head()

## Check that all missing values have been taken care of.


In [None]:
# Total number of missing cells across the whole frame (sum over columns, then overall)
print(df_combined.isnull().sum().sum())


# Data Preprocessing


### Import the one-hot encoder to encode categorical features


In [None]:
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Collect the names of all columns whose dtype is 'object' (the categorical features)
column_dtypes = df_combined.dtypes
object_cols = column_dtypes[column_dtypes == 'object'].index.tolist()

print("Categorical variables:")
print(object_cols)
print('No. of. categorical features: ',len(object_cols))

In [None]:
# One-hot encode the categorical columns and join the result to the remaining columns.
#
# The `sparse` argument and `get_feature_names()` are no longer accepted by current
# scikit-learn releases (they were replaced by `sparse_output` and
# `get_feature_names_out()`), so the new spelling is tried first and the old one is
# only used as a fallback on older installations.
try:
    OH_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(sparse=False)

# Dense 0/1 matrix, indexed like df_combined so the concat below aligns row by row
OH_cols = pd.DataFrame(
    OH_encoder.fit_transform(df_combined[object_cols]),
    index=df_combined.index,
)

# Name each encoded column after its source column and category
if hasattr(OH_encoder, 'get_feature_names_out'):
    OH_cols.columns = OH_encoder.get_feature_names_out(object_cols)
else:
    OH_cols.columns = OH_encoder.get_feature_names(object_cols)

# Replace the original categorical columns with their encoded counterparts
df_final = df_combined.drop(object_cols, axis=1)
df_final = pd.concat([df_final, OH_cols], axis=1)

In [None]:
# Preview the encoded frame (non-categorical columns followed by the one-hot columns)
df_final.head()


In [None]:
# Check that the shapes are consistent

print('df_final shape:', df_final.shape)
print('df_train shape:', df_Train.shape)
print('df_test shape:',  df_Test.shape)

# Fail early if the encoded frame does not contain exactly the training plus test rows
assert len(df_final) == len(df_Train) + len(df_Test), \
    'df_final must contain the rows of df_Train followed by the rows of df_Test'

# Split the encoded frame back into its training and test parts. The split point is
# taken from df_Train instead of the hard-coded row count 1460.
# NOTE(review): assumes df_combined was built with the df_Train rows first, followed
# by the df_Test rows — confirm against the cell that creates df_combined.
n_train_rows = len(df_Train)
X_Train = df_final.iloc[:n_train_rows]
X_Test  = df_final.iloc[n_train_rows:]
Y_Train = df_Train['SalePrice']

print('\nCheck that the datasets are consistent:\n')
print('X_train shape', X_Train.shape)
print('Y_train shape:', Y_Train.shape)
print('X_test shape:',  X_Test.shape)

# Model selection and prediction


In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; random_state=0 keeps the
# split identical between runs
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_Train,
    Y_Train,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline. random_state is fixed (0, matching the validation split) so
# the reported validation error is reproducible; an unseeded forest gives a different
# MAE on every run.
model_RFR = RandomForestRegressor(random_state=0)
model_RFR.fit(X_train, Y_train)

# Mean absolute error on the held-out validation rows
Y_pred = model_RFR.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting baseline. random_state is fixed (0, matching the validation split)
# so the reported validation error is reproducible between runs.
model_GBR = GradientBoostingRegressor(random_state=0)
model_GBR.fit(X_train, Y_train)

# Mean absolute error on the held-out validation rows
Y_pred = model_GBR.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Linear model trained with stochastic gradient descent.
# SGD is sensitive to feature scaling, so the features are standardised inside a
# pipeline (the scaler is fit on the training split only). random_state is fixed so
# the reported validation error is reproducible between runs.
model_SGD = make_pipeline(StandardScaler(), SGDRegressor(random_state=0))
model_SGD.fit(X_train, Y_train)

# Mean absolute error on the held-out validation rows
Y_pred = model_SGD.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
import warnings
# Silence all warnings from here on (applies to the whole kernel session)
warnings.filterwarnings('ignore')

from xgboost import XGBRegressor

# XGBoost regressor with a small learning rate and 200 trees, trained on the
# training split; fit() returns the fitted model itself
model_XGBR = XGBRegressor(
    learning_rate=0.03,
    n_estimators=200,
    objective='reg:squarederror',
).fit(X_train, Y_train)

# Mean absolute error on the held-out validation rows
Y_pred = model_XGBR.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
# Scatter of actual vs. predicted sale price for the random forest and XGBoost models.
# Axis labels are set so the figure can be read on its own; the trailing semicolon
# suppresses the Legend repr in the cell output.
plt.figure()
plt.title('Comparison of Sale Price of Predicted and Actual values')
plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')
plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.legend();

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyperparameters are tuned
model = XGBRegressor()

# Grid of hyperparameter values to try (every combination is evaluated)
hyperparameter_grid = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.03, 0.1, 0.3],
    objective=['reg:squarederror'],
)

# Cross-validated grid search scored by negative mean absolute error;
# training scores are recorded as well
grid_cv = GridSearchCV(
    estimator=model,
    param_grid=hyperparameter_grid,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
)

grid_cv.fit(X_Train,Y_Train)

In [None]:
# Best cross-validated score found by the search (negative MAE, per the `scoring` setting)
grid_cv.best_score_


In [None]:
# Estimator configured with the best hyperparameter combination found by the search
grid_cv.best_estimator_


In [None]:
# Validation MAE of the tuned model. The fitted search object in this notebook is
# `grid_cv`; `random_cv` is never defined and raised a NameError.
Y_pred = grid_cv.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
# Take the best estimator found by the grid search and score it on the validation rows.
# NOTE(review): grid_cv was fit on X_Train, and X_valid was split off from X_Train, so
# this MAE is presumably measured on rows the model was trained on — confirm whether
# grid_cv.best_score_ should be reported instead.
regressor = grid_cv.best_estimator_
Y_pred = regressor.predict(X_valid)
print(mean_absolute_error(Y_valid, Y_pred))

In [None]:
# Scatter of actual vs. predicted sale price for the random forest, XGBoost and tuned
# models. Axis labels are set so the figure can be read on its own; the trailing
# semicolon suppresses the Legend repr in the cell output.
plt.figure()
plt.title('Comparison of Sale Price of Predicted and Actual values')
plt.scatter(Y_Train,model_RFR.predict(X_Train),label='Random Forest')
plt.scatter(Y_Train,model_XGBR.predict(X_Train),label='XGB')
plt.scatter(Y_Train,regressor.predict(X_Train),label='Best model')
plt.xlabel('Actual SalePrice')
plt.ylabel('Predicted SalePrice')
plt.legend();

# Preparation of Submission Data


In [None]:
# Predict with the tuned regressor on the test-set features
Y_Pred = regressor.predict(X_Test)


In [None]:
# Display the predictions for the test set
Y_Pred


In [None]:
# Shape of the prediction array
Y_Pred.shape


In [None]:
# Submission frame: the test-set Id column paired with the predicted sale prices
sub = pd.DataFrame({
    'Id': df_Test['Id'],
    'SalePrice': Y_Pred,
})

In [None]:
# First rows of the submission frame
sub.head()


In [None]:
# Last rows of the submission frame
sub.tail()


In [None]:
# Write the submission file with only the 'Id' and 'SalePrice' columns.
# index=False keeps the DataFrame index out of the file; otherwise it is written as
# an extra unnamed first column.
sub.to_csv('Submission.csv', index=False)
