In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore #for outlier
from scipy import stats # for plot normal distribution
from xgboost import XGBRegressor
import lightgbm
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
import time
from sklearn import svm

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
## Load data

def _read_rolling_sales(year, borough):
    """Read one borough's rolling-sales workbook for the given year.

    Every workbook is read the same way: the first six rows are skipped and ','
    is treated as a thousands separator, so numbers stored as text (e.g. "2,500")
    are parsed identically for both years instead of being coerced to NaN later.

    Args:
        year: Dataset year, used in both the folder name and the file name.
        borough: File-name stem of the borough (e.g. "staten_island").

    Returns:
        The workbook contents as a pandas DataFrame.
    """
    path = f"NYC_Housing_Dataset_{year}/{year}_{borough}.xlsx"
    return pd.read_excel(path, skiprows=6, thousands=',')


bronx2020 = _read_rolling_sales(2020, "bronx")
brooklyn2020 = _read_rolling_sales(2020, "brooklyn")
manhattan2020 = _read_rolling_sales(2020, "manhattan")
queens2020 = _read_rolling_sales(2020, "queens")
statenisland2020 = _read_rolling_sales(2020, "staten_island")

# Load data for df price prediction 2021
bronx2021 = _read_rolling_sales(2021, "bronx")
brooklyn2021 = _read_rolling_sales(2021, "brooklyn")
manhattan2021 = _read_rolling_sales(2021, "manhattan")
queens2021 = _read_rolling_sales(2021, "queens")
# The variable name keeps its original spelling because the concat cell refers to it.
statensisland2021 = _read_rolling_sales(2021, "staten_island")

In [None]:
# Stack the ten borough/year frames into one table with a fresh 0..n-1 index.
borough_frames = [
    manhattan2020, manhattan2021,
    bronx2020, bronx2021,
    brooklyn2020, brooklyn2021,
    queens2020, queens2021,
    statenisland2020, statensisland2021,
]
df = pd.concat(borough_frames, ignore_index=True)
# Remove any newline characters from the column names.
df.columns = [name.replace('\n', '') for name in list(df.columns)]
df.head(10)

### Data Cleaning and Data Preprocessing 

In [None]:
## Duplicates

In [None]:
# Remove rows that are duplicated across every column (the last occurrence is kept),
# then count the duplicates still present -- the expected result is 0.
all_columns = df.columns
df = df.drop_duplicates(subset=all_columns, keep='last')
remaining_duplicates = df.duplicated(subset=all_columns)
sum(remaining_duplicates)

In [None]:
## Uniqueness

In [None]:
# Print "<column>:<number of distinct values>" for every column
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print("{}:{}".format(column_name, distinct_count))

In [None]:
## Data type transformation 

Since the EASE-MENT column contains no information, we drop it at the outset. We also drop the SALE DATE column, because we are not considering the effect of time on the sale price. 

In [None]:
#Converting columns to respective datatype
categoricalData = ['BOROUGH','TAX CLASS AT PRESENT','BUILDING CLASS CATEGORY',
                    'TAX CLASS AT TIME OF SALE',
                    'BUILDING CLASS AT PRESENT',
                    'BUILDING CLASSAT TIME OF SALE']
for each in categoricalData:
    df[each] = df[each].astype('category')

# Anything that cannot be parsed as a number becomes NaN (errors='coerce').
df['LAND SQUARE FEET'] = pd.to_numeric(df['LAND SQUARE FEET'], errors='coerce')
df['GROSS SQUARE FEET'] = pd.to_numeric(df['GROSS SQUARE FEET'], errors='coerce')

# Strip the characters 'C', 'A', '$' and ',' from string prices before casting to float.
# The pattern is a raw string so that "\$" is not treated as an (invalid) string escape.
# NOTE(review): the 0.77 factor assumes the prices are quoted in Canadian dollars.
# Confirm against the data source; if the workbooks already hold US dollars, this
# scaling (and the thresholds applied to SALE PRICE later) must be revisited.
df['SALE PRICE'] = df['SALE PRICE'].replace(r'[CA\$,]', '', regex=True).astype(float) * 0.77 # convert to US dollars 
df = df.drop(columns=["SALE DATE", "EASE-MENT"])
df.info()

In [None]:
## Missing values

In [None]:
# Keep only rows whose LAND SQUARE FEET, GROSS SQUARE FEET and YEAR BUILT are all
# strictly positive; rows where any of them is 0 or NaN fail the comparison.
has_land_area = df['LAND SQUARE FEET'] > 0
has_gross_area = df['GROSS SQUARE FEET'] > 0
has_build_year = df['YEAR BUILT'] > 0
df = df[has_land_area & has_gross_area & has_build_year]
# Remove columns in which every value is missing (this drops columns, not rows).
df = df.dropna(axis=1, how='all')
df.info()

In [None]:
##Check whether dropping nan worked 
df.isnull().sum()

In [None]:
## Outliers

In this case, we will use Z score to detect the outliers in the columns of LAND SQUARE FEET, GROSS SQUARE FEET and SALE PRICE. 
Z score is a significant measure that tells how much a number is above or below the mean of the dataset in terms of standard deviation. We set the threshold=3

In [None]:
# Flag outliers per column with a z-score; a row counts as an outlier when |z| > Z_THRESHOLD.
# NOTE(review): rows are only flagged and counted here -- nothing is removed from df.
# The original cell evaluated `df.loc[<zscore>.abs() <= 3]` without assigning the result,
# which had no effect. If outliers are meant to be dropped, assign that filter back to df.
Z_THRESHOLD = 3
zscore_columns = [
    ('LAND SQUARE FEET', 'landSquare_zscore'),
    ('GROSS SQUARE FEET', 'grossSquare_zscore'),
    ('SALE PRICE', 'salePrice_zscore'),
]
for source_column, zscore_column in zscore_columns:
    # The z-score columns are kept on df; a later cell drops them again.
    df[zscore_column] = zscore(df[source_column])
    # Count on the absolute value so the count agrees with the two-sided threshold.
    outlier_count = int((df[zscore_column].abs() > Z_THRESHOLD).sum())
    print("The count of outliers in {} is: {}".format(source_column, outlier_count))

### Exploratory Data Analysis

### Feature Engineering

In [None]:
# Age of the building measured from 2021
df['AGE'] = 2021 - df['YEAR BUILT']

# is the building pre or post war? (an age of 76 or more is labelled "Pre-War")
is_pre_war = df['AGE'] >= 76
age_labels = np.where(is_pre_war, "Pre-War", 'Post-War')
df['AGE_CATEGORY'] = age_labels

# Store the label as a categorical column
df['AGE_CATEGORY'] = df['AGE_CATEGORY'].astype('category')

df.head()

## Converting Borough column to name of each property where it is located.

In [None]:
df['BOROUGH'].unique()

In [None]:
# Translate the numeric borough code into the borough's name
borough_names = {
    1.0: "Manhattan",
    2.0: "Bronx",
    3.0: "Brooklyn",
    4.0: "Queens",
    5.0: "Staten Island",
}
df['BOROUGH'] = df['BOROUGH'].map(borough_names)
df['BOROUGH'].unique()

### Target Variable[Sale Price]

##### Visualization of Raw Data

Calculating summary statistics (count, mean, standard deviation, minimum, quartiles including the median, and maximum) for each numeric column in the dataframe

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
# Short aliases for the four NOIR measurement scales
N, O, I, R = 'Nominal', 'Ordinal', 'Interval', "Ratio"

In [None]:
# Measurement scale (Nominal / Ordinal / Interval / Ratio) of each column.
# Counts, areas, prices and ages have a true zero and meaningful ratios -> Ratio.
# Calendar years and dates have meaningful differences but no true zero -> Interval.
# Identifiers and codes (block, lot, zip code, address, ...) are labels only -> Nominal.
NOIR_Classification={"BOROUGH":N,
                    "NEIGHBORHOOD":N,
                    "BUILDING CLASS CATEGORY":O,
                    "TAX CLASS AT PRESENT":O,
                    "BLOCK":N,
                    "LOT":N,
                    "BUILDING CLASS AT PRESENT":N,
                    "ADDRESS":N,
                    "APARTMENT NUMBER":N,
                    "ZIP CODE":N,
                    "RESIDENTIALUNITS":R,
                    "COMMERCIALUNITS":R,
                    "TOTAL UNITS":R,
                    "LAND SQUARE FEET":R,
                    "GROSS SQUARE FEET":R,
                    "YEAR BUILT":I,
                    "TAX CLASS AT TIME OF SALE":O,
                    "BUILDING CLASSAT TIME OF SALE":N,
                    "SALE PRICE":R,
                    "SALE DATE":I,
                     'AGE': R,
                     'AGE_CATEGORY': O

                   }

In [None]:
# One line per column: its name followed by its measurement scale
for column_name, scale in NOIR_Classification.items():
    print("Column: {}\t\t\t Classification Type :{}".format(column_name, scale))

In [None]:
df['SALE PRICE'].describe()

In [None]:
# Side-by-side view of the raw sale price: box plot (left) and density plot (right)
fig, (box_ax, density_ax) = plt.subplots(1, 2, figsize=(15, 6))
box_ax.boxplot(df['SALE PRICE'])
box_ax.set_title('Sale Price - Box Plot')
sns.distplot(df['SALE PRICE'], ax=density_ax)
density_ax.set_title('Sale Price - Density Plot')
# Skewness
sale_price_skew = df['SALE PRICE'].skew()
print("Skewness: %f" % sale_price_skew)
plt.show()

##### Range set-up & Log-Transformation

In [None]:
# Fraction of rows whose sale price is exactly $0
zero_price_rows = df[df['SALE PRICE'].eq(0)]
len(zero_price_rows) / len(df)

In [None]:
# Fraction of rows whose sale price is below $50000
low_price_rows = df[df['SALE PRICE'].lt(50000)]
len(low_price_rows) / len(df)

In [None]:
# Fraction of rows whose sale price is above 12000000
high_price_rows = df[df['SALE PRICE'].gt(12000000)]
len(high_price_rows) / len(df)

##### Observation: 

It is observed that a lot of sales occur with an absurdly small number: \\$0  most commonly ( 40%  of the sale price =  \\$0 ). 
According to the original data source, these sales are in effect transfers of deeds between parties, for instance the transfer of ownership of a house from parents to their child after the parents move out for retirement. To handle this situation, a reasonable range for the sale price is set up: instances for which the sale price is greater than \\$12M or less than \\$50,000 are removed, which eliminates these special cases. After that, a log transformation can be performed. 

In [None]:
# Rows with YEAR BUILT equal to 0 are treated as outliers and removed
year_is_known = df['YEAR BUILT'].ne(0)
df = df[year_is_known]

In [None]:
df[df['YEAR BUILT']>1600]

In [None]:
# Distribution of YEAR BUILT for buildings built after 1600:
# 50-bin histogram with a KDE curve and rug marks
years_after_1600 = df.loc[df['YEAR BUILT'] > 1600, 'YEAR BUILT']
sns.distplot(years_after_1600, bins=50, kde=True, rug=True)
plt.show()

In [None]:
# Keep sales strictly between 50000 and 12000000, then plot the remaining prices
above_floor = df['SALE PRICE'].gt(50000)
below_ceiling = df['SALE PRICE'].lt(12000000)
df = df[above_floor & below_ceiling]
sns.distplot(df['SALE PRICE'])

In [None]:
# log(x) transform
# NOTE: this overwrites SALE PRICE in place, so re-running the cell applies the log a
# second time. From here on SALE PRICE holds the natural log of the price.

df["SALE PRICE"]=np.log(df["SALE PRICE"])

# plt.figure (rather than plt.subplots) so that no extra full-size axes is created
# underneath the two panels added with plt.subplot below.
plt.figure(figsize=(15,6))
plt.subplot(1,2,1)
sns.distplot(df["SALE PRICE"], fit = stats.norm)
plt.ylabel('Frequency')
# Mean and standard deviation of the fitted normal distribution, shown in the legend
(mu, sigma) = stats.norm.fit(df["SALE PRICE"])
# Raw string: "\m" and "\s" are LaTeX commands here, not Python string escapes.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')

plt.subplot(1,2,2)
plt.boxplot(df['SALE PRICE'])
plt.title('Sale Price - Box Plot')

#### Feature Analysis and Selection 

In [None]:
# The helper z-score columns are no longer needed; remove them and list what is left
zscore_helper_columns = ['salePrice_zscore', 'landSquare_zscore', 'grossSquare_zscore']
df = df.drop(columns=zscore_helper_columns)
df.columns

#### (i) Correlation 

In [None]:
#Colinearity heatmap
# Restrict to numeric columns explicitly: df still holds text and categorical columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of silently skipping them.
corr = df.select_dtypes(include=np.number).corr()
plt.figure(figsize=(12,6))
sns.heatmap(corr, annot=True)

#### (ii) Feature Analysis and Selection

a. BOROUGH

Explanation: The name of the borough in which the property is located.

1 = Manhattan\
2 = Bronx\
3 = Brooklyn\
4 = Queens\
5 = Staten Island

In [None]:
# Left panel: number of rows per borough. Right panel: mean SALE PRICE per borough.
# SALE PRICE was log-transformed in an earlier cell, so the mean is a mean of log prices.
plt.figure(figsize=(15,6))
plt.subplot(1,2,1)
plt.title("Number of Observations in each BOROUGH")
df['BOROUGH'].value_counts().plot(kind='bar')

# Average price by borough
plt.subplot(1,2,2)
plt.title("Mean Sale Price in each BOROUGH")
# Select SALE PRICE before aggregating: averaging every column is wasteful and raises
# on the text columns in pandas >= 2.0.
df.groupby('BOROUGH')['SALE PRICE'].mean().plot(kind = 'bar')

In [None]:
# Distribution of SALE PRICE within each borough.
# The columns are passed by name with data=df; the previous data=pd.melt(df) reshaped the
# whole frame at considerable cost although x and y were taken from df itself.
plt.figure(figsize=(15,6))
sns.boxplot(x='BOROUGH', y='SALE PRICE', data=df)
plt.show()

In [None]:
df['ZIP CODE'].nunique()

##### Finding top 10 zipcodes with highest sales

#### b. NEIGHBORHOOD

Explanation: While evaluating properties, the name of the neighborhood is determined by the Department of Finance assessors. Although the common name of a neighborhood is usually the same as the one designated by the Department of Finance, slight differences may occur in the neighborhood boundary lines. A few sub-neighborhoods might also not be included. 

In [None]:
# Best 50 neighborhoods having largest average building sale price
plt.figure(figsize=(30,10))
# Select SALE PRICE before aggregating: averaging every column is wasteful and raises
# on the text columns in pandas >= 2.0.
neighborhood_mean_price = df.groupby('NEIGHBORHOOD')['SALE PRICE'].mean()
neighborhood_mean_price.sort_values(ascending = False).head(50).plot(kind = 'bar')
plt.title("Best 50 neighborhoods with largest average building sale price")
plt.ylabel('Average sale price')

#### Building Class category

Explanation: This field is included to help users of the Rolling Sales Files identify similar properties by broad usage (such as One Family Homes) without having to look up individual Building Classes. The files are sorted by Borough, Neighborhood, Block, Building Class Category and Lot.  

In [None]:
df['BUILDING CLASS CATEGORY'].unique()

In [None]:
# Bar chart of how many rows fall into each building class category
fig, count_ax = plt.subplots(figsize=(15, 10))
category_sizes = df.groupby('BUILDING CLASS CATEGORY').size()
category_sizes.plot(kind='bar', ax=count_ax)
count_ax.set_title('Number of buildings in each building class categories')
count_ax.set_ylabel('Number of buildings')

fig.tight_layout()
plt.show()

In [None]:
# SALE PRICE spread per building class category, categories in sorted order
plt.figure(figsize=(20, 6))
price_values = df['SALE PRICE']
category_values = df['BUILDING CLASS CATEGORY']
order = sorted(category_values.unique())
sns.boxplot(x=category_values, y=price_values, data=df, order=order)
# Category labels are long, so print them vertically
plt.xticks(rotation=90)
plt.show()

In [None]:
df['TAX CLASS AT TIME OF SALE'].value_counts()

In [None]:
# Distribution of SALE PRICE for each tax class at time of sale.
# The columns are passed by name with data=df; the previous data=pd.melt(df) reshaped the
# whole frame at considerable cost although x and y were taken from df itself.
plt.figure(figsize=(10,5))
sns.boxplot(x='TAX CLASS AT TIME OF SALE', y='SALE PRICE', data=df)
plt.show()

In [None]:
## Block and Lot

In [None]:
# SALE PRICE spread per building class at time of sale, classes in order of first appearance
class_column = 'BUILDING CLASSAT TIME OF SALE'
plt.figure(figsize=(30, 10))
order = df[class_column].unique()
sns.boxplot(data=df, x=class_column, y='SALE PRICE', order=order)
# Many classes share the axis, so print the labels vertically
plt.xticks(rotation=90)
plt.show()

In [None]:
# Histogram of construction years (matplotlib's default binning)
year_built_values = df['YEAR BUILT']
plt.hist(year_built_values)
plt.title('Number of buildings built in different year in df')
plt.xlabel("Year Built")
plt.ylabel('Count')

In [None]:
# Mean (default colour) and median (red) SALE PRICE by construction year.
# SALE PRICE is selected before aggregating: aggregating every column is wasteful and
# raises on the text columns in pandas >= 2.0.
price_by_year = df.groupby('YEAR BUILT')['SALE PRICE']
plt.figure(figsize=(10,5))
plt.plot(price_by_year.mean())
plt.plot(price_by_year.median(),color='r')
plt.title('Average and Median Sale price of buildings built in different year in df')
plt.ylabel('Average or Median sale price')
plt.legend(['Average','Median'])

In [None]:
# Year Built: regression scatter against SALE PRICE (left) and box plot of the years (right)
fig, (reg_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 6))

sns.regplot(x='YEAR BUILT', y='SALE PRICE', data=df, ax=reg_ax)
reg_ax.set_title('Year Built vs. sale price ')
sns.boxplot(x='YEAR BUILT', data=df, ax=box_ax)
box_ax.set_title('Year Built boxplt')
plt.show()

In [None]:
# Density of RESIDENTIALUNITS with a fitted normal curve; x-axis limited to 0..250
units_ax = sns.distplot(df['RESIDENTIALUNITS'], fit=stats.norm)
units_ax.set_ylabel('Frequency')
units_ax.set_xlim([0, 250])

In [None]:
# Log-transform both area columns in place.
# NOTE: re-running this cell applies the log a second time.
df['LAND SQUARE FEET']=np.log(df['LAND SQUARE FEET'])
df['GROSS SQUARE FEET']=np.log(df['GROSS SQUARE FEET'])
# sns.jointplot builds its own figure (sized through `height`), so no plt.figure() call
# is made here -- the previous one only produced an extra empty figure.
a=sns.jointplot(x="LAND SQUARE FEET", y="SALE PRICE", data=df,
                  kind="reg", truncate=False,
                  color="m", height=7);
b=sns.jointplot(x="GROSS SQUARE FEET", y="SALE PRICE", data=df,
                  kind="reg", truncate=False,
                  color="m", height=7);

### 3.3 Summary

Based on the previous data visualization and analysis, we will drop these features: 

NEIGHBORHOOD, ADDRESS, APARTMENT NUMBER, BLOCK, LOT, ZIP CODE, TAX CLASS AT TIME OF SALE, 
BUILDING CLASS AT TIME OF SALE 

In [None]:
# Columns excluded from modelling
dropped_features = [
    "ADDRESS",
    "APARTMENT NUMBER",
    "LOT",
    "ZIP CODE",
    "BLOCK",
    "NEIGHBORHOOD",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASSAT TIME OF SALE",
]
df = df.drop(columns=dropped_features)

In [None]:
df.info()

In [None]:
# One-hot encode the categorical predictors and attach the indicator columns to the
# remaining columns of df.
categoricals= ['BOROUGH', 'BUILDING CLASS AT PRESENT','TAX CLASS AT PRESENT',
               'BUILDING CLASS CATEGORY']
one_hot_encoded = pd.get_dummies(df[categoricals])
df_new = df.drop(categoricals,axis = 1)
# drop=True discards the old row labels. Without it reset_index() inserts them as an
# 'index' column, which would end up among the predictors when X is built from df_new.
df_new = df_new.join(one_hot_encoded).reset_index(drop=True)
# Inspect the frame that was just built (df itself is unchanged by this cell).
df_new.info()

In [None]:
df_new

In [None]:
# Density of the log-transformed LAND SQUARE FEET with a fitted normal curve
land_ax = sns.distplot(df_new['LAND SQUARE FEET'], fit=stats.norm)
land_ax.set_ylabel('Frequency')
land_ax.set_title("log(LAND SQUARE FEET) density plot")

In [None]:
# Density of the log-transformed GROSS SQUARE FEET with a fitted normal curve
gross_ax = sns.distplot(df_new['GROSS SQUARE FEET'], fit=stats.norm)
gross_ax.set_ylabel('Frequency')
gross_ax.set_title("log(GROSS SQUARE FEET) density plot")

In [None]:
df.columns

In [None]:
df_new.columns

In [None]:
# AGE_CATEGORY was not one-hot encoded, so it is removed before modelling
df_new = df_new.drop(columns='AGE_CATEGORY')

In [None]:
from sklearn.model_selection import train_test_split

# SALE PRICE is the target; every other column of df_new is a predictor.
target_column = 'SALE PRICE'
y = df_new[target_column]
X = df_new.drop(columns=[target_column])
# Hold out 30% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score

# Fit an ordinary linear regression on the training split and score it on the test split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

lr_mse = metrics.mean_squared_error(y_test, y_pred)
print("The R2 score of linear regression model is ", r2_score(y_test,y_pred))
print('MSE:', lr_mse)
print('RMSE:', np.sqrt(lr_mse))

# RANSAC Regression

In [None]:
from sklearn.linear_model import RANSACRegressor
import matplotlib.pyplot as plt

# RANSAC draws random subsets of the training data, so the seed is fixed to make the fit
# (and the metrics below) reproducible, as is done for the other seeded models here.
model = RANSACRegressor(max_trials=1000, random_state=42)
model.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

print("The R2 score of RANSAC Regression model is ", r2_score(y_test, test_pred))
print('MSE:', metrics.mean_squared_error(y_test, test_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, test_pred)))

# Scatter of actual (x) against predicted (y) values on the test split.
# NOTE(review): the title says "Residual plot", but no residuals are computed here.
plt.scatter(y_test, test_pred)

plt.title('Residual plot of RANSAC Regression')

# Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear model with alpha = 0.1, scored on the test split
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train, y_train)
lasso_reg_predict = lasso_reg.predict(X_test)

lasso_mse = metrics.mean_squared_error(y_test, lasso_reg_predict)
print("The R2 score of Lasso Regression model is ", r2_score(y_test, lasso_reg_predict))
print('MSE:', lasso_mse)
print('RMSE:', np.sqrt(lasso_mse))

# Actual (x) against predicted (y) values
plt.title('Residual plot of Lasso Regression')

plt.scatter(y_test, lasso_reg_predict)

# Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear model with alpha = 0.1, scored on the test split
ridge_reg = Ridge(alpha=0.1)
ridge_reg.fit(X_train, y_train)
ridge_reg_predict = ridge_reg.predict(X_test)

ridge_mse = metrics.mean_squared_error(y_test, ridge_reg_predict)
print("The R2 score of Ridge Regression model is ", r2_score(y_test, ridge_reg_predict))
print('MSE:', ridge_mse)
print('RMSE:', np.sqrt(ridge_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, ridge_reg_predict)
plt.title('Residual plot of Ridge Regression')

# ElasticNet Model

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net (alpha 0.1, 90% L1) with random coefficient selection and a fixed seed
elastic_params = dict(alpha=0.1, l1_ratio=0.9, selection='random', random_state=42)
model = ElasticNet(**elastic_params)
model.fit(X_train, y_train)
elastic_predict = model.predict(X_test)

elastic_mse = metrics.mean_squared_error(y_test, elastic_predict)
print("The R2 score of ElasticNet model is ", r2_score(y_test, elastic_predict))
print('MSE:', elastic_mse)
print('RMSE:', np.sqrt(elastic_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, elastic_predict)
plt.title('Residual plot of Elastic Net')

# XGBoost Regression

# Light GBM Regression - Gradient Boosting Machine

In [None]:
import lightgbm
import time

# LightGBM regressor with default settings and a fixed seed
lgbm = lightgbm.LGBMRegressor(random_state=42)
# CPU-time stamp taken just before fitting (not read again in this cell)
start = time.process_time()
model = lgbm.fit(X_train, y_train)
lgbm_predict = model.predict(X_test)

lgbm_mse = metrics.mean_squared_error(y_test, lgbm_predict)
print("The R2 score of Light GBM Regression model is ", r2_score(y_test, lgbm_predict))
print('MSE:', lgbm_mse)
print('RMSE:', np.sqrt(lgbm_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, lgbm_predict)
plt.title('Residual plot of LGBM')

# Feature Importance of the Ridge Model

In [None]:
# Rank the predictors by the absolute value of their Ridge coefficient
coefficient_magnitude = abs(ridge_reg.coef_)
ridge_result = pd.DataFrame({'feature': X_train.columns, 'importance': coefficient_magnitude})
# Ascending order puts the largest bar at the top of the horizontal chart
ridge_result_sorted = ridge_result.sort_values(by='importance', ascending=True)

fig, ax1 = plt.subplots(figsize=(13, 16))
ax1.barh(ridge_result_sorted['feature'], ridge_result_sorted['importance'])
ax1.set_title('Feature importance of Ridge')
ax1.set_ylabel('feature importance')
plt.show()

# Deep Neural Multilayer Perceptron - MLP Regression

In [None]:
# Multilayer perceptron with default architecture and a fixed seed
mlp = MLPRegressor(random_state=42)
# CPU-time stamp taken just before fitting (not read again in this cell)
start = time.process_time()
model = mlp.fit(X_train, y_train)
mlp_predict = model.predict(X_test)

mlp_mse = metrics.mean_squared_error(y_test, mlp_predict)
print("The R2 score of MLP model is ", r2_score(y_test, mlp_predict))
print('MSE:', mlp_mse)
print('RMSE:', np.sqrt(mlp_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, mlp_predict)
plt.title('Residual plot of MLP')

# K Neighbors Regression

In [None]:
# k-nearest-neighbours regressor with default settings
knn = KNeighborsRegressor()
# CPU-time stamp taken just before fitting (not read again in this cell)
start = time.process_time()
model = knn.fit(X_train, y_train)
knn_predict = model.predict(X_test)

knn_mse = metrics.mean_squared_error(y_test, knn_predict)
print("The R2 score of KNN model is ", r2_score(y_test, knn_predict))
print('MSE:', knn_mse)
print('RMSE:', np.sqrt(knn_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, knn_predict)
plt.title('Residual plot of KNN')


# Decision Tree Regression

In [None]:
# Single decision tree with a fixed seed
dtree = DecisionTreeRegressor(random_state=42)
# CPU-time stamp taken just before fitting (not read again in this cell)
start = time.process_time()
model = dtree.fit(X_train, y_train)
dtree_predict = model.predict(X_test)

dtree_mse = metrics.mean_squared_error(y_test, dtree_predict)
print("The R2 score of Decision Tree Regression model is ", r2_score(y_test, dtree_predict))
print('MSE:', dtree_mse)
print('RMSE:', np.sqrt(dtree_mse))

# Actual (x) against predicted (y) values
plt.scatter(y_test, dtree_predict)
plt.title('Residual plot of Decision Tree')

# Variable Importances

In [None]:
# Feature importances of the decision tree, largest first.
# `dtree` is referenced directly instead of the generic `model` name, which other cells
# rebind to different estimators -- so this cell no longer depends on which model cell
# happened to run last.
rankings = dtree.feature_importances_.tolist()
importance = pd.DataFrame(
    {"variable": X_train.columns, "importance": rankings}
).sort_values("importance", ascending=False)
plt.figure(figsize=(15,10))
sns.barplot(x="importance",
            y="variable",
            data=importance)
plt.title('Variable Importances')
plt.tight_layout()


# Random Forest Regression

In [None]:
# Random forest of 10 trees; the seed is fixed so the metrics are reproducible.
model = RandomForestRegressor(n_estimators=10, random_state=42)
# Fit on the data as-is. Casting to int truncated the log-transformed target (and the
# log-transformed area features) to whole numbers at fit time, while predictions were
# made on the un-cast X_test -- the model was trained and evaluated on different scales.
model.fit(X_train, y_train)
rand_forest_predict = model.predict(X_test)

print("The R2 score of Random Forest Regression model is ", r2_score(y_test, rand_forest_predict))
print('MSE:', metrics.mean_squared_error(y_test, rand_forest_predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rand_forest_predict)))

# Scatter of actual (x) against predicted (y) values on the test split
plt.scatter(y_test, rand_forest_predict)
plt.title('Residual plot of Random Forest')


# Cat boost

In [None]:
# `cb` was used without ever being imported; import it where it is first needed
# (the same way the LightGBM cell imports its own library).
import catboost as cb

# CatBoost regressor optimising RMSE, scored on the test split
modelcat = cb.CatBoostRegressor(loss_function='RMSE')
modelcat.fit(X_train,y_train)
cat_predict = modelcat.predict(X_test)

print("The R2 score of Cat Boost model is ", r2_score(y_test, cat_predict))
print('MSE:', metrics.mean_squared_error(y_test, cat_predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, cat_predict)))

# Scatter of actual (x) against predicted (y) values on the test split
plt.scatter(y_test, cat_predict)
plt.title('Residual plot of Cat Boost')

In [None]:
# Pairwise correlations of every column in the model frame
mtr = df_new.corr()
# Mask the upper triangle (diagonal included) so each pair is drawn only once
mask = np.zeros(mtr.shape)
upper_rows, upper_cols = np.triu_indices_from(mask)
mask[upper_rows, upper_cols] = True
# Figure that hosts the heatmap
fig, ax = plt.subplots(figsize=(16, 12))
plt.suptitle("Correlation Matrix of Columns in Heatmap", size=24)
# Annotated heatmap of the unmasked (lower-triangle) cells
annotation_style = {"size": 10}
sns.heatmap(mtr, mask=mask, annot=True, annot_kws=annotation_style);

# Feature Importance of the Best Model: Cat boost

In [None]:
# Horizontal bar chart of the CatBoost feature importances.
# (A dangling `import` statement with no module name was removed; it was a SyntaxError
# that prevented the cell from running.)
feature_importance = modelcat.feature_importances_
# Ascending order: the least important feature is drawn at the bottom, the most at the top
sorted_idx = np.argsort(feature_importance)
fig = plt.figure(figsize=(12, 15))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X_test.columns)[sorted_idx])
plt.title('Feature Importance')