In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## for feature selection

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Show all the columns when a DataFrame is displayed (None removes the column limit).
# Uses the public pd.set_option entry point rather than the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

In [2]:
# Read the input table; the path is relative to the notebook's working directory.
DATASET_PATH = 'dataset_engg.csv'
dataset = pd.read_csv(DATASET_PATH)

## Splitting data into train and test sets

In [3]:
# Every column except 'Id' and 'SalePrice' (the target) is used as a model input.
excluded_columns = ('Id', 'SalePrice')
all_features = [column for column in dataset.columns if column not in excluded_columns]

In [4]:
# Plain NumPy arrays: X holds the feature columns, y the SalePrice target.
X = dataset.loc[:, all_features].to_numpy()
y = dataset.loc[:, 'SalePrice'].to_numpy()

In [5]:
# Hold out 20% of the rows as the test split; the fixed random_state makes the
# split repeatable. train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32
)

## Perform Feature Scaling

In [6]:
# The scaler fitted on the target expects 2-D input, so turn each 1-D target
# vector into a single-column matrix.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [7]:
# Fit one min-max scaler for the features and one for the target, using the
# training split only. MinMaxScaler is imported in the notebook's top import cell.
scaler_X = MinMaxScaler()
scaler_X.fit(X_train)
scaler_y = MinMaxScaler()
# Left as the last bare expression so the cell still displays the fitted scaler.
scaler_y.fit(y_train)

MinMaxScaler()

In [8]:
# Apply the fitted scalers to both splits.
# NOTE: this cell rebinds its own inputs, so running it twice scales the data twice.
X_train, X_test = scaler_X.transform(X_train), scaler_X.transform(X_test)
y_train, y_test = scaler_y.transform(y_train), scaler_y.transform(y_test)

In [9]:
# Collapse the single-column target matrices back into 1-D vectors.
y_train, y_test = (column.flatten() for column in (y_train, y_test))

In [10]:
# (rows, columns) of the scaled training matrix, shown as the cell's output.
X_train.shape

(1168, 82)


## Perform Feature Selection

In [11]:
### Apply Feature Selection
# Fit an L1-penalised linear model (Lasso) and let SelectFromModel keep the
# features whose fitted coefficients are not shrunk to (effectively) zero;
# for an L1-penalised estimator its default cut-off is 1e-5.
# alpha is the penalty strength: the larger the alpha, the fewer features are kept.
# random_state is pinned for reproducibility; per the scikit-learn documentation
# it only influences Lasso when selection='random' is used.
lasso_estimator = Lasso(alpha=0.0001, random_state=32)
feature_sel_model = SelectFromModel(lasso_estimator)
feature_sel_model.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.0001, random_state=32))

In [12]:
# Disabled experiment (fully commented out, so nothing in this cell runs):
# a 10-fold grid search over Lasso alpha values 0.0001 ... 0.0024, presumably
# used to choose the alpha for the feature-selection model — TODO confirm.
# NOTE(review): the SVR import below is not used by this snippet.
# from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV
# alpha = [i*0.0001 for i in range(1,25)]
# tuned_parameters = [{'alpha': alpha}]
# grid_search = GridSearchCV(estimator = Lasso(),
#                            param_grid = tuned_parameters,
#                            cv = 10,
#                            n_jobs = -1)
# grid_search = grid_search.fit(X_train, y_train)
# best_score = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Score: {:.2f} %".format(best_score*100))
# print("Best Parameters:", best_parameters)

In [13]:
# Boolean mask over the input columns: True where the selector kept the feature.
selected_columns = feature_sel_model.get_support(indices=False)

In [14]:
def get_updated_array(X_train, all_features, selected_columns):
    """Return the selected feature columns of a matrix as a labelled DataFrame.

    Despite the parameter name, the notebook calls this for both the training
    and the test matrix.

    Parameters
    ----------
    X_train : 2-D array-like
        Feature matrix with one column per entry of ``all_features``.
    all_features : sequence of str
        Column labels for ``X_train``, in column order.
    selected_columns : boolean mask
        One flag per column of ``X_train``; True marks a column to keep.

    Returns
    -------
    pandas.DataFrame
        The kept columns only, labelled with their feature names.
    """
    labelled = pd.DataFrame(data=X_train, columns=all_features)
    # Index the column labels with the mask to get the names that survive.
    kept_names = labelled.columns[selected_columns]
    return labelled[kept_names]

In [15]:
# Reduce the training matrix to the selected features.
# NOTE: X_train is rebound to the narrower matrix here, so re-running this cell
# on its own would pair the already-reduced matrix with the full all_features list.
train_df_upd = get_updated_array(
    X_train=X_train,
    all_features=all_features,
    selected_columns=selected_columns,
)
X_train = train_df_upd.to_numpy()

In [21]:
# Number of columns currently in the reduced training frame.
# NOTE(review): this cell carries execution count 21, i.e. it was executed after
# the cell that adds 'SalePrice'; the recorded value appears to include that
# target column — confirm with a fresh top-to-bottom run.
train_df_upd.shape[1]

58

In [16]:
# Reduce the test matrix to the same selected features as the training matrix.
# NOTE: X_test is rebound to the narrower matrix here, so re-running this cell
# on its own would pair the already-reduced matrix with the full all_features list.
test_df_upd = get_updated_array(
    X_train=X_test,
    all_features=all_features,
    selected_columns=selected_columns,
)
X_test = test_df_upd.to_numpy()

## Save data

In [17]:
# Attach the target to each frame so that features and label are saved together.
# These are the min-max-scaled SalePrice values produced earlier, not raw prices.
train_df_upd = train_df_upd.assign(SalePrice=y_train)
test_df_upd = test_df_upd.assign(SalePrice=y_test)

In [18]:
# Preview the first five rows of the reduced test frame (features plus SalePrice).
test_df_upd.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotArea,LandContour,LotConfig,Neighborhood,Condition1,Condition2,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtUnfSF,Heating,HeatingQC,CentralAir,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,FullBath,HalfBath,KitchenAbvGr,KitchenQual,Functional,Fireplaces,FireplaceQu,GarageYrBlt,GarageFinish,GarageCars,GarageQual,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,PoolQC,MiscFeature,YrSold,SaleType,SaleCondition,GarageYrBltnan,SalePrice
0,0.823529,0.25,0.052965,0.333333,0.0,0.090909,0.4,1.0,1.0,0.555556,0.875,0.272059,0.033333,0.0,0.0,0.6,0.5,0.5,0.333333,1.0,0.5,0.5,0.75,0.25,0.666667,0.028703,0.833333,0.137414,1.0,0.75,1.0,0.103018,0.244068,0.0,0.317888,0.0,0.333333,0.5,0.333333,0.666667,1.0,0.0,0.2,0.345794,0.333333,0.25,0.666667,0.291715,0.0,0.0,0.0,0.0,1.0,0.75,0.666667,0.75,0.0,0.393953
1,0.0,0.75,0.665913,0.333333,1.0,0.727273,0.4,1.0,0.6,0.555556,0.25,0.367647,0.533333,1.0,0.0,0.2,0.6,0.75,0.666667,1.0,1.0,0.5,0.75,0.25,0.166667,0.215982,0.833333,0.349315,1.0,0.5,1.0,0.753982,0.0,0.0,0.683867,0.333333,1.0,0.0,0.666667,0.333333,0.0,0.666667,0.6,0.299065,0.666667,0.5,0.666667,0.0,0.0,0.362319,0.0,0.0,1.0,0.25,0.666667,0.25,0.0,0.567257
2,0.588235,0.75,0.234455,0.333333,0.0,0.681818,0.4,1.0,0.6,0.666667,0.5,0.0,0.0,1.0,0.0,1.0,1.0,0.5,0.666667,1.0,1.0,0.75,0.75,0.75,1.0,0.189582,0.833333,0.122432,1.0,1.0,1.0,0.550091,0.0,0.0,0.481655,0.333333,0.666667,0.0,0.333333,0.666667,1.0,0.333333,0.8,0.0,1.0,0.75,0.666667,0.187865,0.036563,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.634602
3,0.235294,0.75,0.489284,0.333333,0.0,0.954545,0.4,1.0,1.0,0.888889,0.5,0.007353,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.75,1.0,0.833333,0.0,0.833333,0.742295,1.0,1.0,1.0,0.607283,0.526877,0.0,0.72893,0.0,1.0,0.5,0.333333,1.0,1.0,0.333333,0.8,0.0,0.666667,0.75,0.666667,0.060677,0.310786,0.0,0.4,0.0,1.0,0.75,1.0,1.0,0.0,0.915545
4,0.235294,0.75,0.411848,0.333333,1.0,0.636364,0.4,1.0,1.0,0.666667,0.625,0.051471,0.116667,0.0,0.0,1.0,1.0,0.5,0.666667,1.0,1.0,0.75,0.75,0.25,1.0,0.085578,0.833333,0.196062,1.0,1.0,1.0,0.366135,0.430024,0.0,0.559246,0.333333,0.666667,0.5,0.333333,0.666667,1.0,0.333333,0.6,0.065421,0.666667,0.5,0.666667,0.224037,0.071298,0.0,0.0,0.0,1.0,1.0,0.666667,0.75,0.0,0.621666


In [19]:
# Write both reduced frames to CSV in the working directory, without the row index.
for frame, csv_name in (
    (train_df_upd, 'train_df_upd.csv'),
    (test_df_upd, 'test_df_upd.csv'),
):
    frame.to_csv(csv_name, index=False)