In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from scipy.stats import skew
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
%matplotlib inline

In [2]:
# Seed NumPy's global random number generator so reruns of the notebook are repeatable.
np.random.seed(21)

In [3]:
# Load the train and test tables from CSV; paths are relative to the working directory.
# NOTE(review): file names suggest these are outputs of an earlier cleaning step - confirm.
train = pd.read_csv("transform/train_transform_after_cleaning.csv")
test = pd.read_csv("transform/test_transform_after_cleaning.csv")

In [4]:
# Display (rows, columns) of the loaded training frame.
train.shape

(1456, 162)

In [5]:
# Display (rows, columns) of the loaded test frame.
test.shape

(1459, 161)

# Проекция числовых на категории

In [6]:
def do_proj_num(allData):
    """Recode the numeric ``MSSubClass`` and ``MoSold`` columns as string labels.

    Args:
        allData: DataFrame containing ``MSSubClass`` and ``MoSold`` columns.

    Returns:
        A new DataFrame (the input is not modified) in which listed codes are
        replaced by their labels and both columns are cast to ``object`` dtype.
        Values that are not listed in the mappings are left as they are.
    """
    subclass_labels = {
        20: "SC20", 30: "SC30", 40: "SC40", 45: "SC45",
        50: "SC50", 60: "SC60", 70: "SC70", 75: "SC75",
        80: "SC80", 85: "SC85", 90: "SC90", 120: "SC120",
        150: "SC150", 160: "SC160", 180: "SC180", 190: "SC190",
    }
    month_labels = {
        1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
        7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec",
    }

    # A nested mapping makes ``replace`` apply each dictionary to its own column only.
    recoded = allData.replace({"MSSubClass": subclass_labels, "MoSold": month_labels})
    for column in ("MSSubClass", "MoSold"):
        recoded[column] = recoded[column].astype("object")
    return recoded

# One Hot Encoding

In [7]:
def one_hot_encode(df):
    """One-hot encode every ``object``-dtype column of ``df``.

    Prints the number and names of the categorical columns, the total column
    count after encoding, and the ``DataFrame.info()`` summary of the result.

    Args:
        df: DataFrame to encode. It is not modified.

    Returns:
        tuple: ``(encoded, dummies)`` where ``encoded`` is the encoded frame and
        ``dummies`` is an Index with the names of the generated indicator columns.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns

    print(len(categorical_cols), "categorical columns")
    print(categorical_cols)

    # Number of columns that pass through unencoded.
    n_passthrough = len(df.columns) - len(categorical_cols)
    df = pd.get_dummies(df, columns = categorical_cols)
    # get_dummies keeps the untouched columns first and appends the indicator
    # columns after them, so the tail of the column index is the indicator set.
    # This avoids encoding the categorical columns a second time just for names.
    dummies = df.columns[n_passthrough:]

    print("Total Columns:",len(df.columns))
    # info() writes its report itself and returns None; wrapping it in print()
    # only added a stray "None" line to the output.
    df.info()

    return df, dummies

# Label Encoding

In [8]:
def do_encode_label(y):
    """Encode the labels in ``y`` as integer codes using ``LabelEncoder``.

    Args:
        y: 1-D array-like of labels.

    Returns:
        The result of ``LabelEncoder().fit_transform(y)``: one integer code per
        element of ``y``.
    """
    # The notebook imports ``LabelEncoder`` by name; the ``preprocessing`` module
    # itself is never imported, so ``preprocessing.LabelEncoder`` raised NameError.
    le = LabelEncoder()
    return le.fit_transform(y)

# Frequency Encoding

In [9]:
def do_cat_freq(allData):
    """Frequency-encode every ``object``-dtype column of ``allData``.

    Each categorical value is replaced by the number of rows in which it occurs
    in that column (missing values are counted as a category of their own).

    Args:
        allData: DataFrame to encode. It is left unmodified.

    Returns:
        A copy of ``allData`` with the categorical columns replaced by counts.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    encoded = allData.copy()
    # The original looked the columns up on an undefined ``df`` and wrote the
    # result to a misspelled ``alldata``; both now refer to the same frame.
    categorical_cols = encoded.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        count_map_col = encoded[col].value_counts(dropna=False).to_dict()
        encoded[col] = encoded[col].map(count_map_col)
    return encoded

# Бининг

KBinsDiscretizer 

Feature Binning: Conversion of a continuous variable to categorical.

In [10]:
def neighborHoodScore(df,npdict=None):
    """Add an ``Nval`` price-tier column derived from ``Neighborhood``.

    When ``npdict`` is not supplied it is built from ``df``: the mean
    ``SalePrice`` of each neighbourhood is converted to a tier from 0 to 4,
    equal to the number of price thresholds the mean strictly exceeds.

    Args:
        df: DataFrame with a ``Neighborhood`` column (and ``SalePrice`` when
            ``npdict`` is None). ``Nval`` is written into it in place.
        npdict: optional ready-made ``{neighbourhood: tier}`` mapping.

    Returns:
        tuple: ``(df, npdict)`` - the same frame object and the mapping used.
        Neighbourhoods absent from the mapping get NaN in ``Nval``.
    """
    if npdict is None:
        # Upper-inclusive tier edges: a mean equal to an edge stays in the lower tier.
        tier_edges = (100000, 139000, 199000, 250000)
        mean_price = df.groupby("Neighborhood")['SalePrice'].mean()
        npdict = {
            name: sum(1 for edge in tier_edges if avg > edge)
            for name, avg in mean_price.items()
        }
    df["Nval"] = df["Neighborhood"].map(npdict)
    return df,npdict

# Трансформация числовых

In [11]:
def transformContinuous(alldata, skew_index):
    """Replace each listed column with ``log(x + 1)``, in place.

    Args:
        alldata: DataFrame holding the columns to transform; it is modified.
        skew_index: iterable of column names to transform.

    Returns:
        The same ``alldata`` object, after the transformation.
    """
    for column in skew_index:
        shifted = alldata[column] + 1.
        alldata[column] = np.log(shifted)
    return alldata

# Масштабирование

In [12]:
#scaler = MinMaxScaler()
#scaler = RobustScaler()
#scaler = StandardScaler()

In [13]:
def scaleData(trainData, testData, trainCols, testCols, scaler):
    """Scale the selected train and test columns with ``scaler``.

    The scaler is fitted on the training columns only and then applied to the
    test columns, so both sets are scaled with the same (training) statistics.

    Args:
        trainData: training DataFrame.
        testData: test DataFrame.
        trainCols: columns of ``trainData`` to scale.
        testCols: columns of ``testData`` to scale.
        scaler: object exposing ``fit_transform`` and ``transform``.

    Returns:
        tuple: ``(train_scaled, test_scaled)`` - new DataFrames containing only
        the scaled columns and keeping the row index of their inputs.
    """
    import warnings

    train_scaled = pd.DataFrame(
        scaler.fit_transform(trainData[trainCols]),
        columns=trainCols,
        # Keep the caller's index so the result aligns when assigned back.
        index=trainData.index,
    )

    if list(trainCols) == list(testCols):
        # Reuse the statistics learned from the training data; re-fitting on the
        # test set would scale it with different parameters than the train set.
        test_values = scaler.transform(testData[testCols])
    else:
        # A scaler fitted on one column set cannot be applied to another, so
        # keep the previous behaviour here, but make the problem visible.
        warnings.warn(
            "scaleData: trainCols and testCols differ; the scaler is re-fitted "
            "on the test data, so train and test are scaled inconsistently.",
            stacklevel=2,
        )
        test_values = scaler.fit_transform(testData[testCols])

    test_scaled = pd.DataFrame(test_values, columns=testCols, index=testData.index)
    return train_scaled, test_scaled

# Удаление малополезных признаков

In [14]:
def dropZeros(alldata):
    """Report the columns of ``alldata`` that hold a single distinct value.

    Missing values count as a value, so an all-NaN column is reported too.
    Nothing is removed here: the frame is handed back unchanged and the caller
    decides what to do with the reported names.

    Args:
        alldata: DataFrame to inspect.

    Returns:
        tuple: ``(alldata, drop)`` - the same frame and the list of constant
        column names, in column order.
    """
    constant_cols = [
        name
        for name in alldata.columns
        if alldata[name].value_counts(dropna=False).size == 1
    ]
    return alldata, constant_cols

# Выбор функций

In [15]:
# Re-check the training frame's (rows, columns) before building the transformed copies.
train.shape

(1456, 162)

In [16]:
# Re-check the test frame's (rows, columns) before building the transformed copies.
test.shape

(1459, 161)

In [17]:
# Work on independent deep copies so the loaded frames stay available unchanged.
train_transform = train.copy(deep=True)
test_transform = test.copy(deep=True)

### Проекция числовых на категории

In [18]:
# Recode MSSubClass and MoSold as string categories in both frames.
train_transform = do_proj_num(train_transform)
test_transform = do_proj_num(test_transform)

### Трансформация числовых

In [19]:
# Split the training columns by dtype: object columns are categorical, the rest numeric.
categorical_features = train_transform.select_dtypes(include = ["object"]).columns
numerical_features = train_transform.select_dtypes(exclude = ["object"]).columns
# Leave the target and the row identifier out of the numeric feature list.
# Index.drop takes the labels as ONE list-like argument; a second positional
# argument is the ``errors`` flag, so drop("SalePrice", "Id") never removed "Id".
numerical_features = numerical_features.drop(["SalePrice", "Id"])

In [20]:
# Find skewed numerical features
skew_features = train_transform[numerical_features].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skew_features.head(10)

There are 69 numerical features with Skew > 0.5 :


SimplPoolQC       26.299727
MiscVal           24.418175
SimplPoolScore    23.689546
PoolScore         19.568931
PoolQCnum         19.510633
PoolArea          17.504556
LotArea           12.574590
3SsnPorch         10.279262
LowQualFinSF       8.989291
TotalBsmtSF-3      5.822403
dtype: float64

In [21]:
# Apply log(x + 1) to the skewed columns; the column list comes from the training data
# and is reused for the test frame.
train_transform = transformContinuous(train_transform, skew_index)
test_transform = transformContinuous(test_transform, skew_index)

### Бининг

In [22]:
train_transform, npdict = neighborHoodScore(train_transform)
test_transform = neighborHoodScore(test_transform, npdict)

### Кодирование категорий

In [23]:
train_transform, _ = one_hot_encode(train)
test_transform, _ = one_hot_encode(test)

43 categorical columns
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
Total Columns: 383
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1456 entries, 0 to 1455
Columns: 383 entries, Id to SaleCondition_Partial
dtypes: float64(18), int64(101), uint8(264)
memory usage: 1.7 MB
None
43 categorical columns
Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities'

### Масштабирование

In [24]:
# Column totals are computed once and reused for the three block-list checks.
column_totals = train_transform.sum()
all_columns = train_transform.columns
# Block-list the columns whose values add up to 2, 0 or 1 (kept in that order).
blocklist = [col for total in (2, 0, 1) for col in all_columns[column_totals == total]]
# Columns to scale: everything except the target / identifier.
scaleTrainVars = [col for col in all_columns if col not in ('SalePrice', 'Id')]
scaleTestvars = [col for col in test_transform.columns if col != 'Id']
# Model columns: every non-target training column that is not block-listed.
trainvars = [col for col in all_columns if col != 'SalePrice' and col not in blocklist]
trainalldrop = []

In [25]:
#train_transform[scaleTrainVars], test_transform[scaleTestvars] = scaleData(train_transform, test_transform, scaleTrainVars, scaleTestvars, scaler)

In [26]:
# Display the Id column of the encoded training frame.
train_transform['Id']

0          1
1          2
2          3
3          4
4          5
        ... 
1451    1456
1452    1457
1453    1458
1454    1459
1455    1460
Name: Id, Length: 1456, dtype: int64

### Удаление малополезных признаков

In [27]:
# Move the training columns that the encoded test frame lacks from ``trainvars``
# to ``trainalldrop``. The lists are rebuilt instead of calling
# ``trainvars.remove`` inside a loop over ``trainvars``: removing while iterating
# skips the element that follows each removed one, so a missing column could
# survive in ``trainvars``.
test_columns = set(test_transform.columns)
missing_in_test = [c for c in trainvars if c != 'SalePrice' and c not in test_columns]
trainalldrop.extend(missing_in_test)
trainvars = [c for c in trainvars if c not in missing_in_test]

In [28]:
# Display the training columns that were removed because the test frame lacks them.
trainalldrop

['HouseStyle_2.5Fin', 'GarageQual_Ex']

In [29]:
# List the single-valued columns of each frame. dropZeros returns the frame unchanged,
# and dropTrain / dropTest are not used by any cell shown in this notebook.
train_transform, dropTrain = dropZeros(train_transform)
test_transform, dropTest = dropZeros(test_transform)

In [30]:
# Keep only the selected feature columns in the test frame, in trainvars order.
test_transform = test_transform[trainvars]

In [31]:
# Keep the selected feature columns plus the target (as the last column) in the
# training frame. The membership check makes the cell safe to re-run: an
# unconditional append would add 'SalePrice' again and duplicate the column.
if 'SalePrice' not in trainvars:
    trainvars.append('SalePrice')
train_transform = train_transform[trainvars]

### Сохранение результатов

In [32]:
# Preview the first rows of the prepared training frame.
train_transform.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,1,60,65.0,8450,7,5,7,7,196.0,706,...,0,0,1,0,0,0,0,1,0,208500
1,2,20,80.0,9600,6,8,34,34,0.0,978,...,0,0,1,0,0,0,0,1,0,181500
2,3,60,68.0,11250,7,5,9,8,162.0,486,...,0,0,1,0,0,0,0,1,0,223500
3,4,70,60.0,9550,7,5,95,40,0.0,216,...,0,0,1,1,0,0,0,0,0,140000
4,5,60,84.0,14260,8,5,10,10,350.0,655,...,0,0,1,0,0,0,0,1,0,250000


In [33]:
# Preview the first rows of the prepared test frame.
test_transform.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,49,49,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,52,52,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,13,12,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,12,12,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,18,18,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [34]:
# Display (rows, columns) of the prepared training frame.
train_transform.shape

(1456, 346)

In [35]:
# Display (rows, columns) of the prepared test frame.
test_transform.shape

(1459, 345)

In [36]:
#train_transform.to_csv("transform/train_transform_after_preparation.csv", index = False)
#test_transform.to_csv("transform/test_transform_after_preparation.csv", index = False)

In [37]:
# Write both prepared frames to Feather files; paths are relative to the working directory.
train_transform.to_feather("transform/train_transform_after_preparation.feather")
test_transform.to_feather("transform/test_transform_after_preparation.feather")