In [1]:
'''
Main module for preprocessing.
'''
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import RobustScaler

PATH = './data/train.csv'

def encoding_values(df):
    '''
    encode categorical data to float
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''
    df_objects = (df.dtypes=='object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_values(df):
    '''
    Impute missing values in features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = KNNImputer(n_neighbors=20)
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def scaling_values(df):
    '''
    Scaling features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = RobustScaler()
    if 'SalePrice' in df:
        x_train = df.drop(['Id', 'SalePrice'], axis=1)
    else:
        x_train = df.drop(['Id'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Id', value=df['Id'])
    if 'SalePrice' in df:
        scaled_data['SalePrice'] = df['SalePrice']
    return scaled_data

def apply_preprocessing(df):
    '''
    Applying data cleaning functions to data sets
    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    # since some columns have to much NaN we will drop them here
    df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    df = encoding_values(df)
    df = impute_values(df)
    df = scaling_values(df)
    return df

def get_df():
    '''
    Sharing dataframe after aplying preprocessing
    Returns:
        pandas.DataFrame
    '''
    df = pd.read_csv(PATH)
    df = apply_preprocessing(df)
    return df


In [2]:
PATH = '../data/train.csv'
TEST_PATH = '../data/test.csv'
df = get_df()
test_df = pd.read_csv(TEST_PATH)

In [3]:
test_df = apply_preprocessing(test_df)
test_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461.0,-0.6,-1.0,0.55,0.538713,0.0,0.0,0.0,0.0,0.0,...,-0.388889,0.0,0.0,120.0,0.0,0.0,0.00,1.0,0.0,0.0
1,1462.0,-0.6,0.0,0.60,1.179692,0.0,-1.0,0.0,0.0,-2.0,...,0.111111,0.0,0.0,0.0,0.0,12500.0,0.00,1.0,0.0,0.0
2,1463.0,0.2,0.0,0.25,1.073791,0.0,-1.0,0.0,0.0,0.0,...,0.083333,0.0,0.0,0.0,0.0,0.0,-0.75,1.0,0.0,0.0
3,1464.0,0.2,0.0,0.45,0.140313,0.0,-1.0,0.0,0.0,0.0,...,0.111111,0.0,0.0,0.0,0.0,0.0,0.00,1.0,0.0,0.0
4,1465.0,1.4,0.0,-1.30,-1.064825,0.0,-1.0,-2.0,0.0,0.0,...,0.750000,0.0,0.0,144.0,0.0,0.0,-1.25,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915.0,2.2,1.0,-2.40,-1.808554,0.0,0.0,0.0,0.0,0.0,...,-0.388889,0.0,0.0,0.0,0.0,0.0,0.00,-1.0,0.0,0.0
1455,2916.0,2.2,1.0,-2.40,-1.818733,0.0,0.0,0.0,0.0,0.0,...,-0.055556,0.0,0.0,0.0,0.0,0.0,-0.50,-1.0,0.0,-4.0
1456,2917.0,-0.6,0.0,4.55,2.569005,0.0,0.0,0.0,0.0,0.0,...,-0.388889,0.0,0.0,0.0,0.0,0.0,0.75,-1.0,0.0,-4.0
1457,2918.0,0.7,0.0,-0.35,0.252514,0.0,0.0,0.0,0.0,0.0,...,0.055556,0.0,0.0,0.0,0.0,700.0,0.25,-1.0,0.0,0.0


In [4]:
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error
    from catboost import CatBoostRegressor
    import numpy as np
    
    X = df.drop(['SalePrice', 'Id'], axis=1)
    y = df['SalePrice']

    k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42
    )

    scores = []
    for train_index, test_index in k_fold.split(X):
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]

        reg = CatBoostRegressor(
            depth=7,
            iterations=1500,
            learning_rate = 0.025,
            logging_level='Silent'
            )

        reg.fit(X_train, y_train)

        y_predict = reg.predict(X_test)

        acc_score = np.sqrt(mean_squared_error(y_test, y_predict))

        print(acc_score)

        scores.append(acc_score)

    print()
    print("Average:", np.sum(scores)/5)
    print("Std:", round(np.std(scores), 3))

27178.697910903953
24220.69451497685
36252.46443754329
27096.6612506494
19359.727642258822

Average: 26821.649151266465
Std: 5506.153


In [5]:
test_pred = reg.predict(test_df)
test_id = test_df['Id']

In [6]:
submission = pd.DataFrame(test_id, columns = ['Id'])
submission['Id'] = submission['Id'].astype('int')

In [7]:
submission

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [8]:
submission['SalePrice'] = test_pred
submission

Unnamed: 0,Id,SalePrice
0,1461,128422.879682
1,1462,172057.447897
2,1463,196860.726597
3,1464,193478.869154
4,1465,191579.444528
...,...,...
1454,2915,101613.114631
1455,2916,86898.413709
1456,2917,172588.802274
1457,2918,132174.988837


In [14]:
#submission.to_csv("submission.csv", index = False, header = True)

## kaggle scopre 0.15282 from previous 0.17896

In [10]:
import numpy as np
test_data = {'Id': [1, 2, 3, 4, 5],
                'Alley': ['null', 'Pave', 'Grvl', 'Pave', 'Grvl'],
                'PoolQC': ['null', 'Ex', 'Gd', 'TA', 'Ex'],
                'Fence': ['null', 'MnWw', 'GdWo', 'MnWw', 'GdWo'],
                'MiscFeature': ['null', 'Elev', 'Gar2', 'Shed', 'Elev'],
                'test': [1,np.nan,3,np.nan,5],
                'category': ['a','b',np.nan,'d','e'],
                'SalePrice': [12334, 200000, 43222, 124124, 123442]}

In [11]:
test_df = pd.DataFrame(test_data)
test_df = apply_preprocessing(test_df)
test_df = test_df.round(decimals=5)
test_df

Unnamed: 0,Id,test,category,SalePrice
0,1.0,-2.0,-1.5,12334.0
1,2.0,0.0,-0.5,200000.0
2,3.0,0.0,0.0,43222.0
3,4.0,0.0,0.5,124124.0
4,5.0,2.0,1.5,123442.0


In [12]:
    expected_df = pd.DataFrame({'Id': [1.0, 2.0, 3.0, 4.0, 5.0],
                        'test': [-2.0, 0.0, 0.0, 0.0, 2.0],
                        'category': [-1.5, 0.33333, 0.50000, 0.66667, 1.00000],
                        'SalePrice': [12334.0, 200000.0, 43222.0, 124124.0, 123442.0]})

In [13]:
test_df.equals(expected_df)

False