# Load the data

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
# Read the training CSV once. `train_data_orig` stays untouched (it is the source of the
# imputation statistics later on); `train_data` is an independent working copy.
train_data_orig = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
train_data = train_data_orig.copy(deep=True)


In [2]:
train_data.shape

# Handling Missing Data

## Find number of missing values for each column

In [3]:
train_data.isnull().sum()

In [4]:
# .info() also gives info about the missing values
train_data.info()

In [5]:
# Drop columns where more than 50% of the rows are missing, as imputing them would be less accurate.
# Series.items() replaces iteritems() (removed in pandas 2.0), and the threshold is derived
# from the row count instead of a hard-coded 730.
drop_cols = [
    col
    for col, n_missing in train_data.isnull().sum().items()
    if n_missing > 0.5 * len(train_data)
]
train_data = train_data.drop(columns=drop_cols)

In [6]:
# Remove the Id column as well
train_data = train_data.drop(columns=['Id'])

In [7]:
train_data.info()

In [8]:
# Features that contain missing values and therefore need imputation.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
missing_val_feat = [col for col, n_missing in train_data.isnull().sum().items() if n_missing > 0]

In [9]:
missing_val_feat

In [10]:
train_data.shape

In [11]:
train_data.columns

In [12]:
# Features with only a handful of missing values (fewer than 60): dropping the affected rows is cheap
drop_rows_for_these_cols = [
    feat
    for feat, n_missing in train_data[missing_val_feat].isnull().sum().items()
    if n_missing < 60
]

In [13]:
drop_rows_for_these_cols

In [14]:
# Remove every row that has a missing value in any of those columns
train_data = train_data.dropna(subset=drop_rows_for_these_cols, how='any')

In [15]:
train_data.shape

In [16]:
# Re-check which features still require missing value imputation after the row drop.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
missing_val_feat = [col for col, n_missing in train_data.isnull().sum().items() if n_missing > 0]

In [17]:
train_data[missing_val_feat].info()

In [18]:
# Fill missing data with median (for numeric features) or mode (for categorical features)
def impute_missing_val(data, reference=None):
    """Fill missing values in ``data`` column by column and return it.

    Columns whose dtype is ``float64`` or ``int64`` are filled with the median
    of the same column in ``reference``; every other column is filled with the
    most frequent value (mode) of that column in ``reference``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. Its columns are reassigned in place and the same
        object is returned.
    reference : pandas.DataFrame, optional
        Frame the fill statistics are computed from. When omitted, the
        notebook-level ``train_data_orig`` is used, so train and test data
        are imputed with the same training statistics.

    Returns
    -------
    pandas.DataFrame
        ``data`` with its missing values filled.
    """
    if reference is None:
        reference = train_data_orig
    for col, dtype in data.dtypes.items():
        # Nothing to do for complete columns (also avoids needing them in `reference`).
        if not data[col].isnull().any():
            continue
        if str(dtype) in ('float64', 'int64'):
            fill_value = reference[col].median()
        else:
            modes = reference[col].mode()
            if modes.empty:
                # Reference column is entirely missing: there is no value to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign the result back: calling fillna(inplace=True) on the `data[col]`
        # selection is chained assignment and may silently leave `data` unchanged.
        data[col] = data[col].fillna(fill_value)
    return data

In [19]:
train_data = impute_missing_val(train_data)

In [20]:
# Check if all missing values have been imputed
train_data.isnull().sum().sum()

In [21]:
# Preview a few rows instead of rendering the whole frame
train_data.head()

# Baseline Model


In [22]:
# Separate the target (SalePrice) from the features.
# pop() removes the column from train_data itself; X is an independent deep copy of what remains.
y = train_data.pop('SalePrice')
X = train_data.copy(deep=True)

In [23]:
# Label encoding: replace every non-numeric column of X with its integer category codes
from sklearn import preprocessing
datatypes = X.dtypes
encodings = {}

# Series.items() replaces iteritems(), which was removed in pandas 2.0
for col, dt in datatypes.items():
    if str(dt) not in ('float64', 'int64'):
        # Convert to pandas' categorical dtype, then keep the per-row integer codes
        X[col] = X[col].astype("category")
        encodings[col] = X[col].cat.codes
        X[col] = encodings[col]
        

In [24]:
# Disabled experiment: the code below is wrapped in a string literal, so none of it executes.
'''datatypes = X.dtypes
for col, dt in datatypes.iteritems():
    if(str(dt) not in ['float64', 'int64']):
        X[col] = X[col].astype("category")'''

In [25]:
X.info()

In [26]:
# Disabled experiment: the code below is wrapped in a string literal, so none of it executes.
'''# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(X)
len(enc.categories_)'''

In [54]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)


# Train ML Models

In [55]:
# Train XGBoost

from xgboost import XGBRegressor
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.01)
xgb.fit(X_train,y_train)
predictions = xgb.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in that order
rmsle = np.sqrt(mean_squared_log_error(y_val, predictions))
print("RMSLE:", rmsle)


In [56]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so swapping them changes the value
r_squared = r2_score(y_val, predictions)
r_squared

# Cross-validation

In [51]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation scored with the negated mean squared log error
score = cross_val_score(estimator=xgb, X=X, y=y, scoring="neg_mean_squared_log_error", cv=5)

In [52]:
# Flip the sign of each fold's negated MSLE and take the square root to get RMSLE
score = list(np.sqrt(np.negative(score)))

In [30]:
# Disabled experiment (linear regression baseline): the code below is wrapped in a string
# literal, so none of it executes -- including the imports it contains.
'''from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score 

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_val)

r_squared = r2_score(predictions, y_val)

print("R2 Score:", r_squared)
rmsle = np.sqrt(mean_squared_log_error(predictions, y_val))
print("RMSLE:", rmsle)'''


# Testing

In [31]:
# Load the test set and build its feature frame without the columns dropped from training or Id
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
X_test = test_data.drop(columns=list(drop_cols) + ['Id'])

In [32]:
# Preview a few rows instead of rendering the whole frame
X_test.head()

In [33]:
X_test.isnull().sum().sum()

In [34]:
X_test = impute_missing_val(X_test)


In [35]:
X_test.isnull().sum().sum()

## Encode categorical features the same way as the training data

In [36]:

# Encode the test labels with the *training* categories so each label gets the same integer
# code the model was trained on. Assigning the stored training codes themselves would replace
# the test features with training rows.
for col in encodings:
    # train_data still holds the original labels (only its copy X was encoded).
    train_categories = train_data[col].astype("category").cat.categories
    # Labels that never occurred in training are encoded as -1 by pd.Categorical.
    X_test[col] = pd.Categorical(X_test[col], categories=train_categories).codes

In [37]:
# Disabled experiment (one-hot encoding): the code below is wrapped in a string literal,
# so none of it executes.
'''
# Encode categorical similar to train
X_test = pd.get_dummies(X_test)
# Add missed columns missed due to get dummies on X_test
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)
'''

In [38]:
# Preview a few rows of the encoded test features instead of rendering the whole frame
X_test.head()

In [39]:
# Predict sale prices for the test set and write them next to their Ids
preds = xgb.predict(X_test)
submit = test_data[['Id']].assign(SalePrice=preds)
submit.to_csv('submission.csv', index=False)