In [68]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [69]:
# RAPIDS libraries: from here on `np` is CuPy and `pd` is cuDF, replacing the NumPy/pandas aliases imported above
import cupy as np
import cudf as pd
import cuml

In [70]:
# Directory holding the competition CSVs; the trailing slash lets file names be appended directly.
data_path = '../input/house-prices-advanced-regression-techniques/'
train_csv = data_path + 'train.csv'
df = pd.read_csv(train_csv)

In [71]:
# Keep only the columns stored as 32/64-bit ints or floats.
# NOTE(review): the list is called `discrete`, but it holds every numeric column.
numeric_dtypes = ['int64', 'float64', 'int32', 'float32']
discrete = [col for col in df.columns if df[col].dtype in numeric_dtypes]

df2 = df[discrete]
df2.info()

In [72]:
# Median imputation: every missing value is replaced by the median of its column.
df2 = df2.fillna(df2.median())

In [73]:
# Features are every column except the first and the last; the last column is the target.
# NOTE(review): presumably the first column is Id and the last is SalePrice - confirm.
X = df2.iloc[:, 1:-1]
Y = df2.iloc[:, -1]
print(X)

In [74]:
# Collect the features whose absolute skewness exceeds 0.5 ...
skewed_features = []
for col in X.columns:
    if abs(X[col].skew()) > 0.5:
        skewed_features.append(col)

# ... and replace each of them with log(1 + x).
for feature in skewed_features:
    X[feature] = np.log1p(X[feature])

In [75]:
from cuml.preprocessing import StandardScaler
# Standardize the features with a scaler fitted on those same features.
ss = StandardScaler()
X_scaled = ss.fit(X).transform(X)
X_scaled

In [76]:
#splitting into training and testing data
from cuml.preprocessing import train_test_split
# 20% of the rows are held out; random_state is fixed at 3.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=3,
)

In [77]:
from cuml.metrics.regression import r2_score, mean_absolute_error, mean_squared_error
def getLinearRegressionMetrics(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Predicts on ``X_test``, casts ``y_test`` to float64, computes R2, mean
    absolute error and mean squared error against the predictions, prints
    each value, and returns them.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: features to predict on.
        y_test: true target values; must support ``astype``.

    Returns:
        list: ``[r2, mae, mse]`` in that order.
    """
    predicted = model.predict(X_test)
    actual = y_test.astype('float64')
    scores = [
        r2_score(actual, predicted),
        mean_absolute_error(actual, predicted),
        mean_squared_error(actual, predicted),
    ]
    for label, value in zip(('R2:', 'MAE:', 'MSE:'), scores):
        print(label, value)
    return scores

In [78]:
from cuml.linear_model import LinearRegression
algorithms = ["svd","eig",'qr',"svd-qr","svd-jacobi"]
models = list()
metrics = list()
for algo in algorithms:
    lr_model = LinearRegression(algorithm=algo)
    %time
    lr_model.fit(X_train,Y_train)
    models.append(lr_model)
    metrics.append(getLinearRegressionMetrics(lr_model,X_test,Y_test))

In [79]:
# Summary table: one row per LinearRegression solver, one column per metric.
metric_names = ['R2', 'MAE', 'MSE']
linear_stats = pd.DataFrame(metrics, index=algorithms, columns=metric_names)
print(linear_stats)

In [80]:
from cuml.decomposition import PCA
# Project the standardized features onto a fixed number of principal components.
n_pca_components = 5
pca = PCA(n_components=n_pca_components)
X_pca = pca.fit_transform(X_scaled)
print(X_pca.shape)

# Re-fit the existing scaler `ss` on the components and standardize them.
# From here on `ss` holds statistics of the PCA output, not of the raw features.
X_pca = ss.fit(X_pca).transform(X_pca)
print(X_pca)

In [81]:
# Same split settings as for the full feature set (20% hold-out, random_state=3).
X_pca_train, X_pca_test, Y_pca_train, Y_pca_test = train_test_split(
    X_pca,
    Y,
    test_size=0.2,
    random_state=3,
)

In [82]:
#Running Ridge Function on svd and eig vectors
from cuml.linear_model import Ridge
algorithms = ["svd", "eig"]
pca_models = []
pca_metrics = []
for algo in algorithms:
    lr_model = Ridge(alpha=1, solver=algo)
    %time lr_model.fit(X_pca_train, Y_pca_train)
    pca_models.append(lr_model)
    pca_metrics.append(getLinearRegressionMetrics(lr_model, X_pca_test, Y_pca_test))

In [83]:
# Tabulate the Ridge-on-PCA metrics, then stack them above the LinearRegression results.
ridge_stats = pd.DataFrame(pca_metrics, columns=['R2','MAE','MSE'], index=['ridge_svd','ridge_eig'])
print(ridge_stats)
# DataFrame.append is deprecated (and removed in recent pandas releases);
# concat stacks the rows the same way and keeps both indexes.
stats = pd.concat([ridge_stats, linear_stats])
print(stats)

In [84]:
# Load the competition test set that the PCA + Ridge model will score.
test_csv = data_path + 'test.csv'
t = pd.read_csv(test_csv)
t.head()

In [85]:
# Select the numeric columns chosen for training, minus the target. SalePrice is
# filtered out without mutating `discrete`, so this cell can be re-run safely
# (list.remove raises ValueError once the item is gone).
feature_columns = [col for col in discrete if col != 'SalePrice']
# Drop the first selected column, as was done for the training features.
test = t[feature_columns].iloc[:, 1:]

In [86]:
# Median imputation on the test features, using the test set's own column medians.
test = test.fillna(test.median())

In [88]:
# Log-transform the test set using the feature list that was chosen on the
# training data, so both sets go through the same transformation.
t_skewed_features = list(skewed_features)
for feature in t_skewed_features:
    # Write into `test`, not `X`: assigning to X would transform the training
    # features a second time and leave the test features untouched.
    test[feature] = np.log1p(test[feature])

In [93]:
# Project the test features with the PCA that was fitted on the training data.
# Re-fitting PCA on the test set would yield different principal axes than the
# ones the Ridge models were trained on.
# The PCA was fitted on standardized features, so the test features are
# standardized first. A dedicated scaler is needed because `ss` was re-fitted on
# the PCA output and no longer matches the raw feature columns.
# NOTE(review): assumes X still holds the training features exactly as they were
# when `pca` was fitted - confirm.
feature_scaler = StandardScaler()
feature_scaler.fit(X)
X_pca = pca.transform(feature_scaler.transform(test))
print(X_pca)

In [94]:
# Standardize the test components with `ss` as fitted on the training
# components; re-fitting it here would scale the test data by its own statistics
# instead of the ones the model was trained with.
# NOTE: this rebinds X_test, replacing the hold-out split created earlier.
X_test = ss.transform(X_pca)
print(X_test)

In [97]:
# Predict with the first Ridge model in pca_models (the "svd" solver).
ridge_model = pca_models[0]
Y_pred = ridge_model.predict(X_test)

# Two-column submission frame: Id cast to int32, plus the predicted price.
predictions = dict(Id=t['Id'].astype('int32'), SalePrice=Y_pred)
submission = pd.DataFrame(predictions)
print(submission.info())
print(submission)

In [None]:
# Write the submission file without the index column.
# Python's boolean literal is `False`; lowercase `false` raises NameError.
submission.to_csv('submission.csv', index=False)