In [192]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [193]:
# Importing GPU Libraries
import cudf as pd
import cupy as np
import cuml

In [194]:
# Load the competition's train.csv. `pd` was rebound to cudf in the GPU import
# cell, so this call goes through cudf's read_csv rather than pandas'.
df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')


In [195]:
# Print a per-column summary of the raw training frame.
df.info()

In [196]:
# Show descriptive statistics for the raw training frame.
df.describe()

In [197]:
# Keep only the columns whose dtype is an integer or floating-point type;
# every other column is dropped from the working frame.
columns_needed = [
    col
    for col in df.columns
    if df[col].dtype in ('int64', 'float64', 'int32', 'float32')
]
new_df = df[columns_needed]

In [198]:
# Print a per-column summary of the numeric-only frame.
new_df.info()


In [199]:
# Replace missing values in each column with that column's mean.
# Rebinding instead of using inplace=True avoids mutating a frame that was
# derived from `df` by column selection.
new_df = new_df.fillna(new_df.mean())

In [200]:
# Count the remaining nulls per column.
new_df.isnull().sum()

In [201]:
# Positional split: features are every column except the first and the last,
# the target is the last column.
# NOTE(review): presumably the first column is `Id` and the last is `SalePrice` - confirm.
X, Y = new_df.iloc[:, 1:-1], new_df.iloc[:, -1]

In [202]:
# Preview the first seven feature rows.
X.head(7)

In [203]:
# Preview the first few target values.
Y.head()

In [204]:
# Collect the feature columns whose absolute skewness exceeds 0.5, then show how many there are.
skewed_feature = list(filter(lambda name: abs(X[name].skew()) > 0.5, X.columns))
len(skewed_feature)

In [205]:
# Display the names of the columns flagged as skewed.
skewed_feature

In [206]:
# Apply log1p to every column flagged as skewed (|skew| > 0.5) to reduce the skew.
# `np` is cupy here, so the transform goes through cupy's log1p.

for feature in skewed_feature:
    X[feature] = np.log1p(X[feature])

In [207]:
# Standardise the features with cuML's StandardScaler. `std` is kept so the
# fitted scaler is available to later cells; `X` is overwritten with the scaled result.
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so the held-out rows influence the scaling statistics - confirm this is intended.
from cuml.preprocessing import StandardScaler
std = StandardScaler()
X = std.fit_transform(X)

In [208]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the split reproducible.
# The previous SMOTE import/instance was removed: it was never applied, and SMOTE
# oversamples minority *classes*, which does not apply to a continuous target.
from cuml.preprocessing import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.9, random_state=2)

In [209]:
#calculating mean squared error,mean absolute error and r2 score for each of the 5 models
from cuml.metrics.regression import r2_score, mean_absolute_error, mean_squared_error
def get_accuracy_check(model,model_name,x_test,y_test):
    """Evaluate a fitted regressor on held-out data and print its scores.

    Args:
        model: Object exposing ``predict(x_test)``.
        model_name: Label inserted into the printed report lines.
        x_test: Features passed to ``model.predict``.
        y_test: True targets; cast to ``float64`` before scoring.

    Returns:
        ``[mse, mae, r2]`` as returned by ``mean_squared_error``,
        ``mean_absolute_error`` and ``r2_score`` respectively.
    """
    predicted = model.predict(x_test)
    actual = y_test.astype('float64')
    # Compute all three scores first, in the same order they are reported.
    scores = [
        scorer(actual, predicted)
        for scorer in (mean_squared_error, mean_absolute_error, r2_score)
    ]
    for label, value in zip(("MSE for ", "MAE for ", "R2_score for "), scores):
        print(label, model_name, " is ", value)
    return scores

In [210]:
from cuml.linear_model import LinearRegression
# Solver names passed to LinearRegression(algorithm=...); one model is fitted per entry.
algos = ["eig", 'qr', "svd", "svd-jacobi", "svd-qr"]
# `metrics` collects one [mse, mae, r2] list per solver and `models` the fitted
# estimators, both in the same order as `algos`.
metrics = []
models = []
for algo in algos:
    lr = LinearRegression(algorithm=algo)
    # %time is an IPython line magic that reports how long the fit call takes.
    %time lr.fit(X_train, Y_train)
    models.append(lr)
    metrics.append(get_accuracy_check(lr, algo, X_test, Y_test))

# Preprocessing Test data

In [211]:
# Load the competition's test.csv (via cudf, since `pd` is bound to cudf).
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [212]:
# Print a per-column summary of the raw test frame.
df_test.info()

In [213]:
# Drop the target name from the column list before it is used to index the test frame.
# The guard keeps the cell idempotent: an unguarded list.remove() raises
# ValueError when the cell is executed a second time.
if 'SalePrice' in columns_needed:
    columns_needed.remove('SalePrice')

In [214]:
# Select the listed columns from the test frame and drop the first of them by position.
# NOTE(review): presumably the dropped first column is `Id`, mirroring the training split - confirm.
test = df_test[columns_needed].iloc[:, 1:]

In [215]:
# Impute missing test values with the *training* column means so the test set
# receives the same imputation the model was trained with, rather than
# statistics computed from the test data itself.
test = test.fillna(new_df[list(test.columns)].mean())

In [216]:
# Collect the test columns whose absolute skewness exceeds 0.5, then show how many there are.
test_skew_features = list(filter(lambda name: abs(test[name].skew()) > 0.5, test.columns))
len(test_skew_features)

In [217]:
# Display the names of the test columns flagged as skewed.
test_skew_features

In [218]:
# Log-transform exactly the columns that were log-transformed in the training
# features (`skewed_feature`). Re-deriving the list from test-set skewness can
# select a different set of columns, so the model would see features on a
# different scale than the one it was trained on.
for feature in skewed_feature:
    test[feature] = np.log1p(test[feature])

In [219]:
# Scale the test features with the scaler already fitted on the training
# features. Calling fit_transform here would refit `std` on the test data and
# standardise it with different means/variances than the model was trained on.
X_test1 = std.transform(test)
X_test1.shape

In [220]:
# Use the first fitted model (the first entry of `algos`, i.e. the "eig" solver)
# to predict on the preprocessed test features.
model_used = models[0]
Y_pred = model_used.predict(X_test1)

In [221]:
# Assemble the submission frame: one row per test Id with its predicted SalePrice.
predictions = {
    'Id': df_test['Id'].astype('int32'),
    'SalePrice': Y_pred,
}
sub = pd.DataFrame(predictions)
# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
sub.info()
sub

In [222]:
# Write the submission file to the working directory without the index column.
sub.to_csv('submission.csv', index=False)