In [2]:
import cudf as pd
import cupy as cp
import cuml

In [61]:
# Load the competition's train and test CSV files (cudf is imported as `pd`).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [62]:
# Preview the first rows of the training frame.
df_train.head()

In [63]:
# (rows, columns) of the train and test frames.
print(df_train.shape, df_test.shape)

In [64]:
# Summary of the training frame's columns.
df_train.info()

In [65]:
# Preprocessing: remember how many rows each frame has. `no_of_train` is
# used later to split the combined frame back into its train and test parts.
no_of_train, no_of_test = df_train.shape[0], df_test.shape[0]

In [66]:
# Separate the regression target from the feature columns.
y_train = df_train['SalePrice']
df_train.drop(columns=['SalePrice'], inplace=True)

In [67]:
# Preview the feature frame now that the target column has been removed.
# `x_train` is only created by the later train_test_split call, so
# referencing it here raised NameError on a fresh Restart & Run All.
df_train.head()

In [68]:
# Display the target series extracted from the 'SalePrice' column.
y_train

In [69]:
# Number of missing values in each column of the training features.
df_train.isnull().sum()

In [70]:
# List the columns in which more than 70% of the rows are null, once for the
# training frame and once for the test frame. Both scans walk the column
# names of df_train.
drop_cols = [
    col for col in df_train
    if df_train[col].isnull().sum() > (0.70*df_train.shape[0])
]
drop_cols_test = [
    col for col in df_train
    if df_test[col].isnull().sum() > (0.70*df_test.shape[0])
]
print(drop_cols)
print(drop_cols_test)

In [71]:
# Remove the mostly-null columns found in the training data from both
# frames, so train and test lose the same columns.
for frame in (df_train, df_test):
    frame.drop(columns=drop_cols, inplace=True)

In [72]:
# Re-check the first rows after the column drop.
df_train.head()

In [73]:
import seaborn as sns

In [74]:
# Plot the boolean null mask of the training frame as a heatmap; the cudf
# frame is converted to pandas first.
temp = df_train.to_pandas()
sns.heatmap(temp.isnull())

In [75]:
# Split the column names by dtype: "object" columns are the categorical
# features and every other dtype is treated as numerical. The two
# conditions were previously swapped, which mislabelled the printed counts.
# Despite the `no_of_` prefix, both names hold an Index of column names.
no_of_catfeatures = df_train.dtypes[df_train.dtypes == "object"].index
no_of_numfeatures = df_train.dtypes[df_train.dtypes != "object"].index

print("no of categorical features: ", len(no_of_catfeatures))
print("no of numerical features: ", len(no_of_numfeatures))

In [76]:
# Fill missing values with each column's most frequent value (its mode).
# The same rule is applied to the categorical and the numerical columns,
# first in df_train and then in df_test; each frame uses its own modes.
for frame in (df_train, df_test):
    for feature_group in (no_of_catfeatures, no_of_numfeatures):
        for col in feature_group:
            frame[col] = frame[col].fillna(frame[col].mode()[0])

In [77]:
# Heatmap of the null mask for the training frame after imputation
# (converted from cudf to pandas for plotting).
temp = df_train.to_pandas()
sns.heatmap(temp.isnull())

In [78]:
# Heatmap of the null mask for the test frame after imputation
# (converted from cudf to pandas for plotting).
temp = df_test.to_pandas()
sns.heatmap(temp.isnull())

In [79]:
# As the two heatmaps above show, no null values remain in either the train or the test frame.

In [82]:
# Stack the train rows on top of the test rows in a single frame.
df_final = pd.concat([df_train, df_test], axis=0)
df_final

In [84]:
# converting categorical features to indicators
# NOTE(review): presumably the frames are combined before encoding so that
# train and test end up with the same indicator columns — confirm.
df_final = pd.get_dummies(df_final)
df_final

In [85]:
from cuml.model_selection import train_test_split

In [86]:
# Display the combined frame after indicator encoding.
df_final

In [91]:
# Row count of the test frame as recorded right after loading.
no_of_test

In [103]:
# Undo the earlier concatenation: the first `no_of_train` rows become the
# training features again and the remaining rows the test features.
df_train, df_test = df_final.iloc[:no_of_train], df_final.iloc[no_of_train:]

In [106]:
# Display the encoded training features.
df_train

In [107]:
# Display the encoded test features.
df_test

In [108]:
# Hold out 30% of the labelled rows for evaluation. `y_train` is rebound to
# the training portion of the target here, so this cell cannot be re-run
# without first re-creating the full target series.
x_train, x_test, y_train, y_test = train_test_split(
    df_train,
    y_train,
    test_size=0.3,
    random_state=101,
)

In [114]:
from cuml.linear_model import LinearRegression
from cuml.metrics import mean_squared_error, r2_score, mean_absolute_error

In [128]:
# Solver names to compare, plus an empty table holding the metric columns.
algos = [
    'svd',
    'eig',
    'svd-qr',
    'svd-jacobi',
]
results = pd.DataFrame(columns=['algorithm', 'MSE', 'MAE', 'R2_Score'])
results

In [133]:
# Fit one LinearRegression per solver in `algos` and score each on the
# held-out split. One row of metrics is collected per solver.
res = {'algorithm': [], 'MSE': [], 'MAE': [], 'R2_Score': []}

# Score against floating-point values. The previous int64 casts truncated
# the predictions before computing MSE/MAE, while R2 was computed on
# float32, so the three metrics were not measured on the same numbers.
# Both operands are cast to the same dtype (float64) for every metric.
y_true = y_test.astype('float64')

for al in algos:
    model = LinearRegression(fit_intercept=True, normalize=False, algorithm=al)
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    y_pred = y_predict.astype('float64')

    mse = mean_squared_error(y_true, y_pred)
    r2score = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)

    res['algorithm'].append(al)
    res['MSE'].append(mse)
    res['MAE'].append(mae)
    res['R2_Score'].append(r2score)

# After the loop, `model` is the estimator fit with the last entry of `algos`.
results = pd.DataFrame(res)
results

# absolute errors of the algorithms

In [136]:
# submission df
# `model` is whichever estimator the solver-comparison loop fit last (the
# final entry of `algos`), not necessarily the best-scoring one, and it was
# fit on the training portion of the split only.
y_final_pred = model.predict(df_test)
y_final_pred

In [137]:
# Load the sample submission file to reuse its layout. The absolute
# /kaggle/input path matches the train/test reads and, unlike the earlier
# relative '../input' path, does not depend on the working directory.
sample_soln = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')
sample_soln

In [138]:
# Replace the 'SalePrice' column with the model predictions, write the
# frame to 'final_submission.csv' without the index, and display it.
sample_soln['SalePrice'] = y_final_pred
sample_soln.to_csv('final_submission.csv', index=False)
sample_soln