### Download and unzip dataset

In [9]:
!kaggle competitions download -c home-data-for-ml-course -p .


home-data-for-ml-course.zip: Skipping, found more recently modified local copy (use --force to force download)


In [10]:
import numpy as np
import pandas as pd
import zipfile


In [11]:
with zipfile.ZipFile("home-data-for-ml-course.zip", "r") as zip_ref:
    zip_ref.extractall(".")  


In [12]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [13]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [14]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## Preprocessing:
- Drop column Id.
- fillna all numeric column to `0`.
- fillna all object column to `None`.
- Apply log to all numeric columns so that it would not skew the result.

In [15]:
train.drop(columns=['Id'],inplace=True)
test.drop(columns=['Id'],inplace=True)

#fillna with 0 and None
train[train.select_dtypes(include=['number']).columns] = train.select_dtypes(include=['number']).fillna(0)
test[test.select_dtypes(include=['number']).columns] = test.select_dtypes(include=['number']).fillna(0)

train.select_dtypes(include=['object']).fillna('None', inplace=True)
test.select_dtypes(include=['object']).fillna('None', inplace=True)

#apply log to numeric columns
numeric_cols = train.select_dtypes(include=['number']).columns
train[numeric_cols] = np.log1p(train[numeric_cols])
numeric_cols = test.select_dtypes(include=['number']).columns
test[numeric_cols] = np.log1p(test[numeric_cols])

print(train.shape) 
print(test.shape)   

(1460, 80)
(1459, 79)


- For categorical columns, apply one-hot encoding

In [16]:
#one hot encoding for categorical columns
train = pd.get_dummies(train).astype(float)
test = pd.get_dummies(test).astype(float)

In [17]:
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.110874,4.189655,9.04204,2.079442,1.791759,7.6029,7.6029,5.283204,6.561031,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.044522,4.394449,9.169623,1.94591,2.197225,7.589336,7.589336,0.0,6.886532,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.110874,4.234107,9.328212,2.079442,1.791759,7.601902,7.602401,5.09375,6.188264,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.26268,4.110874,9.164401,2.079442,1.791759,7.557995,7.586296,0.0,5.379897,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4.110874,4.442651,9.565284,2.197225,1.791759,7.601402,7.601402,5.860786,6.486161,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,3.044522,4.394449,9.360741,1.791759,1.94591,7.58172,7.58172,0.0,6.150603,4.976734,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.044522,4.406719,9.565775,1.94591,1.94591,7.580189,7.580189,4.691348,6.828712,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4.110874,4.317488,9.534668,1.791759,1.791759,7.599902,7.600402,0.0,6.674561,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.110874,4.369448,9.208238,1.94591,1.94591,7.600402,7.600402,3.044522,6.401917,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4.795791,3.78419,8.518392,2.197225,1.791759,7.597396,7.597396,0.0,5.575949,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


- Split the training dataset to `X_train` and `y_train`
- align `X_train` and `X-test` for correct dimension

In [19]:

X_train = train.drop(columns=['SalePrice'])
y_train = train['SalePrice']  

X_train, X_test = X_train.align(test, join='left', axis=1, fill_value=0)

X_train = X_train.values
y_train = y_train.values
X_test = X_test.values


In [20]:
print(X_train.shape)  
print(X_test.shape)   

(1460, 287)
(1459, 287)


- Define `L2reg` function for closed form solution.

In [21]:
def L2reg(X,y, lambda_reg=1.0):
    X_b = np.c_[np.ones((X.shape[0], 1)), X]  # Add bias term
    n_features = X_b.shape[1]
    
    # Identity matrix for regularization (excluding bias term)
    I = np.eye(n_features)
    I[0, 0] = 0  # Do not regularize the bias term

    theta = np.linalg.inv(X_b.T.dot(X_b) + lambda_reg * I).dot(X_b.T).dot(y)
    return theta


- Compute `theta` then use it to predict the `test` dataset.

In [22]:
theta=L2reg(X_train,y_train)
print(theta)
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
y_pred = X_test_b.dot(theta)
y_pred=np.exp(y_pred)

[ 5.52629872e+00 -2.55410344e-02 -2.75464963e-03  7.97157096e-02
  2.45430898e-01  2.30997001e-01  7.24669371e-02  8.35010508e-02
 -1.00313838e-03  1.07394367e-02 -6.23474500e-03 -5.56715733e-03
  5.60266900e-02  3.58561163e-02  1.57574968e-03 -7.55549288e-03
  3.69283931e-01  3.46783434e-02  7.95224273e-03  7.00266546e-02
  4.24680913e-02 -6.29260584e-03 -1.07196052e-01  4.28761268e-02
  2.42142441e-02 -1.44317175e-02  1.04464657e-01  1.78086256e-02
  4.12002480e-03  1.95930455e-03  8.32172332e-04  4.92516249e-03
  8.87905997e-03  1.49154179e-02 -4.98781256e-03  2.81330453e-03
 -2.65686568e-03 -2.94269257e-01  9.49750469e-02  8.14344546e-02
  7.06046842e-02  4.72550716e-02 -4.59303379e-02  4.59303379e-02
 -9.70351428e-04  3.81879675e-02 -2.48914419e-04  1.71413453e-02
 -2.79198014e-02  1.10273706e-02 -3.20886294e-02  3.00942836e-02
 -1.26368445e-02  1.46311902e-02  8.49150844e-02 -8.49150843e-02
  2.41116682e-02  4.63422104e-02 -2.60183460e-02 -5.44847325e-02
  1.00492000e-02  7.66759

- Write to csv to submit to kaggle.

In [None]:
submission = pd.DataFrame({'Id': test.index + 1461, 'SalePrice': y_pred})
submission.to_csv('submission.csv', index=False)

: 

### Screenshot here: https://github.com/phuongwhuynh/NLP_Lab/blob/main/Lab67/hw1/sceenshot_hw1.png