In [134]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [135]:
# Load the data 
train_data = pd.read_csv('../Data/train.csv')
test_data = pd.read_csv('../Data/test.csv')

print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


In [136]:
# priliminary analysis
print(train_data.describe())
print(train_data.info())


                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [137]:
# check for missing values in train_data. print columns with null values greater than 0
print(train_data.isnull().sum()[train_data.isnull().sum() > 0])


LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [138]:
# check for missing values in test_data. print columns with null values greater than 0
print(test_data.isnull().sum()[test_data.isnull().sum() > 0])

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64


In [139]:
# fill missing values in train_data and test_data with mode of the column
train_data.fillna(train_data.mode().iloc[0], inplace=True)
test_data.fillna(test_data.mode().iloc[0], inplace=True)

In [140]:
# seperate the features and target variable 
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

In [141]:
# find the categorical columns
train_categorical_columns = X.select_dtypes(include=['object']).columns
test_categorical_columns = test_data.select_dtypes(include=['object']).columns
print(len(train_categorical_columns))
print(len(test_categorical_columns))


43
43


In [142]:
print(len(X.columns))
# save the column names to column_names.txt
with open('column_names.txt', 'w') as f:
    for col in X.columns:
        f.write(col + '\n')

80


In [143]:

# replace different categorical values with numbers in place and without creating a new column using orninal encoder in sklearn
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
# convert categorical columns to strings
X[train_categorical_columns] = X[train_categorical_columns].astype(str)
test_data[test_categorical_columns] = test_data[test_categorical_columns].astype(str)
# fit and transform the data
X[train_categorical_columns] = encoder.fit_transform(X[train_categorical_columns])
test_data[test_categorical_columns] = encoder.fit_transform(test_data[test_categorical_columns])


# check if the number of columns in the train and test data are the same
print(len(X.columns))
print(len(test_data.columns))

80
80


In [144]:
# combine YearBuilt and YrSold to get the Age feature and drop the YearBuilt and YrSold columns
X['Age'] = X['YrSold'] - X['YearBuilt']
X.drop(['YearBuilt', 'YrSold'], axis=1, inplace=True)

test_data['Age'] = test_data['YrSold'] - test_data['YearBuilt']
test_data.drop(['YearBuilt', 'YrSold'], axis=1, inplace=True)

In [145]:

set(train_data['MasVnrType']) #issue here. code not recogniying None value as a category. the column has nan and None values

{'BrkCmn', 'BrkFace', 'Stone'}

In [146]:
# fit a regression model with Lasso regression
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1)
model.fit(X, y)


In [147]:
# make predictions
predictions = model.predict(test_data)

# save the predictions to a submission.csv file
output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})
output.to_csv('../Data/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
