In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw CSVs. With keep_default_na=False and na_values=[''], pandas
# treats only empty fields as missing; literal strings such as 'NA' are kept
# as ordinary text.
# NOTE(review): presumably 'NA' is a meaningful category in this dataset --
# confirm against the data dictionary.
train = pd.read_csv('datasets/train.csv', keep_default_na=False, na_values=[''])
kaggle = pd.read_csv('datasets/test.csv', keep_default_na=False, na_values=['']) # test dataset
# The sample submission is read with pandas' default missing-value handling.
sample = pd.read_csv('datasets//sample_sub_reg.csv')

# Ten training columns with the most missing values. This must be the final
# expression of the cell: a bare expression anywhere else in a cell is
# evaluated and then discarded without being displayed.
train.isnull().sum().sort_values(ascending=False).head(10)

In [3]:
# Naive baseline: the mean sale price over every row of `train`, broadcast
# into a constant column.
mean_sale_price = train['SalePrice'].mean()
train['baseline'] = mean_sale_price

In [4]:
# Replace every space in the column labels with an underscore.
# regex=False makes this a literal substring replacement.
train.columns = train.columns.str.replace(' ', '_', regex=False)

In [5]:
# Lower-case every column label.
train.columns = [label.lower() for label in train.columns]

In [6]:
# Give the target column a snake_case name. Reassigning the renamed frame
# replaces the in-place mutation.
train = train.rename(columns={'saleprice': 'sale_price'})

In [7]:
# Combined bathroom count. A half bath adds 1 to the total here, exactly like
# a full bath -- it is a plain element-wise sum of the two columns.
full_baths = train['full_bath']
half_baths = train['half_bath']
train['total_bath'] = full_baths + half_baths

In [8]:
# Encode central_air as an integer flag: 'N' (ignoring surrounding whitespace)
# becomes 0, every other value becomes 1.
# NOTE(review): this cell cannot be run twice -- once the column holds ints,
# calling .strip() on them raises AttributeError.
train['central_air'] = train['central_air'].apply(
    lambda flag: int(flag.strip() != 'N')
)

In [9]:
# Distinct values left in central_air after encoding.
train['central_air'].unique()

array([1, 0], dtype=int64)

In [10]:
# Predictor columns selected from `train` for the regression model.
features = ['gr_liv_area', 'overall_qual', 'year_built']

In [11]:
# Missing-value count for each selected predictor column.
train.loc[:, features].isnull().sum()

gr_liv_area     0
overall_qual    0
year_built      0
dtype: int64

In [12]:
# Predictor matrix (the columns listed in `features`) and target vector.
X = train[features]
y = train['sale_price']

# First split, with scikit-learn's default hold-out size and a fixed seed.
# The four names bound here are reassigned by the 80/20 split further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [13]:
# Sanity check on how many pieces train_test_split returns for two inputs.
split_parts = train_test_split(X, y, train_size=0.8, random_state=123)
len(split_parts)

4

In [14]:
# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=123
)

# Report the dimensions of each partition.
for label, X_part, y_part in (
    ("Train shapes:", X_train, y_train),
    ("Test shapes:", X_test, y_test),
):
    print(label, X_part.shape, y_part.shape)

Train shapes: (1640, 3) (1640,)
Test shapes: (411, 3) (411,)


In [15]:
# Predictor matrix and target taken from the full training frame.
X = train[features]
y = train['sale_price']

# Multiple linear regression fitted on every labelled row.
mlr = LinearRegression()
mlr.fit(X, y)

# In-sample predictions (same rows the model was fitted on).
y_pred = mlr.predict(X)

# 'MLR score' is the R^2 on the fitting data itself, so it measures in-sample
# fit only and says nothing about performance on unseen rows.
print('MLR score:', mlr.score(X, y))
print('MLR intercept:', mlr.intercept_)
print('MLR coeffs:', mlr.coef_)

# Hold-out evaluation: fit a separate model on the training split only and
# score it on the test split, which that model has never seen. This uses the
# X_train / X_test / y_train / y_test partition created by the 80/20 split.
holdout_mlr = LinearRegression().fit(X_train, y_train)
holdout_pred = holdout_mlr.predict(X_test)

print('MLR train R2:', holdout_mlr.score(X_train, y_train))
print('MLR test R2:', holdout_mlr.score(X_test, y_test))
# RMSE is in the same units as the target (sale price).
print('MLR test RMSE:', np.sqrt(mean_squared_error(y_test, holdout_pred)))

MLR score: 0.7503580983399765
MLR intercept: -1053151.1020957488
MLR coeffs: [   60.49814858 26067.58646576   499.35634488]


In [16]:
# Distinct building-type labels in the Kaggle test frame (column labels have
# not been normalised yet at this point, hence the space in the name).
kaggle.loc[:, 'Bldg Type'].unique()

array(['2fmCon', 'Duplex', '1Fam', 'TwnhsE', 'Twnhs'], dtype=object)

In [17]:
# Missing-value count for each column of the Kaggle test frame.
kaggle.isnull().sum(axis=0)

Id                0
PID               0
MS SubClass       0
MS Zoning         0
Lot Frontage    160
               ... 
Misc Feature      0
Misc Val          0
Mo Sold           0
Yr Sold           0
Sale Type         0
Length: 80, dtype: int64

In [18]:
# Inspect the dtype pandas inferred for each column of the Kaggle test frame.
kaggle.dtypes

Id                int64
PID               int64
MS SubClass       int64
MS Zoning        object
Lot Frontage    float64
                 ...   
Misc Feature     object
Misc Val          int64
Mo Sold           int64
Yr Sold           int64
Sale Type        object
Length: 80, dtype: object

In [19]:
# Replace every space in the column labels with an underscore, mirroring the
# treatment applied to the training frame. regex=False makes this a literal
# substring replacement.
kaggle.columns = kaggle.columns.str.replace(' ', '_', regex=False)

In [20]:
# Lower-case every column label.
kaggle.columns = [label.lower() for label in kaggle.columns]

In [21]:
# Same rename as applied to the training frame. By default DataFrame.rename
# silently skips labels that are not present.
# NOTE(review): the column summary displayed for this frame lists 80 columns
# ending at 'Sale Type', so there may be no 'saleprice' column here and this
# may be a no-op kept for parity -- confirm.
kaggle = kaggle.rename(columns={'saleprice': 'sale_price'})

In [22]:
# Combined bathroom count. A half bath adds 1 to the total here, exactly like
# a full bath -- it is a plain element-wise sum of the two columns.
full_baths = kaggle['full_bath']
half_baths = kaggle['half_bath']
kaggle['total_bath'] = full_baths + half_baths

In [23]:
# Encode central_air as an integer flag: 'N' (ignoring surrounding whitespace)
# becomes 0, every other value becomes 1.
# NOTE(review): this cell cannot be run twice -- once the column holds ints,
# calling .strip() on them raises AttributeError.
kaggle['central_air'] = kaggle['central_air'].apply(
    lambda flag: int(flag.strip() != 'N')
)

In [24]:
# Distinct values left in central_air after encoding.
kaggle['central_air'].unique()

array([0, 1], dtype=int64)

In [25]:
# Fit a fresh linear regression on the full X / y; fit() returns the
# estimator itself, so construction and fitting can be chained.
mlr = LinearRegression().fit(X, y)

# Predictions on the same rows the model was fitted to.
y_pred = mlr.predict(X)

In [26]:
# Select from the Kaggle frame exactly the columns the model was fitted on,
# in the same order as X.
kaggle_subset = kaggle.loc[:, X.columns]

# Predicted sale price for each Kaggle row.
kaggle_preds = mlr.predict(kaggle_subset)

In [27]:
# Assemble the submission table: one row per Kaggle house, holding its id and
# the predicted sale price.
# NOTE(review): the 'Id' / 'SalePrice' headers presumably match the sample
# submission layout -- confirm.
submission_columns = {
    'Id': kaggle['id'],
    'SalePrice': kaggle_preds,
}
preds_df = pd.DataFrame(submission_columns)

# Preview the first three rows.
preds_df.head(3)

Unnamed: 0,Id,SalePrice
0,2658,173665.465897
1,2718,183414.182333
2,2414,221536.061283


In [31]:
# Write the predictions to a CSV in the working directory; index=False leaves
# the row index out of the file so it contains only the Id and SalePrice columns.
preds_df.to_csv('kaggle_822.csv', index=False)

In [30]:
# Print a summary of the submission frame: index range, columns, non-null
# counts and dtypes.
preds_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         878 non-null    int64  
 1   SalePrice  878 non-null    float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB
