In [None]:
# python version: python3

In [1]:
import os
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

## DATA EXPLORATION & PREPARATION

### part a

In [2]:
train_df = pd.read_csv('train.csv')                       #open the file and 
train_x_a =train_df.drop("SalePrice", axis=1)             #for prediction made 2 dataframe one is label one is feauture
train_y = pd.DataFrame(train_df["SalePrice"])

In [3]:
train_x_a.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


In [4]:
train_y.head(2)

Unnamed: 0,SalePrice
0,208500
1,181500


### part b

In [5]:
train_y.isnull().sum()     #its check the label about null values if there is exist that rows will be delete

SalePrice    0
dtype: int64

In [6]:
nan_columns = [train_x_a.isnull().sum()]     #detect features columns which has nan values
train_x_b =train_x_a.fillna(train_x_a.median())       #filled missed values with the mean of that corresponding columns

In [7]:
print(nan_columns)

[MSSubClass        0
LotFrontage     173
LotArea           0
Street            0
OverallQual       0
OverallCond       0
MasVnrArea        6
BsmtFinSF1        0
BsmtUnfSF         0
TotalBsmtSF       0
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageCars        0
GarageArea        0
MoSold            0
dtype: int64]


In [8]:
train_x_b.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,...,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold
0,60,65.0,8450,Pave,7,5,196.0,706,150,856,...,0,2,1,3,1,8,0,2,548,2
1,20,80.0,9600,Pave,6,8,0.0,978,284,1262,...,1,2,0,3,1,6,1,2,460,5


### part c

In [9]:
categorical_columns = [train_x_b.select_dtypes(include=['object']).columns]   
#finding the categorical columns with dtype which is object

In [10]:
print(categorical_columns)

[Index(['Street'], dtype='object')]


### part d

In [11]:
train_x_d =pd.get_dummies(train_x_b)
#its get the categorical columns and added end of the df with their categories its one-hot encoding 

In [12]:
train_x_d.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Street_Grvl,Street_Pave
0,60,65.0,8450,7,5,196.0,706,150,856,856,...,1,3,1,8,0,2,548,2,0,1
1,20,80.0,9600,6,8,0.0,978,284,1262,1262,...,0,3,1,6,1,2,460,5,0,1


### part e

In [13]:
train_x_e = preprocessing.StandardScaler().fit_transform(train_x_d) #all columns are standardized
train_x_e = pd.DataFrame(train_x_e, columns=train_x_d.columns) 
#after standardization it became numpy array and it converted to dataframe

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [14]:
train_x_e.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Street_Grvl,Street_Pave
0,0.073872,-0.235351,-0.196474,0.6329,-0.529618,0.464035,0.589782,-0.939672,-0.486827,-0.802481,...,1.230454,0.178216,-0.208547,0.935889,-0.939129,0.316364,0.357524,-1.601265,-0.063372,0.063372
1,-0.873204,0.475965,-0.095659,-0.090414,2.177118,-0.576236,1.204925,-0.638291,0.475863,0.280104,...,-0.773542,0.178216,-0.208547,-0.307817,0.600426,0.316364,-0.063938,-0.485919,-0.063372,0.063372


## LINEAR REGRESSION TO PREDICT HOUSE PRICES

### part f

In [15]:
lr_model =linear_model.LinearRegression()   
lr_model.fit(train_x_e, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
mse = mean_squared_error(train_y, lr_model.predict(train_x_e))
print(mse)

1040038040.0328163


### part g

In [17]:
train_mse_score =cross_val_score(lr_model, train_x_e, train_y, cv=5, scoring="neg_mean_squared_error") #calculation mse 
                                                                                                       #using lr_model
print(train_mse_score)

[-9.65633604e+08 -7.52441904e+08 -1.66781305e+09 -1.46841147e+09
 -8.84970495e+08]


In [18]:
average_mse = train_mse_score.mean()      #taked mean of mse
print(average_mse)

-1147854103.2282043


### part h

In [19]:
test_df = pd.read_csv('test.csv')  
test_x_a = test_df.drop("SalePrice", axis=1)             #same instrustions as first part
test_y = pd.DataFrame(test_df["SalePrice"])

In [20]:
test_y.isnull().sum() #its check the label about null values if there is exist that rows will be delete

SalePrice    0
dtype: int64

In [21]:
nan_columns_test_x_a = [test_x_a.isnull().sum()]     #detect features columns which has nan values
test_x_a =test_x_a.fillna(train_x_a.median())       #filled missed values with the mean of that corresponding columns

In [22]:
categorical_columns_test_x = [test_x_a.select_dtypes(include=['object']).columns]   
#finding the categorical columns with dtype which is object

In [23]:
test_x_a =pd.get_dummies(test_x_a)
#its get the categorical columns and added end of the df with their categories its one-hot encoding 

In [24]:
test_x = preprocessing.StandardScaler().fit_transform(test_x_a) #all columns are standardized
test_x = pd.DataFrame(test_x, columns=test_x_a.columns) 
#after standardization it became numpy array and it converted to dataframe

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [25]:
test_x.head(2)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,MasVnrArea,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,GarageArea,MoSold,Street_Grvl,Street_Pave
0,-0.871178,0.15847,0.011886,-2.204028,-2.275158,-0.563571,-0.908588,-1.300225,-2.217961,-0.563416,...,-0.73653,-1.088257,-0.218714,-1.547027,-0.978492,0.30186,0.249123,0.236471,-0.066082,0.066082
1,-0.635309,-0.427043,-0.825874,-0.755667,0.401498,-0.563571,-0.908588,0.287067,-0.756925,-1.180596,...,-0.73653,-1.088257,-0.218714,-1.547027,-0.978492,-1.020574,-1.143224,-1.96104,-0.066082,0.066082


In [26]:
test_y.head(2)

Unnamed: 0,SalePrice
0,82000
1,86000


### part i

In [27]:
predicted_values = lr_model.predict(test_x)   #prediction of lr model on test_X

In [28]:
print(predicted_values[10:13])

[[129384.59716894]
 [ 83861.0213064 ]
 [151750.6953265 ]]


In [29]:
test_mse_score =mean_squared_error(test_y, lr_model.predict(test_x))  #calcualtion of mse score on test
print(test_mse_score)

1895947941.0810091


## CLASSIFICATION MODEL TO PREDICT HOUSE PRICE CATEGORY

### part j

In [30]:
train_y_j=pd.DataFrame(train_y)
for i in train_y_j.SalePrice:
    if i < 100000:
        train_y_j=train_y_j.replace(i,1)
    elif i>= 100000 and i<200000:              #changing SalePrice to categorical values as 1,2,3,4,5 
        train_y_j=train_y_j.replace(i,2)
    elif i >= 200000 and i<300000:
        train_y_j=train_y_j.replace(i,3)
    elif i>= 300000 and i<400000:
        train_y_j=train_y_j.replace(i,4)
    elif i>=  400000:
        train_y_j=train_y_j.replace(i,5)

In [31]:
test_y_j=pd.DataFrame(test_y)
for i in test_y_j.SalePrice:
    if i < 100000:
        test_y_j=test_y_j.replace(i,1)
    elif i>= 100000 and i<200000:
        test_y_j=test_y_j.replace(i,2)
    elif i >= 200000 and i<300000:
        test_y_j=test_y_j.replace(i,3)
    elif i>= 300000 and i<400000:                     #samely for test
        test_y_j=test_y_j.replace(i,4)
    elif i>=  400000:
        test_y_j=test_y_j.replace(i,5)


In [32]:
train_y_j.head(2)

Unnamed: 0,SalePrice
0,3
1,2


In [33]:
test_y_j.head(2)

Unnamed: 0,SalePrice
0,1
1,1


### part k

In [34]:
from sklearn.linear_model import SGDClassifier

model =SGDClassifier(max_iter=5, random_state=42)    #multiclass classification model on train
model.fit(train_x_e, train_y_j)
model.predict(test_x)

  y = column_or_1d(y, warn=True)


array([1, 1, 3, 2, 2, 2, 2, 1, 3, 2, 2, 2, 2, 1, 2, 3, 3, 3, 2, 2, 2, 3,
       2, 2, 3, 2, 2, 3, 2, 2, 2, 5, 3, 3, 2, 1, 3, 3, 2, 1, 3, 2, 2, 3,
       3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 3,
       2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       2, 3, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, 2, 2, 3, 3, 3, 2, 3,
       3, 2, 2, 2, 2, 3, 3, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 3, 3, 2, 3, 2,
       2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 3, 1, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 3, 3, 2, 2, 3, 2, 2, 2, 2, 3, 3, 2, 4, 2, 2, 2, 2, 2, 3,
       2, 2, 2, 1, 3, 2, 5, 2, 2, 2, 2, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 3, 2, 2, 2, 3, 2, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 3, 1, 2,
       2, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 3, 3, 3,
       3, 3, 2, 3, 3, 2, 3, 2, 3, 2, 2, 3, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2,
       2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 3, 3, 2, 2, 3, 2,
       2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 5, 2, 3,

### part l

In [35]:
from sklearn.model_selection import cross_val_predict
predict_train=cross_val_predict(model, train_x_e, train_y_j, cv=5)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [36]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
#calculation of accuracy, precision, recall,  f1 score and confusion matrix
accuracy = accuracy_score(train_y_j, predict_train)
precision = precision_score(train_y_j, predict_train, average="micro")
recall = recall_score(train_y_j, predict_train, average="micro")
f1 = f1_score(train_y_j, predict_train, average="micro")
confusion_matrix=confusion_matrix(train_y_j, predict_train)

In [37]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[ 30  46   1   0   0]
 [ 12 524  76   2   0]
 [  1  80 124  16   0]
 [  0   4  45  17   1]
 [  0   0  11   8   2]]
accuracy: 0.697
precision: 0.697
recall: 0.697
f1: 0.697


### part m

In [38]:
predicted_values =model.predict(test_x)

In [39]:
print(predicted_values[20:23])

[2 3 2]


In [40]:
from sklearn.metrics import confusion_matrix
# samely for testing
accuracy = accuracy = accuracy_score(test_y_j, predicted_values)
precision = precision_score(test_y_j, predicted_values, average="micro")
recall = recall_score(test_y_j, predicted_values, average="micro")
f1 = f1_score(test_y_j, predicted_values, average="micro")
confusion_matrix =confusion_matrix(test_y_j, predicted_values)

In [41]:
print(confusion_matrix)
print("accuracy: {}\nprecision: {}\nrecall: {}\nf1: {}".format(accuracy,precision,recall,f1)) 

[[ 18  19   0   0   0]
 [  4 247  43   1   2]
 [  0  27  72   0   0]
 [  0   1  19   0   0]
 [  0   0   5   1   1]]
accuracy: 0.7347826086956522
precision: 0.7347826086956522
recall: 0.7347826086956522
f1: 0.7347826086956522
