In [30]:
import pandas as pd

# Putting AmesHousing.txt data into a dataframe - https://github.com/marcelpinheiro/malum/blob/master/AmesHousing.txt
data = pd.read_csv('AmesHousing.txt', encoding='UTF-8', delimiter='\t')
data = data.fillna(0) #Replace all Nan values to 0

data



Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,0,IR1,Lvl,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,0,Reg,Lvl,...,0,0,MnPrv,0,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,0,IR1,Lvl,...,0,0,0,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,0,Reg,Lvl,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,0,IR1,Lvl,...,0,0,MnPrv,0,0,3,2010,WD,Normal,189900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,2926,923275080,80,RL,37.0,7937,Pave,0,IR1,Lvl,...,0,0,GdPrv,0,0,3,2006,WD,Normal,142500
2926,2927,923276100,20,RL,0.0,8885,Pave,0,IR1,Low,...,0,0,MnPrv,0,0,6,2006,WD,Normal,131000
2927,2928,923400125,85,RL,62.0,10441,Pave,0,Reg,Lvl,...,0,0,MnPrv,Shed,700,7,2006,WD,Normal,132000
2928,2929,924100070,20,RL,77.0,10010,Pave,0,Reg,Lvl,...,0,0,0,0,0,4,2006,WD,Normal,170000


Como você pode ver, nós temos um dataframe com 2930 linhas x 82 colunas. Mas como o Sklearn não trabalha muito bem com strings, nós temos que converter estas colunas/atributos de texto para números (int) - Veja https://malum.com.br/wp/2019/11/24/normalizando-dados-com-sklearn/


As you can see, we have a dataframe with 2930 rows × 82 columns. But since Sklearn doesn´t work well with strings, we have to convert thoses columns/features that is text to numbers (int) - See https://malum.com.br/wp/2019/11/24/normalizando-dados-com-sklearn/ 



In [35]:

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False, dtype=int)

VecValues = vec.fit_transform(data.to_dict(orient='records')) #To work right, we have to convert our dataframe to a dict
VecColNames = vec.get_feature_names()

#Converting the string features/columns into int and put into a dataframe (again)
df = pd.DataFrame(VecValues, columns = VecColNames) 

df

Unnamed: 0,1st Flr SF,2nd Flr SF,3Ssn Porch,Alley,Alley=Grvl,Alley=Pave,Bedroom AbvGr,Bldg Type=1Fam,Bldg Type=2fmCon,Bldg Type=Duplex,...,Street=Pave,TotRms AbvGrd,Total Bsmt SF,Utilities=AllPub,Utilities=NoSeWa,Utilities=NoSewr,Wood Deck SF,Year Built,Year Remod/Add,Yr Sold
0,1656,0,0,0,0,0,3,1,0,0,...,1,7,1080,1,0,0,210,1960,1960,2010
1,896,0,0,0,0,0,2,1,0,0,...,1,5,882,1,0,0,140,1961,1961,2010
2,1329,0,0,0,0,0,3,1,0,0,...,1,6,1329,1,0,0,393,1958,1958,2010
3,2110,0,0,0,0,0,3,1,0,0,...,1,8,2110,1,0,0,0,1968,1968,2010
4,928,701,0,0,0,0,3,1,0,0,...,1,6,928,1,0,0,212,1997,1998,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,1003,0,0,0,0,0,3,1,0,0,...,1,6,1003,1,0,0,120,1984,1984,2006
2926,902,0,0,0,0,0,2,1,0,0,...,1,5,864,1,0,0,164,1983,1983,2006
2927,970,0,0,0,0,0,3,1,0,0,...,1,6,912,1,0,0,80,1992,1992,2006
2928,1389,0,0,0,0,0,2,1,0,0,...,1,6,1389,1,0,0,240,1974,1975,2006


Você pode observar que agora temos muito mais colunas! Mas por que? O DictVectorizer criará uma nova coluna para cada opção de uma coluna. Exemplo: a coluna Utilities possui 3 opções: AllPub, NoSeWa e NoSewr. Então, três colunas foram criadas: 'Utilities = AllPub', 'Utilities = NoSeWa' e 'Utilities = NoSewr'


You can observe that now we have a lot more columns! But Why?
DictVectorizer will create a new column for each option of a column. Example: Utilities column has 3 options: AllPub, NoSeWa and NoSewr. So 3 columns were created: **'Utilities=AllPub', 'Utilities=NoSeWa' and 'Utilities=NoSewr'**

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

#Here we are splitting our data with 2 pieces: train and test. Test will have 33% of data; train will have all the rest
test, train = train_test_split(df,test_size=0.33, random_state=42)


testSalePrice = test['SalePrice']
trainSalePrice = train['SalePrice']

train = train.drop(['SalePrice'], axis=1)
test = test.drop(['SalePrice'], axis=1)

model = LinearRegression()

model.fit(train, trainSalePrice) 

predict = model.predict(test)

predict

array([219788.95359128, 209115.82993554, 134401.6021712 , ...,
       210418.01853877, 156653.74857343, 146886.61122666])

Excellent, we have our prediction Sale Prices! But how can we measure how accuracy is this prediction? Using the mean_squared_error from sklearn

In [33]:
from sklearn.metrics import mean_squared_error
import numpy as np

MSE = mean_squared_error(predict,testSalePrice)
RMSE = np.sqrt(MSE) 
print('MSE:',MSE,'RMSE:',RMSE)

MSE: 1114197668.6919012 RMSE: 33379.59958854961
