In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
%matplotlib inline


In [2]:
data = pd.read_csv('AmesHousing.txt', delimiter='\t')

In [3]:
print(data.shape)
data.isnull().sum()

(2930, 82)


Order                0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       490
Lot Area             0
Street               0
Alley             2732
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type        23
Mas Vnr Area        23
Exter Qual           0
Exter Cond           0
                  ... 
Bedroom AbvGr        0
Kitchen AbvGr        0
Kitchen Qual         0
TotRms AbvGrd        0
Functional           0
Fireplaces           0
Fireplace Qu      1422
Garage Type        157
Garage Yr Blt      159
Garage Finish      159
Garage Cars          1
Garage Area          1
Garage Qual

In [4]:
def transform_features(df):
    null_totals = df.isnull().sum()
    #new_df = df[null_totals[null_totals/len(train)<0.25].index]
    new_df = df[null_totals[null_totals ==0].index]
    new_df.drop(['Order', 'PID'], axis=1, inplace=True)
    new_df['SalePrice'] = np.log1p(new_df['SalePrice'])
    
    dummy_cols = []
    cols = new_df.columns
    for col in cols:
        if new_df[col].dtype == 'int':
            new_df[col].fillna(new_df[col].mean())
        elif new_df[col].dtype == 'object':
            dummy = pd.get_dummies(new_df[col])
            new_df = pd.concat([new_df, dummy],axis=1)
            new_df.drop(col, axis=1, inplace=True) 
        
    return new_df
    

In [12]:
new_df = transform_features(data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Columns: 219 entries, MS SubClass to Partial
dtypes: float64(1), int64(25), uint8(193)
memory usage: 1.1 MB


In [7]:
new_df.head()

Unnamed: 0,MS SubClass,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,...,New,Oth,VWD,WD,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,20,31770,6,5,1960,1960,1656,0,0,1656,...,0,0,0,1,0,0,0,0,1,0
1,20,11622,5,6,1961,1961,896,0,0,896,...,0,0,0,1,0,0,0,0,1,0
2,20,14267,6,6,1958,1958,1329,0,0,1329,...,0,0,0,1,0,0,0,0,1,0
3,20,11160,7,5,1968,1968,2110,0,0,2110,...,0,0,0,1,0,0,0,0,1,0
4,60,13830,5,5,1997,1998,928,701,0,1629,...,0,0,0,1,0,0,0,0,1,0


In [8]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Columns: 219 entries, MS SubClass to Partial
dtypes: float64(1), int64(25), uint8(193)
memory usage: 1.1 MB


In [9]:
new_df.isnull().sum()

MS SubClass        0
Lot Area           0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area        0
Full Bath          0
Half Bath          0
Bedroom AbvGr      0
Kitchen AbvGr      0
TotRms AbvGrd      0
Fireplaces         0
Wood Deck SF       0
Open Porch SF      0
Enclosed Porch     0
3Ssn Porch         0
Screen Porch       0
Pool Area          0
Misc Val           0
Mo Sold            0
Yr Sold            0
SalePrice          0
A (agr)            0
C (all)            0
FV                 0
I (all)            0
                  ..
Gd                 0
Po                 0
TA                 0
Maj1               0
Maj2               0
Min1               0
Min2               0
Mod                0
Sal                0
Sev                0
Typ                0
N                  0
P                  0
Y                  0
COD                0
CWD                0
Con          

In [10]:
def select_features(df):
    return df[['Gr Liv Area', 'SalePrice']]

In [22]:

def train_and_test(df):
    X = df.drop('SalePrice', axis=1)
    y = df['SalePrice']
    kf = KFold(n_splits=10, shuffle=True)
    lr = LinearRegression()
    mses = cross_val_score(lr, X, y, 
                           scoring='mean_squared_error',cv=kf)

    rmses = [(abs(mse)**.5) for mse in mses]
    #avg_rmse = np.mean(rmses)
    
    scores = cross_val_score(lr, X, y, cv=kf)
    
    
    
    print('RMSE:{}'.format(rmses))
    print('Average RMSE: {:.2f}'.format(np.mean(rmses)))
    print('Cross-Val Standard_Scores: {}'.format(scores))
    print('Average Cross-Val Standard: {}'.format(scores.mean()))
    
    
    
    
    

In [23]:
train_and_test(new_df)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


RMSE:[0.17606397211652897, 66.872574906831332, 0.17250741532006977, 14.263171244243427, 0.12991242059187075, 0.12595492655733936, 0.17414700809091405, 0.14794777253665473, 0.11887064786061252, 929.49426672087395]
Average RMSE: 101.17
Cross-Val Standard_Scores: [ 0.90106381  0.78594415  0.88430367  0.90795744  0.72824998  0.89062961
  0.90405678  0.92143667  0.85730713  0.91792324]
Average Cross-Val Standard: 0.869887248448947
