In [107]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import statsmodels.formula.api as smf

In [108]:
#Load data and specific set of columns of interest
df = pd.read_csv('nyc-rolling-sales.csv')
df_sales = df.loc[(df['SALE PRICE'].replace(' -  ','0').astype(float) > 0.00),['BOROUGH','RESIDENTIAL UNITS','COMMERCIAL UNITS','GROSS SQUARE FEET','LAND SQUARE FEET','SALE PRICE']]


In [109]:

#SALE PRICE, LAND AND GROSS SQUARE FEET, AND RESIDENTIAL UNITS ARE object but should be numeric
df_sales['SALE PRICE'] = pd.to_numeric(df_sales['SALE PRICE'], errors='coerce')
df_sales['LAND SQUARE FEET'] = pd.to_numeric(df_sales['LAND SQUARE FEET'], errors='coerce')
df_sales['GROSS SQUARE FEET']= pd.to_numeric(df_sales['GROSS SQUARE FEET'], errors='coerce')
df_sales['RESIDENTIAL UNITS']= pd.to_numeric(df_sales['RESIDENTIAL UNITS'], errors='coerce')
df_sales['log_sale_price']=np.log(df_sales['SALE PRICE'])
df_sales['GSF_PER_RU']=df_sales['GROSS SQUARE FEET']/df_sales['RESIDENTIAL UNITS']

   
#Remove outliers
df_sales_nonoutliers = df_sales[(df_sales['SALE PRICE'] > 100000) & (df_sales['SALE PRICE'] < 5000000) & (df_sales['RESIDENTIAL UNITS'] > 0) & (df_sales['GROSS SQUARE FEET'] > 0) & (df_sales['GROSS SQUARE FEET'] < 10000)]

#drop gross square feet and residential units now that GSF_PER-RU is created
df_sales_nonoutliers.drop(columns=['GROSS SQUARE FEET','RESIDENTIAL UNITS'], axis=1, inplace=True)
 
    
#Drop non-numeric rows
df_sales_nonoutliers = df_sales_nonoutliers.select_dtypes(include=[np.number]).dropna()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [110]:
#Split into training and test set

#Set y-variable
y_output_log_sale = df_sales_nonoutliers['log_sale_price']
y_output_sale = df_sales_nonoutliers['SALE PRICE']

#Create x-variables set
df_sales_nonoutliers.drop(columns=['log_sale_price','SALE PRICE'], axis=1, inplace=True)


#Split the data
train_inp, test_inp, train_out, test_out =  train_test_split(df_sales_nonoutliers, y_output_log_sale, test_size=0.2)


In [118]:
#dataframe transformer
def transform_sales(X):

    #X.drop(columns=['GROSS SQUARE FEET','RESIDENTIAL UNITS'], axis=1, inplace=True)
    
    X['Less_1500'] = np.where(X['GROSS SQUARE FEET'] <= 1500, 1, 0)
    X['In_City'] = np.where(X['BOROUGH'] == 'Manhattan', 1, 0)
    X['Is_Big_Building'] = np.where(X['RESIDENTIAL UNITS'] >= 3, 1, 0)
    X['Is_Luxury'] = np.where(X['SALE PRICE'] >= 2000000, 1, 0)
    X['Is_Affordable'] = np.where(X['SALE PRICE'] <= 1000000, 1, 0)
    X['hasBusiness'] = np.where(X['COMMERCIAL UNITS'] >= 1, 1, 0)
    X['Affordable_BigBuilding'] = X['Is_Affordable'] * X['Is_Big_Building'] 
    X['Log_Sale_Price'] = np.log(X['SALE PRICE'])
    X['PPSF'] = X['SALE PRICE']/X['GROSS SQUARE FEET']
    X['PPRU'] = X['SALE PRICE']/X['RESIDENTIAL UNITS']
    X['Log_GSF'] = np.log(X['GROSS SQUARE FEET'])
    X['Log_LSF'] = np.log(X['LAND SQUARE FEET'])

    return X

In [119]:



#vectorizer = TfidfVectorizer(stop_words='english')
#svd = TruncatedSVD(dim)
#classifier = NaiveBayes()

transformer = FunctionTransformer(transform_sales)
scaler =  StandardScaler(copy=True, with_mean=True, with_std=True)
sklearn_pca = PCA(n_components=3)
regressor = LinearRegression()


pipe = Pipeline([
    ('transform', transformer),
    ('scale', scaler),
    #('pca', sklearn_pca),
    ('regress', regressor)
])

pipe.fit(train_inp, train_out)
#trains.append((dim, f1_score(pipe.predict(train_inp), train_out)))
#tests.append((dim, f1_score(pipe.predict(test_inp), test_out)))

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [116]:
print(r2_score(pipe.predict(test_inp), test_out))

-9.984769157857585


In [117]:
train_inp

Unnamed: 0,BOROUGH,COMMERCIAL UNITS,LAND SQUARE FEET,GSF_PER_RU
36014,3,0,2000.0,1030.000000
84544,5,0,2498.0,1188.500000
78078,5,0,2700.0,700.000000
77946,5,0,4242.0,860.000000
49995,4,0,2000.0,799.000000
78523,5,0,4200.0,1599.000000
81456,5,0,5292.0,1822.000000
37187,3,0,1512.0,1080.000000
20563,2,0,1849.0,2053.000000
38163,3,0,2050.0,808.000000
