In [430]:
import sqlite3
import csv
import pandas as pd
from datetime import datetime

In [454]:
def read_sales_csv(filename):
    sales = []
    with open(filename) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            sales.append({'date': datetime.strptime(row['date'], '%m/%d/%y'),
                     'account_number': row['account_number'],
                     'price': int(row['price'])})
    return sales

conn = sqlite3.connect('NW_Central_OKC_w_land_area.db')
c = conn.cursor()

sales = read_sales_csv('sales_list_2019.csv')
#sales = sales + read_sales_csv('sales_list_2018.csv')
#sales = sales + read_sales_csv('sales_list_2017.csv')

In [455]:
# probably not the best way to do this, very slow
'''
for s in sales:
    t = (s['account_number'],)
    c.execute("SELECT * FROM realproperty WHERE account_number = ?",t)
    print(c.fetchone())
    '''

'\nfor s in sales:\n    t = (s[\'account_number\'],)\n    c.execute("SELECT * FROM realproperty WHERE account_number = ?",t)\n    print(c.fetchone())\n    '

In [456]:
combined_df = pd.read_sql_query("SELECT * FROM realproperty INNER JOIN buildings ON realproperty.id = buildings.local_property_id;", conn)
rp_df = pd.read_sql_query("SELECT * FROM realproperty WHERE property_type = 'Residential'", conn)
sqft_sums = combined_df.groupby(['account_number'])['sq_ft'].sum()

property_df_sb = rp_df.set_index('account_number')
property_df_sb['sqft_sum'] = sqft_sums
display(property_df_sb)

Unnamed: 0_level_0,id,propertyid,property_type,location,building_name_occupant,city,owner_name_1,owner_name_2,billing_address_1,billing_address_2,...,land_size,lot_width,lot_depth,land_value,quarter_section_description,subdivision,block,lot,legal_description,sqft_sum
account_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
R049810025,1,130116,Residential,400 NW 43RD ST,,OKLAHOMA CITY,YOUNGS STEVEN ROGERS,TURLEY ALLISON,400 NW 43RD ST,,...,13068.0,,,47449,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,001,001,EDGEMERE TERRACE 001 001,1824.0
R049640005,4,131004,Residential,5021 N WALKER AVE,,OKLAHOMA CITY,VEITCH JAMIE PER REP,BRADSHAW TIMOTHY JOE ESTATE,5021 N WALKER AVE,,...,9147.6,,,34216,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 1 & 2,1352.0
R049810050,7,130117,Residential,408 NW 43RD ST,,OKLAHOMA CITY,JUNGMAN TIMOTHY WILLIAM & JULIE LYN,,408 NW 43RD ST,,...,9147.6,,,41310,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,001,002,EDGEMERE TERRACE 001 002,2677.0
R049640010,11,131005,Residential,504 NW 50TH ST,,OKLAHOMA CITY,DICKSON JERRY ROBERT,,504 NW 50TH ST,,...,6969.6,,,26320,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 3 & 4,1386.0
R049810075,12,130118,Residential,412 NW 43RD ST,,OKLAHOMA CITY,STUCKY ANIKA C,,412 NW 43RD ST,,...,9147.6,,,41310,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,001,003,EDGEMERE TERRACE 001 003,1957.0
R049480080,13,130504,Residential,1009 NW 42ND ST,,OKLAHOMA CITY,SPEARS KENNETH L & REGINA,,210 NW 10TH ST,,...,7405.2,,,59378,Sect 16-T12N-R3W Qtr SW,CROWN HEIGHTS ADD,001,004,CROWN HEIGHTS ADD 001 004,1961.0
R049640015,14,131006,Residential,508 NW 50TH ST,,OKLAHOMA CITY,GARRETT & COMPANY LLC,,9701 BROADWAY EXT,,...,6969.6,,,26320,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 5 & 6,1092.0
R049810100,15,130119,Residential,418 NW 43RD ST,,OKLAHOMA CITY,BRADEN THOMAS GARY & BEVERLY JOAN,REVOCABLE LIVING TRUST,418 NW 43RD ST,,...,9147.6,,,41310,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,001,004,EDGEMERE TERRACE 001 004,2047.0
R054850009,16,132187,Residential,1528 NW 41ST ST,,OKLAHOMA CITY,MCMURREY MICHELE L,,1528 NW 41ST ST,,...,7405.2,,,23606,Sect 17-T12N-R3W Qtr SE,PUTNAM HEIGHTS ADD,00A,000,PUTNAM HEIGHTS ADD 00A 000 LOTS 1 2 & 3,1396.0
R049480100,18,130505,Residential,1005 NW 42ND ST,,OKLAHOMA CITY,DK INC,,C/O KENNETH L SPEARS,PO BOX 687,...,7405.2,,,59378,Sect 16-T12N-R3W Qtr SW,CROWN HEIGHTS ADD,001,005,CROWN HEIGHTS ADD 001 005,1660.0


In [457]:
# This filters for subdivision based on number of properties in that sub. Let's change it
# later to filter for number of transactions.
sublist = (property_df_sb[['subdivision']]
            .assign(count = 0)
            .groupby("subdivision")
            .count())
sublist = sublist.where(sublist['count'] >= 100).dropna()
sublist_names = sublist.index.values
#           where(property_df_sb.count(property_df_sb['subdivision']) > 20)
#           ['subdivision'].unique().dropna())
display(sublist_names)

'''
allowed_subs_list = [
    'EDGEMERE TERRACE',
    'DOUGLAS PLACE ADD',
    'CROWN HEIGHTS ADD'
]
'''
property_df_sb['subdivision'] = property_df_sb.apply(lambda x: 'NOT LISTED' if x['subdivision'] not in sublist_names else x['subdivision'], axis=1)
#display(property_df_sb)

array(['AURORA ADDITION', 'BAUMANS ADDITION', 'BELL VERN ADDITION',
       'CARLE & COLCORD ADDITION', 'CASHION PLACE ADDITION',
       'CASHION PLACE AMD', 'CENTRAL PARK ADDITION',
       'CLASSENS CREAM RIDGE', 'COLLEGE ADDITION', 'CRESTWOOD ADDITION',
       'CROWN HEIGHTS ADD', 'DARRALLS SECOND ADD', 'DENNISTON PARK ADD',
       'DOUGLAS PLACE ADD', 'EDGEMERE PARK ADD', 'EDGEMERE TERRACE',
       'EPWORTH VIEW ADD', 'GARDEN ADDITION', 'GATEWOOD ADDITION',
       'GRAND VIEW ADDITION', 'GUERNSEY PARK PLACE',
       'HEMINGWAY CONDOMINIUMS', 'INGLESIDE TO OKLA CITY',
       'JEFFERSON PARK ADD', 'LAS VEGAS ADDITION',
       'LINWOOD PLACE AMENDED', 'LINWOOD PLACE SECOND',
       'LYONS WILL ROGERS PK', 'MAYFAIR HEIGHTS BLKS 1 THRU 29',
       'MILAM PLACE ADDITION', 'MILITARY PARK ADD', 'MILLERS BOULEVARD',
       'MORRIS WILL ROGER PARK', 'NEAS ADDITION', 'PACKINGHOUSE PARK',
       'PARKER & COLCORD ADD', 'PUTNAM HEIGHTS 2ND', 'PUTNAM HEIGHTS ADD',
       'RAVENSWOOD ADDITION', 'RI

In [458]:
#sales_df = pd.read_csv('sales_list_2019.csv')
sales_df = pd.DataFrame(sales)
display(sales_df)

Unnamed: 0,account_number,date,price
0,R091477265,2019-08-16,25500
1,R149911385,2019-08-16,28500
2,R154061075,2019-08-15,150000
3,R146091230,2019-08-15,2010000
4,R116954795,2019-08-14,270000
5,R066544050,2019-08-13,20000
6,R172731010,2019-08-12,25000
7,R200731340,2019-08-12,0
8,R066307030,2019-08-10,20000
9,R145481200,2019-08-09,460000


In [459]:
merged_df = pd.merge(property_df_sb, sales_df, on='account_number', how='inner')
display(merged_df)

Unnamed: 0,account_number,id,propertyid,property_type,location,building_name_occupant,city,owner_name_1,owner_name_2,billing_address_1,...,lot_depth,land_value,quarter_section_description,subdivision,block,lot,legal_description,sqft_sum,date,price
0,R049640005,4,131004,Residential,5021 N WALKER AVE,,OKLAHOMA CITY,VEITCH JAMIE PER REP,BRADSHAW TIMOTHY JOE ESTATE,5021 N WALKER AVE,...,,34216,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 1 & 2,1352.0,2019-07-02,108000
1,R049640005,4,131004,Residential,5021 N WALKER AVE,,OKLAHOMA CITY,VEITCH JAMIE PER REP,BRADSHAW TIMOTHY JOE ESTATE,5021 N WALKER AVE,...,,34216,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 1 & 2,1352.0,2019-05-10,108000
2,R049810175,29,130122,Residential,436 NW 43RD ST,,OKLAHOMA CITY,BELFLOWER ADAM P & ALEXANDRA,,436 NW 43RD ST,...,,41310,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,001,007,EDGEMERE TERRACE 001 007,2018.0,2019-04-05,305000
3,R049640070,56,131015,Residential,5000 N LEE AVE,,OKLAHOMA CITY,JOHNSON MARK D & MOLLY JANE,,541 NW 47TH ST,...,,26320,Sect 16-T12N-R3W Qtr NW,DOUGLAS PLACE ADD,001,000,DOUGLAS PLACE ADD 001 000 LOTS 23 & 24,1429.0,2019-05-15,157000
4,R049820200,62,130383,Residential,305 NW 42ND ST,,OKLAHOMA CITY,CONIGLIONE ROBERT A,,305 NW 42ND ST,...,,87113,Sect 16-T12N-R3W Qtr SE,NOT LISTED,001,002,EDGEMERE HEIGHTS ADD 001 002,2541.0,2019-05-29,415000
5,R049810425,75,130132,Residential,415 NW 43RD ST,,OKLAHOMA CITY,CODDOU STEPHEN,CODDOU SHARON,415 NW 43RD ST,...,,38556,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,002,007,EDGEMERE TERRACE 002 007,2097.0,2019-06-05,343500
6,R049810425,75,130132,Residential,415 NW 43RD ST,,OKLAHOMA CITY,CODDOU STEPHEN,CODDOU SHARON,415 NW 43RD ST,...,,38556,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,002,007,EDGEMERE TERRACE 002 007,2097.0,2019-05-06,343500
7,R049810550,94,130137,Residential,412 NW 44TH ST,,OKLAHOMA CITY,PENNINGTON PARKER B,,412 NW 44TH ST,...,,33048,Sect 16-T12N-R3W Qtr NE,EDGEMERE TERRACE,002,012,EDGEMERE TERRACE 002 012,1416.0,2019-03-14,182500
8,R049820900,97,130390,Residential,300 NW 42ND ST,,OKLAHOMA CITY,CARTER JOHN B & LINDA S,,300 NW 42ND ST,...,,84911,Sect 16-T12N-R3W Qtr SE,NOT LISTED,002,001,EDGEMERE HEIGHTS ADD 002 001,2115.0,2019-07-11,361000
9,R049821000,100,130391,Residential,4211 N HARVEY PKWY,,OKLAHOMA CITY,SCHUNK JAIME LYNN,SCHUNK BRADLEY CLAYTON,4211 N HARVEY PARKWAY,...,,89249,Sect 16-T12N-R3W Qtr SE,NOT LISTED,002,002,EDGEMERE HEIGHTS ADD 002 002,2916.0,2019-05-09,510000


In [460]:
new_df = merged_df[['land_size', 'land_value', 'subdivision', 'sqft_sum', 'price']]

display(new_df)

Unnamed: 0,land_size,land_value,subdivision,sqft_sum,price
0,9147.6,34216,DOUGLAS PLACE ADD,1352.0,108000
1,9147.6,34216,DOUGLAS PLACE ADD,1352.0,108000
2,9147.6,41310,EDGEMERE TERRACE,2018.0,305000
3,6969.6,26320,DOUGLAS PLACE ADD,1429.0,157000
4,17859.6,87113,NOT LISTED,2541.0,415000
5,8276.4,38556,EDGEMERE TERRACE,2097.0,343500
6,8276.4,38556,EDGEMERE TERRACE,2097.0,343500
7,7405.2,33048,EDGEMERE TERRACE,1416.0,182500
8,15681.6,84911,NOT LISTED,2115.0,361000
9,16552.8,89249,NOT LISTED,2916.0,510000


In [461]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [462]:
le = preprocessing.LabelEncoder()
new_df['sub_encoded'] = le.fit_transform(new_df['subdivision'])
#print(X)
new_df = new_df.drop(columns=['subdivision']).dropna()
display(new_df)

X = new_df[['land_size', 'land_value', 'sub_encoded', 'sqft_sum']]
y = new_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,land_size,land_value,sqft_sum,price,sub_encoded
0,9147.6,34216,1352.0,108000,13
1,9147.6,34216,1352.0,108000,13
2,9147.6,41310,2018.0,305000,15
3,6969.6,26320,1429.0,157000,13
4,17859.6,87113,2541.0,415000,34
5,8276.4,38556,2097.0,343500,15
6,8276.4,38556,2097.0,343500,15
7,7405.2,33048,1416.0,182500,15
8,15681.6,84911,2115.0,361000,34
9,16552.8,89249,2916.0,510000,34


In [463]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)

In [464]:
knn.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [465]:
knn.score(X_test, y_test)

0.5335123443979806

In [466]:
test = knn.predict([[10000, 80000, 50, 3000]])

In [467]:
test[0]

403100.0

# Linear Regression

Now let's try this with linear regression rather than KNN.

In [468]:
from sklearn.linear_model import LinearRegression
print(X_train)

linreg = LinearRegression().fit(X_train, y_train)

     land_size  land_value  sub_encoded  sqft_sum
263     7840.8       31040           30    1414.0
833     6969.6       68740           51    2244.0
39      6534.0       30487           15    1291.0
300     6969.6       18760           16    1093.0
565     6098.4       16897           34     888.0
18     12632.4       39564           38    2267.0
320     6098.4       32455           34    1295.0
156    17424.0       35559           34    1602.0
223     7405.2       22545           28    1385.0
521     8276.4       15568           34     952.0
211        1.0        4253           34     624.0
693     8276.4       95618           57    2360.0
50      6969.6       16286           34    1241.0
689     7405.2       25050           31    1718.0
879     6534.0       24998            9    1708.0
154     6969.6       17774           34     845.0
669     7405.2       25050           34    1615.0
122     6969.6       16943           34    1361.0
893     6098.4       19860            1    1016.0


In [469]:
linreg.predict([[10000,80000,1,1000]])
linreg.predict([[10000,80000,1,3000]])
linreg.predict([[10000,80000,1,5000]])

array([571886.87315061])

In [470]:
linreg.score(X_test, y_test)

0.6306570797987696

# One Hot Encoding

Let's change the subdivision feature to one hot encoding

In [450]:
new_df = (merged_df[['land_size', 'land_value', 'subdivision', 'sqft_sum', 'price']][merged_df.price != 0]
          .dropna()
          .reset_index()
          .drop(columns=['index']))
#new_df = new_df[new_df.price != 0]

ohenc = preprocessing.OneHotEncoder()
ohenc.fit([[x] for x in new_df['subdivision']])
ohenc.categories_
encoded = ohenc.transform([[x] for x in new_df['subdivision']]).toarray()
encoded_df = pd.DataFrame(encoded, columns=["Subdivision_"+str(i) for i in range(encoded.shape[1])])
#display(encoded_df)
#new_df = pd.get_dummies(new_df, prefix=['subdivision'])

new_df = new_df.drop(columns=['subdivision'])
new_df = pd.concat([new_df, encoded_df], axis=1)
display(new_df)

Unnamed: 0,land_size,land_value,sqft_sum,price,Subdivision_0,Subdivision_1,Subdivision_2,Subdivision_3,Subdivision_4,Subdivision_5,...,Subdivision_49,Subdivision_50,Subdivision_51,Subdivision_52,Subdivision_53,Subdivision_54,Subdivision_55,Subdivision_56,Subdivision_57,Subdivision_58
0,9147.6,34216,1352.0,108000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,9147.6,34216,1352.0,108000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9147.6,41310,2018.0,305000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4791.6,15950,1626.0,168500,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9147.6,41310,2834.0,475000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6969.6,26320,1429.0,157000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6969.6,24500,1718.0,225000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,17859.6,87113,2541.0,415000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8276.4,38556,2097.0,343500,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,8276.4,38556,2097.0,343500,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [451]:
X = new_df.drop('price',axis=1)
y = new_df['price']
#display(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
linreg = LinearRegression().fit(X_train, y_train)
#display(X_test)
linreg.score(X_test, y_test)

0.5786024275710955

In [452]:
import numpy as np
ch_oh = ohenc.transform([["EDGEMERE TERRACE"]]).toarray()
ch_args = [10000, 40000, 3000] + ch_oh.tolist()[0]
ch_args = [6400, 34790, 1500] + ch_oh.tolist()[0]
#ch_args = ch_oh.tolist()
linreg.predict([ch_args])

#linreg.predict([[10000,80000,50,5000]])

# WEST END SUB DIV ADD returns really bizarre results...

array([210510.29248047])

In [453]:
[1,2,3] + [4,5,6]

[1, 2, 3, 4, 5, 6]