Notebook to automate regression analysis techniques in order to create a system that selects the best model iteratively

In [1]:
import pandas as pd

In [2]:
def load_data(data_dir, file_name):
    return pd.read_csv(data_dir + file_name)

In [3]:
def select_initial_feature(df):
    #create bivariate models using best R2
    #select best feature based on highest univar R2
    return initial_feature_name

In [4]:
def rank_all_features(df, initial_feature_name, method):
    #create feature ranking list with one of multiple methods e.g. Shapiro, Lasso, Ridge, etc.
    #force the initial_feature_name into position 1 in list
    return ranked_features_list

In [5]:
def select_next_best_feature(df, ranked_features_list, number_of_features):
    #create n models with initial feature and n next best features based on parameter 'number of features'
    #select the best model in terms of RMSE
    #return the features used by best model, and model type
    return best_features, model_type

Building Simple Scalable Solution with Spark

In [6]:
import findspark
findspark.init()

In [7]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.sql import SparkSession

In [8]:
spark = SparkSession.builder.master("local").appName("Regression Scalability")\
    .config("spark.some.config.option", "some-value")\
    .getOrCreate()

In [9]:
#sc = spark.sparkContext

Applying data transformations from regressionDiagnostics nb - these might end up changing

In [10]:
import pandas as pd
import numpy as np
from itertools import combinations

In [11]:
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.model_selection import KFold,cross_val_score, cross_val_predict

In [12]:
listings = pd.read_csv('listings_augmented_2018-05-06_V1.csv',low_memory=False)

listings = listings.drop(columns=['listing_id','id','scrape_id','host_id',
                                  'zipcode','has_availability','license',
                                  'Unnamed: 0', 'thumbnail_url', 'medium_url',
                                  'picture_url', 'xl_picture_url', 'host_url',
                                  'host_name', 'host_thumbnail_url', 'host_picture_url',
                                  'price_x', 'amenities', 'amenities_set', 'bed_type'
                                 ])

y = listings['price_y'].fillna(listings['price_y'].mean())

X = listings.copy()
#NB: not dropping the price in this nb because it is needed by Spark's ML methods
#X = X.drop(columns='price_y')

X_num = X.select_dtypes(include=['int64','float64'])
X_num = X_num.replace([np.inf, -np.inf], np.nan)
X_num = X_num.fillna(X_num.mean())
X_1 = X_num[X_num.columns.drop(X_num[list(X_num.filter(regex='Topic'))])]
data_scaled = pd.DataFrame(preprocessing.scale(X_1),columns = X_1.columns) 

In [13]:
"""colData = sc.parallelize(data_scaled.columns)
def rSubset(row):
     
    # return list of all subsets of length r
    # to deal with duplicate subsets use 
    # set(list(combinations(arr, r)))
    combos = []
    for i in range(len(data_scaled.columns)-1):
        combos.append(combinations(row, i))
    return combos

combos = []
for i in range(len(data_scaled.columns)):
    combos.append(list(combinations(data_scaled.columns, i)))

colData.map(rSubset).flatMap(lambda x: ''.join(x)).take(10)"""

"colData = sc.parallelize(data_scaled.columns)\ndef rSubset(row):\n     \n    # return list of all subsets of length r\n    # to deal with duplicate subsets use \n    # set(list(combinations(arr, r)))\n    combos = []\n    for i in range(len(data_scaled.columns)-1):\n        combos.append(combinations(row, i))\n    return combos\n\ncombos = []\nfor i in range(len(data_scaled.columns)):\n    combos.append(list(combinations(data_scaled.columns, i)))\n\ncolData.map(rSubset).flatMap(lambda x: ''.join(x)).take(10)"

In [14]:
R2s = []
for col in X_1.columns:
    linear_regression = linear_model.LinearRegression(normalize=False, fit_intercept=True)
    regression_results = linear_regression.fit(data_scaled[col].values.reshape(-1, 1),y)
    #scores = cross_val_score(regression_results, data_scaled[col].values.reshape(-1, 1), y, cv=6)
    predictions = cross_val_predict(regression_results, data_scaled[col].values.reshape(-1, 1), y, cv=6)
    #accuracy = metrics.r2_score(y, predictions)
    R2s.append((col, r2_score(y, predictions)))



In [15]:
uni_r2 = pd.DataFrame(R2s, columns = ['col','R2']).sort_values(by='R2', ascending=False)
pos_uni_r2 = uni_r2[uni_r2['R2']>0]

In [16]:
pos_uni_r2

Unnamed: 0,col,R2
34,price_y,1.0
7,accommodates,0.482612
9,bedrooms,0.414702
10,beds,0.335947
8,bathrooms,0.287745
32,calculated_host_listings_count,0.081065
311,"has""Cable TV""",0.063401
16,guests_included,0.038562
294,"has""Family/Kid Friendly""",0.03829
303,"has""Pets live on this property""",0.026199


In [17]:
candidates = ['accommodates','calculated_host_listings_count','has"Cable TV"','distance from ocean']
#candidates.remove("price_y")

In [18]:
from itertools import chain, combinations
def all_subsets(ss):
    return chain(*map(lambda x: combinations(ss, x), range(2, len(ss)+1)))
combos = []
for subset in all_subsets(candidates):
    combos.append(subset)

In [19]:
outputlist = [xs + ('price_y',) for xs in combos]

In [20]:
sparkDF = spark.createDataFrame(data_scaled[pos_uni_r2.col])

In [21]:
#Had loaded spark df from file - but prefer to do data manipulation in pandas and load ultimate df
#df = spark.read.format('csv').options(header='true', inferSchema='true').load("listings_augmented_2018-05-06_V1.csv")

In [22]:
sparkDF.printSchema()

root
 |-- price_y: double (nullable = true)
 |-- accommodates: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- calculated_host_listings_count: double (nullable = true)
 |-- has"Cable TV": double (nullable = true)
 |-- guests_included: double (nullable = true)
 |-- has"Family/Kid Friendly": double (nullable = true)
 |-- has"Pets live on this property": double (nullable = true)
 |-- distance from ocean: double (nullable = true)
 |-- has"Free Parking on Premises": double (nullable = true)
 |-- has"Indoor Fireplace": double (nullable = true)
 |-- reviews_per_month: double (nullable = true)
 |-- hasDryer: double (nullable = true)
 |-- number_of_reviews: double (nullable = true)
 |-- hasWasher: double (nullable = true)
 |-- hasDog(s): double (nullable = true)
 |-- hasTV: double (nullable = true)
 |-- has"Lock on Bedroom Door": double (nullable = true)



In [49]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.ml.evaluation import RegressionEvaluator
spark.version

u'2.3.0'

In [32]:
def split_data(data, train_size):
    (trainingData, testData) = data.randomSplit([train_size, 1-train_size])
    return trainingData, testData

In [44]:
def create_linear_regression(data):
    lr = LinearRegression(maxIter=10, regParam=0, elasticNetParam=0, fitIntercept=True)
    #lr.setFitIntercept(True)
    lrModel = lr.fit(data)
    return lrModel

In [50]:
def create_random_forest_regression(data):
    rf = RandomForestRegressor()
    rfModel = rf.fit(trainingData)
    return rfModel

In [51]:
def create_grad_boosted_regression(data):
    gbt = GBTRegressor(maxIter=10)
    gbtModel = gbt.fit(data)
    return gbtModel 

In [53]:
#Not sure if I'll do this one
def create_non_linear_regression(data):
    return nlrModel

In [54]:
def print_model_results(model, predictions):
    # Print the coefficients and intercept for linear regression
    print("\n")
    print("Coefficients: %s" % str(model.coefficients))
    print("Intercept: %s" % str(model.intercept))
    # Summarize the model over the training set and print out some metrics
    summary = model.summary
    print("numIterations: %d" % summary.totalIterations)
    print("objectiveHistory: %s" % str(summary.objectiveHistory))
    summary.residuals.show()
    print("Root Mean Squared Error (RMSE) on training data: %f" % summary.rootMeanSquaredError)
    print("r2: %f" % summary.r2)
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
    print("\n")

Modeling: linear regression, gradient boosted tree regression, random forest regression

In [55]:
def compute_linear_regression(outputlist):
    for i in outputlist:
        #preparing data
        new = sparkDF.select(list(i))
        data = new.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
        trainingData, testData = split_data(data, 0.72)
        #modeling
        lrModel = create_linear_regression(trainingData)
        predictions = lrModel.transform(testData)
        print 'Model Results for Features: ',i
        print_model_results(lrModel, predictions)
        #saving models
        #enter code here

In [56]:
compute_linear_regression(outputlist)

Model Results for Features:  ('accommodates', 'calculated_host_listings_count', 'price_y')


Coefficients: [0.6787470449767404,0.12695115878981109]
Intercept: -0.0026744128793
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|-0.28680884363733516|
| -0.2727277730615102|
|-0.25805999121169243|
| -0.2572385954281028|
| -0.2560065017527181|
| -0.2560065017527181|
|-0.24368556499887128|
| -0.2430605899461401|
|-0.22605978047600706|
|-0.22520415986810105|
|-0.22520415986810105|
|-0.22520415986810105|
|-0.21935693564593672|
|-0.21912925355196833|
|-0.19522321376707386|
|-0.19440181798348422|
|-0.19440181798348422|
|-0.19440181798348422|
|-0.19440181798348422|
|-0.19440181798348422|
+--------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE) on training data: 0.697193
r2: 0.518815
Root Mean Squared Error (RMSE) on test data = 0.722618


Model Results for Features:  ('accommodates', 'has"Cable TV"', 'price_y')


C

Root Mean Squared Error (RMSE) on test data = 0.687102


Model Results for Features:  ('calculated_host_listings_count', 'has"Cable TV"', 'distance from ocean', 'price_y')


Coefficients: [0.2559191843610393,0.200032855919787,-0.11065249838837653]
Intercept: 0.00767954931799
numIterations: 1
objectiveHistory: [0.0]
+-------------------+
|          residuals|
+-------------------+
|-1.1558786438333741|
| -0.895254448307939|
| -0.836111319210681|
|-0.8327941439307993|
|-0.8176729942783509|
|-0.8081522704231057|
|-0.8035319191404131|
|-0.8019918020461823|
|-0.8019918020461823|
|-0.7840704394951323|
|-0.7650289917846418|
|-0.7403871182769484|
|-0.7403871182769484|
|-0.7309961603852968|
|-0.7302663488005743|
|-0.7272497339068826|
|-0.7157452447692547|
|-0.6911033712615611|
|-0.6849429028846377|
|-0.6846711175150675|
+-------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE) on training data: 0.921187
r2: 0.157244
Root Mean Squared Error (RMSE) on test data = 0.907963


M

In [57]:
def compute_random_forest_regression(outputlist):
    for i in outputlist:
        #preparing data
        new = sparkDF.select(list(i))
        data = new.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
        trainingData, testData = split_data(data, 0.72)
        #modeling
        rfModel = create_linear_regression(trainingData)
        predictions = rfModel.transform(testData)
        print 'Model Results for Features: ',i
        print_model_results(rfModel, predictions)

In [58]:
compute_random_forest_regression(outputlist)

Model Results for Features:  ('accommodates', 'calculated_host_listings_count', 'price_y')


Coefficients: [0.6709838826934929,0.11213122735951804]
Intercept: 0.00843424675875
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|-0.22938069173691678|
| -0.3123542836921839|
|-0.28360543126654114|
| -0.2815519418075668|
| -0.2815519418075668|
|   -0.26923100505372|
| -0.2686060300009888|
| -0.2617959570126057|
|-0.26101704721782204|
|-0.25074959992294976|
|-0.25074959992294976|
|-0.25074959992294976|
|-0.25074959992294976|
|-0.24490237570078544|
|-0.24467469360681704|
|-0.24390463505970172|
| -0.2285719137660257|
|-0.21994725803833293|
|-0.21994725803833293|
|-0.21994725803833293|
+--------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE) on training data: 0.703072
r2: 0.501042
Root Mean Squared Error (RMSE) on test data = 0.707764


Model Results for Features:  ('accommodates', 'has"Cable TV"', 'price_y')


C

Root Mean Squared Error (RMSE) on test data = 0.722908


Model Results for Features:  ('calculated_host_listings_count', 'has"Cable TV"', 'distance from ocean', 'price_y')


Coefficients: [0.2308128891940059,0.20747951773655696,-0.11775901416452436]
Intercept: -0.021692871494
numIterations: 1
objectiveHistory: [0.0]
+-------------------+
|          residuals|
+-------------------+
|-1.1375913136399458|
|-0.8701767089725316|
|-0.8444481972927975|
|-0.8146469315194305|
|-0.7995257818669823|
|-0.7853847067290443|
|-0.7838445896348134|
|-0.7838445896348134|
|-0.7659232270837637|
|-0.7468817793732732|
|-0.7270174119538058|
|-0.7222399058655795|
|-0.7222399058655795|
| -0.712848947973928|
| -0.709102521495514|
|-0.6852770956040393|
|-0.6729561588501924|
|-0.6675821332447911|
| -0.666795690473269|
|-0.6665239051036989|
+-------------------+
only showing top 20 rows

Root Mean Squared Error (RMSE) on training data: 0.889157
r2: 0.155010
Root Mean Squared Error (RMSE) on test data = 0.985586




In [61]:
#Will see if going to do this, I need to edit the model evalaution method or create a new one
def compute_grad_boosted_regression(outputlist):
    for i in outputlist:
        #preparing data
        new = sparkDF.select(list(i))
        data = new.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
        trainingData, testData = split_data(data, 0.72)
        #modeling
        gbtModel = create_grad_boosted_regression(trainingData)
        predictions = gbtModel.transform(testData)
        print 'Model Results for Features: ',i
        #Model results will probably have to change
        print_model_results(gbtModel, predictions)

In [63]:
#compute_grad_boosted_regression(outputlist)

In [None]:
#Additional Modeling Techniques:
#Do lasso lars and or ridge for feature selection by just modifying lienar model parameter (see docs)

In [None]:
#Store Model Results in DB?
#Saving models not going to work - need to extract interesting fields from the evaluation method and store them somehow

In [None]:
#Plot Results of all models to see what works and what doesnt

In [None]:
#plot performance of the jobs or use spark UI (maybe)

In [64]:
spark.stop()