This notebook automates regression-analysis techniques in order to build a system that iteratively selects the best model.

In [1]:
import pandas as pd

In [2]:
def load_data(data_dir, file_name):
    """Read a CSV file from a directory into a pandas DataFrame.

    Parameters
    ----------
    data_dir : str or path-like
        Directory that contains the file. A trailing path separator is
        optional.
    file_name : str
        Name of the CSV file inside ``data_dir``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, as returned by ``pd.read_csv``.
    """
    # Local import keeps this cell self-contained; the notebook has no
    # top-level ``import os``.
    import os

    # os.path.join inserts the separator only when it is missing. Plain string
    # concatenation turned ("data", "x.csv") into the wrong path "datax.csv".
    return pd.read_csv(os.path.join(data_dir, file_name))

In [3]:
def select_initial_feature(df):
    """Pick the single feature that seeds the model search (not implemented yet).

    Planned behaviour, taken from the original design notes:
      * create one bivariate (single-feature) model per candidate feature;
      * select the feature with the highest univariate R2 and return its name.

    Parameters
    ----------
    df : presumably a pandas.DataFrame holding the candidate features
        (the intended schema is not defined yet -- TODO confirm).

    Raises
    ------
    NotImplementedError
        Always. The placeholder body used to return an undefined name and so
        failed with an accidental NameError; the failure is now explicit.
    """
    raise NotImplementedError(
        "select_initial_feature is a design stub and has not been implemented yet"
    )

In [4]:
def rank_all_features(df, initial_feature_name, method):
    """Rank every feature with a chosen method (not implemented yet).

    Planned behaviour, taken from the original design notes:
      * build a feature-ranking list with one of several methods
        (e.g. Shapiro, Lasso, Ridge);
      * force ``initial_feature_name`` into the first position of that list.

    Parameters
    ----------
    df : presumably a pandas.DataFrame holding the candidate features
        (TODO confirm once implemented).
    initial_feature_name : name of the feature that must be ranked first.
    method : identifier of the ranking method to apply.

    Raises
    ------
    NotImplementedError
        Always. The placeholder body used to return an undefined name and so
        failed with an accidental NameError; the failure is now explicit.
    """
    raise NotImplementedError(
        "rank_all_features is a design stub and has not been implemented yet"
    )

In [5]:
def select_next_best_feature(df, ranked_features_list, number_of_features):
    """Grow the model by the next best feature(s) (not implemented yet).

    Planned behaviour, taken from the original design notes:
      * create n models from the initial feature plus the n next-best
        features, where n comes from ``number_of_features``;
      * select the best of those models in terms of RMSE;
      * return the features used by that model together with the model type,
        as the pair ``(best_features, model_type)``.

    Parameters
    ----------
    df : presumably a pandas.DataFrame holding the candidate features
        (TODO confirm once implemented).
    ranked_features_list : ranked feature names, initial feature first.
    number_of_features : how many next-best features to try.

    Raises
    ------
    NotImplementedError
        Always. The placeholder body used to return undefined names and so
        failed with an accidental NameError; the failure is now explicit.
    """
    raise NotImplementedError(
        "select_next_best_feature is a design stub and has not been implemented yet"
    )

Building a Simple, Scalable Solution with Spark

In [6]:
# Initialise findspark before the pyspark imports in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable as `pyspark` in this environment -- confirm.
import findspark
findspark.init()

In [7]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.sql import SparkSession

In [8]:
# Obtain the Spark session used by the cells below (local master).
spark = (
    SparkSession.builder
    .master("local")
    .appName("Regression Scalability")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [9]:
#sc = spark.sparkContext

Apply the data transformations from the regressionDiagnostics notebook (these may still change).

In [9]:
import pandas as pd
import numpy as np
from itertools import combinations

In [12]:
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.model_selection import KFold,cross_val_score, cross_val_predict

In [11]:
# Load the augmented listings export.
listings = pd.read_csv('listings_augmented_2018-05-06_V1.csv', low_memory=False)

# Discard identifier, URL and other columns that are not used as features.
listings = listings.drop(columns=[
    'listing_id', 'id', 'scrape_id', 'host_id',
    'zipcode', 'has_availability', 'license',
    'Unnamed: 0', 'thumbnail_url', 'medium_url',
    'picture_url', 'xl_picture_url', 'host_url',
    'host_name', 'host_thumbnail_url', 'host_picture_url',
    'price_x', 'amenities', 'amenities_set', 'bed_type',
])

# Target: price, with missing values replaced by the mean price.
price = listings['price_y']
y = price.fillna(price.mean())

# NB: 'price_y' is deliberately kept among the features in this notebook
# because Spark's ML methods need the label inside the same DataFrame.
X = listings.copy()

# Numeric columns only; infinities become NaN and NaNs become column means.
X_num = X.select_dtypes(include=['int64', 'float64'])
X_num = X_num.replace([np.inf, -np.inf], np.nan)
X_num = X_num.fillna(X_num.mean())

# Remove every column whose name matches 'Topic', then standardise the rest.
X_1 = X_num.drop(columns=X_num.filter(regex='Topic').columns)
data_scaled = pd.DataFrame(preprocessing.scale(X_1), columns=X_1.columns)

In [20]:
"""colData = sc.parallelize(data_scaled.columns)
def rSubset(row):
     
    # return list of all subsets of length r
    # to deal with duplicate subsets use 
    # set(list(combinations(arr, r)))
    combos = []
    for i in range(len(data_scaled.columns)-1):
        combos.append(combinations(row, i))
    return combos

combos = []
for i in range(len(data_scaled.columns)):
    combos.append(list(combinations(data_scaled.columns, i)))

colData.map(rSubset).flatMap(lambda x: ''.join(x)).take(10)"""

In [13]:
# Cross-validated univariate R2 of every column in X_1 against the price target.
# Note: X_1 still contains 'price_y' itself, which therefore scores R2 = 1.0.
# The later Spark cell selects its columns from this ranking and needs
# 'price_y' to be present, so it is intentionally not excluded here.
R2s = []

# A single unfitted estimator is sufficient: cross_val_predict fits its own
# clone on every fold, so fitting on the full data beforehand was redundant.
# `normalize` is left at its default (the original passed False) because the
# keyword was removed from LinearRegression in scikit-learn 1.2.
linear_regression = linear_model.LinearRegression(fit_intercept=True)

for col in X_1.columns:
    # scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) reshape.
    single_feature = data_scaled[col].values.reshape(-1, 1)
    predictions = cross_val_predict(linear_regression, single_feature, y, cv=6)
    R2s.append((col, r2_score(y, predictions)))



In [16]:
# Rank the features by univariate R2, best first.
uni_r2 = (
    pd.DataFrame(R2s, columns=['col', 'R2'])
    .sort_values(by='R2', ascending=False)
)
# Keep only the features whose cross-validated R2 is positive.
has_positive_r2 = uni_r2['R2'] > 0
pos_uni_r2 = uni_r2[has_positive_r2]

In [84]:
pos_uni_r2

Unnamed: 0,col,R2
34,price_y,1.0
7,accommodates,0.482612
9,bedrooms,0.414702
10,beds,0.335947
8,bathrooms,0.287745
32,calculated_host_listings_count,0.081065
311,"has""Cable TV""",0.063401
16,guests_included,0.038562
294,"has""Family/Kid Friendly""",0.03829
303,"has""Pets live on this property""",0.026199


In [86]:
# Hand-picked candidate features for the subset search.
# 'price_y' is not in this list, so it does not need to be removed.
candidates = [
    'accommodates',
    'calculated_host_listings_count',
    'has"Cable TV"',
    'distance from ocean',
]

In [87]:
from itertools import chain, combinations
def all_subsets(ss):
    """Iterate over every subset of ``ss`` that has at least two elements.

    Subsets are produced as tuples, grouped by increasing size (2, 3, ...,
    len(ss)) and, within a size, in the order ``itertools.combinations``
    yields them. Inputs with fewer than two elements yield nothing.
    """
    subset_sizes = range(2, len(ss) + 1)
    per_size = [combinations(ss, size) for size in subset_sizes]
    return chain(*per_size)
# Materialise every candidate subset (size >= 2) as a list of tuples.
combos = list(all_subsets(candidates))

In [88]:
# Append the label column to every feature subset so that 'price_y' is always
# the last entry of each column selection.
outputlist = []
for feature_combo in combos:
    outputlist.append(feature_combo + ('price_y',))

In [89]:
# Hand Spark only the scaled columns that had a positive univariate R2,
# in ranked order.
ranked_columns = pos_uni_r2['col']
sparkDF = spark.createDataFrame(data_scaled[ranked_columns])

In [6]:
#Had loaded spark df from file - but prefer to do data manipulation in pandas and load ultimate df
#df = spark.read.format('csv').options(header='true', inferSchema='true').load("listings_augmented_2018-05-06_V1.csv")

In [90]:
sparkDF.printSchema()

root
 |-- price_y: double (nullable = true)
 |-- accommodates: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- calculated_host_listings_count: double (nullable = true)
 |-- has"Cable TV": double (nullable = true)
 |-- guests_included: double (nullable = true)
 |-- has"Family/Kid Friendly": double (nullable = true)
 |-- has"Pets live on this property": double (nullable = true)
 |-- distance from ocean: double (nullable = true)
 |-- has"Free Parking on Premises": double (nullable = true)
 |-- has"Indoor Fireplace": double (nullable = true)
 |-- reviews_per_month: double (nullable = true)
 |-- hasDryer: double (nullable = true)
 |-- number_of_reviews: double (nullable = true)
 |-- hasWasher: double (nullable = true)
 |-- hasDog(s): double (nullable = true)
 |-- hasTV: double (nullable = true)
 |-- has"Lock on Bedroom Door": double (nullable = true)



In [102]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
spark.version

u'2.3.0'

In [101]:
def create_linear_model(data, maxIter=10, regParam=0.0, elasticNetParam=0.0,
                        fitIntercept=True):
    """Fit a Spark ML linear regression on ``data`` and return the fitted model.

    The defaults reproduce the previously hard-coded configuration (no
    regularisation, intercept fitted), so ``create_linear_model(data)`` behaves
    as before. The keyword arguments let callers try regularised variants
    without editing this function.

    Parameters
    ----------
    data : Spark DataFrame to train on. The caller in this notebook builds it
        with a "features" vector column and a "label" column.
    maxIter : maximum number of iterations passed to ``LinearRegression``.
    regParam : regularisation strength passed to ``LinearRegression``.
    elasticNetParam : elastic-net mixing parameter passed to ``LinearRegression``.
    fitIntercept : whether the model should fit an intercept term.

    Returns
    -------
    The fitted model returned by ``LinearRegression.fit``.
    """
    estimator = LinearRegression(
        maxIter=maxIter,
        regParam=regParam,
        elasticNetParam=elasticNetParam,
        fitIntercept=fitIntercept,
    )
    return estimator.fit(data)

In [103]:
def print_model_results(lrModel):
    """Print the fitted parameters and training-set metrics of ``lrModel``.

    Output, in order: coefficients, intercept, iteration count, objective
    history, the residuals table (via its ``show()`` method), RMSE, R2 and a
    trailing blank separator.
    """
    # Fitted parameters of the linear model.
    coefficient_text = str(lrModel.coefficients)
    print("Coefficients: %s" % coefficient_text)
    intercept_text = str(lrModel.intercept)
    print("Intercept: %s" % intercept_text)

    # Metrics come from the summary computed over the training set.
    training_summary = lrModel.summary
    print("numIterations: %d" % training_summary.totalIterations)
    history_text = str(training_summary.objectiveHistory)
    print("objectiveHistory: %s" % history_text)
    training_summary.residuals.show()
    print("RMSE: %f" % training_summary.rootMeanSquaredError)
    print("r2: %f" % training_summary.r2)
    # Separator between consecutive model reports.
    print("\n")

In [104]:
# Fit and report one linear model per candidate feature subset.
for feature_combo in outputlist:
    column_names = list(feature_combo)
    subset_df = sparkDF.select(column_names)
    # Every tuple in outputlist ends with 'price_y', so the last field of each
    # row is the label and all preceding fields form the feature vector.
    data = subset_df.rdd.map(
        lambda row: (Vectors.dense(row[0:-1]), row[-1])
    ).toDF(["features", "label"])
    lrModel = create_linear_model(data)
    # Identify the subset being reported; without this line the printed
    # coefficient/RMSE blocks cannot be matched to their features.
    print("Features: %s" % ", ".join(column_names[:-1]))
    print_model_results(lrModel)

Coefficients: [0.6645713864266101,0.12671032275032956]
Intercept: 1.67385352653e-15
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
|-0.39347095077311456|
|-0.32411329157601065|
| -0.4041749325018916|
|-0.24621588408601003|
|-0.24782817952245406|
|-0.14864472274712548|
| 0.22098337986827815|
|-0.14864472274712548|
| -1.0764681980723239|
|-0.00992940435291...|
|-0.20699104849209626|
|-0.38404905796134936|
| -0.4662600628053212|
| -0.6292266699380213|
| -0.7802606840092959|
|  0.3201096482035728|
|-0.19515950072195185|
|-0.09245620044751757|
| 0.44544234341763517|
| -0.3200063126580618|
+--------------------+
only showing top 20 rows

RMSE: 0.704175
r2: 0.504138


Coefficients: [0.6742644351544689,0.08580014138436973]
Intercept: 1.27210657941e-15
numIterations: 1
objectiveHistory: [0.0]
+--------------------+
|           residuals|
+--------------------+
| -0.3368362036343381|
| -0.2752315198651042|
| -0.5337166814830285|
|  -

In [105]:
spark.stop()