Notebook to automate regression analysis techniques in order to create a system that selects the best model iteratively

In [1]:
import pandas as pd

In [2]:
def load_data(data_dir, file_name):
    return pd.read_csv(data_dir + file_name)

In [3]:
def select_initial_feature(df):
    #create bivariate models using best R2
    #select best feature based on highest univar R2
    return initial_feature_name

In [4]:
def rank_all_features(df, initial_feature_name, method):
    #create feature ranking list with one of multiple methods e.g. Shapiro, Lasso, Ridge, etc.
    #force the initial_feature_name into position 1 in list
    return ranked_features_list

In [5]:
def select_next_best_feature(df, ranked_features_list, number_of_features):
    #create n models with initial feature and n next best features based on parameter 'number of features'
    #select the best model in terms of RMSE
    #return the features used by best model, and model type
    return best_features, model_type

Building Simple Scalable Solution with Spark

In [21]:
import findspark
findspark.init()

In [22]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.sql import SparkSession

In [23]:
spark = SparkSession.builder.master("local").appName("Regression Scalability")\
    .config("spark.some.config.option", "some-value")\
    .getOrCreate()

In [24]:
sc = spark.sparkContext

Applying data transformations from regressionDiagnostics nb - these might end up changing

In [43]:
import pandas as pd
import numpy as np
from itertools import combinations

In [26]:
from sklearn import preprocessing

In [27]:
listings = pd.read_csv('listings_augmented_2018-05-06_V1.csv',low_memory=False)

listings = listings.drop(columns=['listing_id','id','scrape_id','host_id',
                                  'zipcode','has_availability','license',
                                  'Unnamed: 0', 'thumbnail_url', 'medium_url',
                                  'picture_url', 'xl_picture_url', 'host_url',
                                  'host_name', 'host_thumbnail_url', 'host_picture_url',
                                  'price_x', 'amenities', 'amenities_set', 'bed_type'
                                 ])

y = listings['price_y'].fillna(listings['price_y'].mean())

X = listings.copy()
#NB: not dropping the price in this nb because it is needed by Spark's ML methods
#X = X.drop(columns='price_y')

X_num = X.select_dtypes(include=['int64','float64'])
X_num = X_num.replace([np.inf, -np.inf], np.nan)
X_num = X_num.fillna(X_num.mean())
X_1 = X_num[X_num.columns.drop(X_num[list(X_num.filter(regex='Topic'))])]
data_scaled = pd.DataFrame(preprocessing.scale(X_1),columns = X_1.columns) 

In [30]:
colData = sc.parallelize(data_scaled.columns)

In [56]:
def rSubset(row):
     
    # return list of all subsets of length r
    # to deal with duplicate subsets use 
    # set(list(combinations(arr, r)))
    combos = []
    for i in range(len(data_scaled)-1):
        combos.append(list(combinations(row, i)))
    return combos

In [58]:
colData.map(rSubset).map(lambda x: x).take(4)

KeyboardInterrupt: 

In [59]:
sc.stop()

In [12]:
sparkDF = spark.createDataFrame(data_scaled)

In [60]:
spark.stop()

In [13]:
spark.parallelize(data_scaled.columns)

AttributeError: 'SparkSession' object has no attribute 'parallelize'

In [6]:
#Had loaded spark df from file - but prefer to do data manipulation in pandas and load ultimate df
#df = spark.read.format('csv').options(header='true', inferSchema='true').load("listings_augmented_2018-05-06_V1.csv")

In [18]:
#sparkDF.printSchema()

In [32]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

In [26]:
spark.version

u'2.3.0'

In [59]:
lr = LinearRegression(maxIter=10, regParam=0, elasticNetParam=0, fitIntercept=True)

In [43]:
data=sparkDF.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])

In [60]:
#lr.setFitIntercept(True)
lrModel = lr.fit(data)

In [61]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.0,0.0,-0.01025100144913379,-0.01025100144913379,0.0,-0.036342098352685176,0.05188618482296399,-0.041180604772694536,-0.0015449116526528792,0.029802264266534984,-0.01911272062653588,0.00794163501707124,0.0,0.0,0.0,0.0,0.0525472100196558,0.0,-0.013721210548706482,0.007090436182805309,0.04592967563966534,0.024952434634345842,-0.004879163416283691,-0.036614368260247596,-0.03244949374408551,0.006443624308387956,0.01622069747733493,0.03566345899732449,0.017897773134429074,0.0013042626976296112,-0.11413696688629542,0.042237831855784655,-0.08291772875319363,-0.005450675117609569,-0.09905914936818977,-0.008361658460004378,0.0050824410594768644,-0.0028965147280422252,-0.017114198746592695,-0.029193162928453028,-0.0029927522573441026,-0.022765772108482102,0.044546769802219416,0.019675542796952557,-0.018808668348632877,-0.013913001062699482,0.02401102789920411,-0.011755218148775705,0.004304520217713648,-0.01192522531513526,-0.016324236167148624,0.011019274243812183,0.0137197602989

In [None]:
n = len(data_scaled)-1
for i in range(n)
    data=sparkDF.rdd.map(lambda x:(Vectors.dense(x[0:n]), x[-1])).toDF(["features", "label"])

In [64]:
lst = ['col1', 'col2', 'col3', 'col4']

In [65]:
from itertools import combinations

In [66]:
def rSubset(arr, r):
 
    # return list of all subsets of length r
    # to deal with duplicate subsets use 
    # set(list(combinations(arr, r)))
    return list(combinations(arr, r))

In [None]:
for i in range(len(data_scaled)-1):
    print rSubset(data_scaled.columns, i)

In [None]:
import itertools

a = sc.parallelize([
    (1, [1,2,3,4]),
    (2, [3,4,5,6]),
    (3, [-1,2,3,4])
  ])

def combinations(row):
  l = row[1]
  k = row[0]
  return [(k, v) for v in itertools.combinations(l, 2)]

a.map(combinations).flatMap(lambda x: x).take(3)