# Run just the linear regression, as a benchmark

We will run only the linear regression against each of the pre-processed data sets that were created.

### Let's take care of the imports/functions to get running...

In [25]:

import os,sys,time,random,math,time
import tarfile, zipfile

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit

import itertools

import matplotlib.pyplot as plt
from IPython.display import display, Image

from subprocess import check_output
# Directory prefixes used by the rest of the notebook. Each one ends with a
# slash because paths are built from them by plain string concatenation.
datadir = "./input/"
cachedir = "./cache/"
outdir = './output/'

# List the contents of each directory (input, cache, output - in that order).
for _listing_dir in (datadir, cachedir, outdir):
    print(check_output(["ls", _listing_dir]).decode("utf8"))
del _listing_dir


%matplotlib inline  

data_cats.csv
data_conts.csv
data_new.csv
data_orig_only.csv
test.csv
test.csv.zip
test_data_cats.csv
test_data_conts.csv
test_data_data_new.csv
test_data_orig_only.csv
train.csv
train.csv.zip

clusters_cat.npy
clusters_cat.npy_01.npy
clusters_cat.npy_02.npy
clusters_cont.npy
clusters_cont.npy_01.npy
clusters_cont.npy_02.npy
clusters.npy
clusters.npy_01.npy
clusters.npy_02.npy
grid_L2_KNN.pkl
grid_L2_Lin.pkl
grid_regr0.pkl
grid_regr1.pkl
grid_regr2.pkl
grid_regr3.pkl
MAE_tracking.npy
oldmodels
x_layer2.npy
x_layer2.npy_01.npy
x_layer2_w_clusters.npy
x_layer2_w_clusters.npy_01.npy

result_submission_orig_only_linear.csv
result_submission_stack_linear.csv
result_submission_stack_xgb.csv



In [26]:
def loadData(datadir,filename):
    """Read a CSV file (plain or zipped) from ``datadir`` into a DataFrame.

    Parameters
    ----------
    datadir : str
        Directory prefix. It is concatenated directly with ``filename``, so it
        must already end with a path separator.
    filename : str
        Name of the file inside ``datadir``. When the file is a zip archive,
        the member that is read is ``filename`` with its last four characters
        removed (i.e. the ``.zip`` suffix stripped).

    Returns
    -------
    pandas.DataFrame on success. On any failure the error is printed (not
    raised) and the empty string ``''`` is returned, so callers that need to
    distinguish failure must check the result before using it.
    """
    data = ''
    path = datadir + filename
    print ("loading: "+path)
    try:
        if zipfile.is_zipfile(path):
            # Context managers make sure both the archive and the member
            # handle are closed once the CSV has been parsed.
            with zipfile.ZipFile(path) as archive:
                with archive.open(filename[:-4]) as member:
                    data = pd.read_csv(member, parse_dates=True)
        else:
            data = pd.read_csv(path, parse_dates=True)
        print ("Dataset has {} samples with {} features each.".format(*data.shape))
    except Exception as e:
        # Best-effort loader: report the problem and fall through to return ''.
        print ("Dataset could not be loaded. Is the dataset missing?")
        print(e)
    return data

def writeData(data,filename):
    """Write ``data`` to ``filename`` as CSV and return the first lines written.

    Parameters
    ----------
    data : object with a ``to_csv`` method (e.g. a pandas DataFrame)
        Written without the index column.
    filename : str
        Destination path of the CSV file.

    Returns
    -------
    list of str
        Up to the first five lines of the file as read back from disk (header
        included, line endings preserved). An empty list is returned when the
        file cannot be read back.

    A failed write is reported on stdout and a failed read-back on stderr;
    neither is raised.
    """
    try:
        data.to_csv(filename, index=False)
    except Exception as e:
        print ("Dataset could not be written.")
        print(e)
    try:
        with open(filename, 'r') as f:
            # Only the head of the file is needed for verification, so stop
            # after five lines instead of reading the whole file.
            return list(itertools.islice(f, 5))
    except IOError as e:
        sys.stderr.write("Could not read back {}: {}\n".format(filename, e))
        return []
        

In [27]:
def grid_search_wrapper(x,y,regr,param,regr_name='BLANK'):
    """Tune ``regr`` with a grid search, caching the result on disk.

    Parameters
    ----------
    x, y :
        Training features and targets, passed straight to ``GridSearchCV.fit``.
    regr :
        The estimator to tune. Its parameters are updated in place on a cache
        miss.
    param :
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    regr_name : str
        Used to build the cache file name ``grid_<regr_name>.pkl`` inside
        ``cachedir``.

    Returns
    -------
    On a cache hit, whatever object was stored in the cache file. On a cache
    miss, ``regr`` itself with the best parameters found applied; note that
    this is ``regr`` after ``set_params``, not ``grid_search.best_estimator_``.
    NOTE(review): presumably callers fit the returned estimator themselves -
    confirm.

    NOTE(review): ``joblib``, ``GridSearchCV``, ``make_scorer`` and
    ``mean_absolute_error`` are not imported in the notebook's import cell as
    visible here - confirm they are in scope before calling this function.
    """
    start_time = time.time()
    print("In:{}".format(regr))
    filename= 'grid_{}.pkl'.format(regr_name)
    cache_path = cachedir+filename
    if os.path.isfile(cache_path):
        # Single formatted string so the message prints the same on Python 2
        # and Python 3 (the old print statement was a SyntaxError on Python 3).
        print("{}  exists, importing ".format(filename))
        # joblib.load unpickles the file: only load caches this notebook wrote.
        return joblib.load(cache_path)

    print("{} not present, running a gridsearch".format(filename))
    # Search the parameter grid for the lowest mean absolute error
    # (greater_is_better=False makes the scorer treat lower MAE as better).
    grid_search = GridSearchCV(regr,
                               param_grid= param,
                               n_jobs= -1,
                               scoring=make_scorer(mean_absolute_error,greater_is_better=False))
    grid_search.fit(x,y)
    # Apply the best parameters found to the caller's estimator.
    regr.set_params(**grid_search.best_params_)
    print("run time:{}s".format(round((time.time()-start_time), 3) ))
    joblib.dump(regr,cache_path)
    return regr

In [37]:
def prepdata(data_name,verbose=False):
    """Load one pre-processed train/test pair and build the training arrays.

    Parameters
    ----------
    data_name : str
        Suffix of the data set; the files read from ``datadir`` are
        ``data_<data_name>.csv`` and ``test_data_<data_name>.csv``.
    verbose : bool
        When true, show ``info()`` and the first two rows of each frame.

    Returns
    -------
    x : numpy.ndarray
        Training frame values with the ``id`` and ``loss`` columns removed.
    y : numpy.ndarray
        ``log(loss + 200)`` for every training row.
    test_data :
        The test frame exactly as returned by ``loadData``.
    """
    ### and now, let's import the data
    data = loadData(datadir,'data_'+data_name+'.csv')
    if verbose:
        # info() prints its report itself and returns None, so it must not be
        # wrapped in display() (that would render a stray "None").
        data.info()
        display(data.head(2))

    test_data = loadData(datadir,'test_data_'+data_name+'.csv')
    if verbose:
        test_data.info()
        display(test_data.head(2))

    # we don't want the ID column (or the label) in X; axis is passed by
    # keyword because newer pandas no longer accepts it positionally
    x = data.drop(['id','loss'], axis=1).values

    # loss is our label, trained on in log space after a fixed shift
    shift = 200
    y = np.log(data['loss'].values + shift)

    return x,y,test_data




In [39]:
  def predict_for_dataset(data_name):
    """Fit the linear-regression benchmark on one data set and write a submission.

    The data set is loaded with ``prepdata``, split 80/20 into train and
    validation parts (fixed ``random_state``), a ``LinearRegression`` is fitted
    on the log-shifted loss, the validation MAE is reported in the original
    loss scale, and predictions for the test frame are written to
    ``outdir + 'result_submission_<data_name>_linear.csv'``.

    Parameters
    ----------
    data_name : str
        Data set suffix, forwarded to ``prepdata``.

    Returns
    -------
    list
        ``["run:<data_name>", MAE, fit_time, predict_time, final_predict_time]``
        where the three times are durations in seconds.
    """
    # Offset added to the loss before the log transform; it has to match the
    # shift used in prepdata so that predictions are mapped back correctly.
    shift = 200

    #for the current data, load and prep
    x,y,test_data=prepdata(data_name)

    #  train/validation split
    X_train, X_validation, y_train, y_validation = train_test_split(
        x, y, test_size=0.20, random_state=42)
    display("sample train data size:{}".format(len(y_train)))

    #set up our regression
    estimator=LinearRegression(n_jobs=-1)

    #train the estimator
    start_time = time.time()
    estimator.fit(X_train,y_train)
    fit_time=time.time()-start_time

    #test on the validation set
    start_time = time.time()
    curr_predict=np.asarray(estimator.predict(X_validation))
    predict_time=time.time()-start_time

    #track the run info: exp() undoes the log, and the shift cancels out in
    #the difference, so this is the MAE in the original loss scale
    MAE=np.mean(np.abs(np.exp(curr_predict) - np.exp(y_validation)))

    #show some stats on that last regressions run
    print("\nfit time:{}s".format(round(fit_time, 3) ))
    print("Mean abs error: {:.2f}".format(MAE))
    print("predict time:{}s".format(round(predict_time, 3) ))

    #the Final Prediction on the test data
    # The test frame may or may not carry a 'loss' column, so only drop the
    # non-feature columns that are actually present.
    non_feature_cols=[c for c in ('loss','id') if c in test_data.columns]
    x_test_data=test_data.drop(non_feature_cols, axis=1).values

    start_time = time.time()
    test_data['loss']=np.exp(estimator.predict(x_test_data))-shift
    final_predict_time=time.time()-start_time
    del x_test_data

    result=test_data[['id','loss']]
    output_fname='result_submission_'+data_name+'_linear.csv'
    print("\nFinal predict time:{}\nfinal sample".format(final_predict_time))
    display(writeData(result,outdir+output_fname))
    # Report the fit duration (previously the raw start timestamp of the final
    # prediction was returned in this slot by mistake).
    return ["run:{}".format(data_name),MAE,fit_time,predict_time,final_predict_time]



In [None]:
# Run the benchmark on the 'orig_only' data set and record its stats row.
data_name = 'orig_only'
stat_tracking = list()
stat_tracking += [predict_for_dataset(data_name)]


loading: ./input/data_orig_only.csv
Dataset has 188318 samples with 132 features each.
loading: ./input/test_data_orig_only.csv


In [None]:
### TODO: run predict_for_dataset for each data set, and plot the times, etc.

### EOL