# LASSO Regression for Machine Learning
## Lynn Mast
### Created for Digital Advertising APRD6342-001
#### The goal of this exercise is to identify the best predictors of sales from a large set of candidate variables.

In [2]:
# Libraries
import pandas as pd

# ``train_test_split`` lives in ``sklearn.model_selection`` since scikit-learn
# 0.18; the old ``sklearn.cross_validation`` module was removed in 0.20.
# Try the current location first and fall back for legacy installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
import matplotlib.pyplot as plt

# Silence warnings by swapping the module-level ``warnings.warn`` for a no-op.
# Only calls that look the function up as ``warnings.warn`` at call time are
# affected; code that bound the original function earlier (for example via
# ``from warnings import warn``) keeps the original.
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""


warnings.warn = warn

# Read the full data set (the CSV is available in the GitHub repository).
alldata = pd.read_csv('finalmaster-ratios.csv')

# Candidate predictor names: every column label after dropping the leading
# seven (indices 0-6), which are treated as non-predictors.
# NOTE(review): this step was previously described as removing the first 8
# columns, but the slice removes 7 -- confirm which count matches the CSV.
allvariablenames = list(alldata.columns.values)[7:]

# Predictor matrix and the response column.
predictors = alldata[allvariablenames]
target = alldata['# Purchases']

# Hold out 30% of the rows for testing; the fixed seed makes the random
# split reproducible.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=123,
)

# Fit a LASSO model (LARS algorithm) with 10-fold cross-validation on the
# training split.
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# Pair each predictor name with its fitted coefficient. The column order is
# pinned explicitly so that 'label' always comes before 'coeff'.
predictors_model = pd.DataFrame(
    {'label': allvariablenames, 'coeff': model.coef_},
    columns=['label', 'coeff'],
)

# Print the [label, coefficient] pair of every predictor whose coefficient is
# strictly positive.
# NOTE(review): LASSO retains a predictor whenever its coefficient is non-zero;
# this filter shows only the positive ones, so negatively-weighted predictors
# are not printed -- confirm that is intended.
for index, row in predictors_model[predictors_model['coeff'] > 0].iterrows():
    print(row.values)

['B01001014' 0.7975260265569238]
['B01001036' 2.4975987120275827]
['B01001037' 1.5798278110427166]
['B01001038' 1.6358596519844413]
['B02001005' 0.49093615792653633]
['B13014016' 0.009433958279257113]
['B13014026' 0.4955798244683253]
['B13014027' 0.33105609299067307]
['B15002015' 0.04187704921035053]
['B15002027' 0.9363990047138775]
['B19001017' 1.4673678524497002]
