Meredith Synnott


In [0]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the cells visible in this notebook read the dataset from DATA_URL
# rather than from the mounted drive, so this mount may be unnecessary — confirm
# before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Read the CSV located at DATA_URL into the DataFrame `df`.
# Note: nothing is previewed in this cell; end the cell with `df.head()` if a
# look at the first rows is wanted.
DATA_URL = 'http://128.138.93.164/aprd6342/data/finalmaster-ratios.csv'
df = pd.read_csv(DATA_URL)

In [0]:
# Every column name in the dataset, in file order.
allvariablenames = list(df.columns)

# The first 8 columns aren't valid predictors, so the candidate list starts after them.
listofallpredictors = allvariablenames[8:]

# Feature matrix (all candidate predictors) and the response column.
predictors = df.loc[:, listofallpredictors]
target = df.loc[:, '# Purchases']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=123,
)

# Fit a 10-fold cross-validated LassoLars model on the training split
# (precompute=False: do not use a precomputed Gram matrix).
model = LassoLarsCV(cv=10, precompute=False).fit(pred_train, tar_train)

# Two-column table pairing each predictor name ('label') with its fitted
# regression coefficient ('coeff'), in the same order as listofallpredictors.
predictors_model = pd.DataFrame({
    'label': listofallpredictors,
    'coeff': model.coef_,
})

# Print the [label, coeff] pair of every predictor whose coefficient is strictly positive.
positive_rows = predictors_model.loc[predictors_model['coeff'] > 0]
for pair in positive_rows.values:
    print(pair)



['B01001014' 0.8558761066941788]
['B01001036' 2.5053482381631653]
['B01001037' 0.8892493223320962]
['B01001038' 1.5316387928880384]
['B02001005' 0.41252295298457853]
['B13014026' 0.48004105312075906]
['B13014027' 0.6978957445987839]
['B13016001' 875149895.329212]
['B19001017' 1.4834348068681533]


In [0]:
# Question 2:
# ['B01001014' 0.8557908775529921] Males aged 40 to 44 Years.
# For each additional male aged 40 to 44, the model predicts 0.8557908775529921 more Bobo Bars sold.

# ['B01001036' 2.505392496591849] Females aged 30 to 34 Years.
# For each additional female aged 30 to 34, the model predicts 2.505392496591849 more Bobo Bars sold.
        
# ['B01001037' 0.8894214357013622] Females aged 35 to 39 Years.
# For each additional female aged 35 to 39, the model predicts 0.8894214357013622 more Bobo Bars sold.

# ['B01001038' 1.5315839680821497] Females aged 40 to 44 Years.
# For each additional female aged 40 to 44, the model predicts 1.5315839680821497 more Bobo Bars sold.
        
# ['B02001005' 0.4125408937426837] Asian Alone
# For each additional person in the Asian Alone category, the model predicts 0.4125408937426837 more Bobo Bars sold.
        
# ['B13014026' 0.4800240326923769] Women 15 to 50 Years Who Had a Birth in the Past 12 Months with Bachelor's Degree
# For each additional woman aged 15 to 50 who had a birth in the past 12 months and holds a Bachelor's Degree, the model predicts 0.4800240326923769 more Bobo Bars sold.

# ['B13014027' 0.6977454940063235] Women 15 to 50 Years Who Had a Birth in the Past 12 Months with Graduate or Professional Degree
# For each additional woman aged 15 to 50 who had a birth in the past 12 months and holds a Graduate or Professional Degree, the model predicts 0.6977454940063235 more Bobo Bars sold.

# ['B13016001' 874922971.7249781] Women 15 to 50 Years Who Had a Birth in the Past 12 Months
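# Taken literally, for each additional woman aged 15 to 50 who had a birth in the past 12 months, the model predicts 874922971.7249781 more Bobo Bars sold.
# NOTE: this coefficient is hundreds of millions of times larger than every other one, which is implausible as a real effect; it likely reflects a scaling or collinearity artifact in this predictor and should be verified before it is relied on.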

# ['B19001017' 1.4834465563617387] Household with income $200,000 or More.
# For each additional household with income of $200,000 or more, the model predicts 1.4834465563617387 more Bobo Bars sold.


In [0]:
# Question 3:
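# If I had to report to my boss only the two census variables that most steeply predict sales, the first would be
# Women 15 to 50 Years Who Had a Birth in the Past 12 Months and the second would be Females aged 30 to 34 Years.
# Caveat: the first variable's coefficient is hundreds of millions of times larger than any other, so it should be
# checked for a scaling or collinearity artifact before being reported.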

In [0]:
# Mean squared error of the fitted model on each split.
# In-sample (training) error:
train_predictions = model.predict(pred_train)
train_error = mean_squared_error(tar_train, train_predictions)
print('training data MSE')
print(train_error)

# Held-out (testing) error:
test_predictions = model.predict(pred_test)
test_error = mean_squared_error(tar_test, test_predictions)
print('testing data MSE')
print(test_error)

training data MSE
22025.491066757
testing data MSE
41549.54803776253


In [0]:
# Question 4:    
# The MSE for training data = 22025.312777378716 and testing data = 41549.12573000182
# The mean squared error measures how close the fitted line is to the data points. The smaller the MSE, the closer the fit is to the data.
# Since the model was fit on the training data, the training MSE is smaller, which means the model fits the training data better than the testing data.

In [0]:
# R-squared of the fitted model, as returned by model.score, on each split.
rsquared_train = model.score(pred_train, tar_train)
print('training data R-square', rsquared_train, sep='\n')

rsquared_test = model.score(pred_test, tar_test)
print('testing data R-square', rsquared_test, sep='\n')

# The R-squared for the training data is 0.24002827375880997 and the R-squared for the testing data is
# 0.17587122769388464. Comparing the two values shows that the model fits the training
# data better than the testing data, since the training R-squared is larger.

# Question 5: Census data is not a good predictor of sales amount. Even though the training data shows
# a better fit than the testing data, the MSE for the training data set is still large and
# the R-squared value of the training data set is too small.
# For both the training data set and the testing data set, the MSE and R-squared values are not good enough to make reliable predictions about sales.

training data R-square
0.2400221219784492
testing data R-square
0.1758628512005107


In [0]:
# Show the fitted model's intercept under its heading.
print("y interecept:", model.intercept_, sep="\n")
# Question 6: The intercept tells us that the baseline sales number = 22.194697684317433. This means that when the value of every
# variable is 0 (no 40-to-44-year-old women or anything else), the model predicts that 22.194697684317433 Bobo Bars will be sold.

y interecept:
22.19738813257551
