## DA 420 - Project 3
## Matt Graham

#### Imports

In [4]:
# Traditional Conjoint Analysis (Python)

# prepare for Python version 3x features and functions
from __future__ import division, print_function

# import packages for analysis and modeling
import pandas as pd  # data frame operations
import numpy as np  # arrays and math functions
from numpy import unique
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # R-like model specification
from patsy.contrasts import Sum
from scipy.stats import uniform  # for training-and-test split
import matplotlib.pyplot as plt  # 2D plotting
import seaborn as sns

### Part 1

#### Get an clean data

In [5]:
# read in conjoint survey profiles with respondent ranks
conjoint_data_frame = pd.read_csv('mobile.csv')

conjoint_attributes = ['brand', 'startup', 'monthly', 'service',
    'retail', 'apple', 'samsung', 'google']

conjoint_data_frame

Unnamed: 0,brand,startup,monthly,service,retail,apple,samsung,google,ranking
0,AT&T,$100,$100,4G NO,Retail NO,APPLE NO,Samsung NO,Nexus NO,11
1,Verizon,$300,$100,4G NO,Retail YES,APPLE YES,Samsung YES,Nexus NO,12
2,US Cellular,$400,$200,4G NO,Retail NO,APPLE NO,Samsung YES,Nexus NO,9
3,Verizon,$400,$400,4G YES,Retail YES,APPLE NO,Samsung NO,Nexus NO,2
4,Verizon,$200,$300,4G NO,Retail NO,APPLE NO,Samsung YES,Nexus YES,8
5,Verizon,$100,$200,4G YES,Retail NO,APPLE YES,Samsung NO,Nexus YES,13
6,US Cellular,$300,$300,4G YES,Retail NO,APPLE YES,Samsung NO,Nexus NO,7
7,AT&T,$400,$300,4G NO,Retail YES,APPLE YES,Samsung NO,Nexus YES,4
8,AT&T,$200,$400,4G YES,Retail NO,APPLE YES,Samsung YES,Nexus NO,5
9,T-mobile,$400,$100,4G YES,Retail NO,APPLE YES,Samsung YES,Nexus YES,16


In [6]:
conjoint_data_frame['new_rank'] = np.random.permutation(conjoint_data_frame['ranking'].values)
conjoint_data_frame

Unnamed: 0,brand,startup,monthly,service,retail,apple,samsung,google,ranking,new_rank
0,AT&T,$100,$100,4G NO,Retail NO,APPLE NO,Samsung NO,Nexus NO,11,12
1,Verizon,$300,$100,4G NO,Retail YES,APPLE YES,Samsung YES,Nexus NO,12,8
2,US Cellular,$400,$200,4G NO,Retail NO,APPLE NO,Samsung YES,Nexus NO,9,15
3,Verizon,$400,$400,4G YES,Retail YES,APPLE NO,Samsung NO,Nexus NO,2,4
4,Verizon,$200,$300,4G NO,Retail NO,APPLE NO,Samsung YES,Nexus YES,8,6
5,Verizon,$100,$200,4G YES,Retail NO,APPLE YES,Samsung NO,Nexus YES,13,7
6,US Cellular,$300,$300,4G YES,Retail NO,APPLE YES,Samsung NO,Nexus NO,7,1
7,AT&T,$400,$300,4G NO,Retail YES,APPLE YES,Samsung NO,Nexus YES,4,11
8,AT&T,$200,$400,4G YES,Retail NO,APPLE YES,Samsung YES,Nexus NO,5,3
9,T-mobile,$400,$100,4G YES,Retail NO,APPLE YES,Samsung YES,Nexus YES,16,10


#### Sum contrasts

In [7]:
# set up sum contrasts for effects coding as needed for conjoint analysis
# using C(effect, Sum) notation within main effects model specification
main_effects_model = 'new_rank ~ C(brand, Sum) + C(startup, Sum) +  \
    C(monthly, Sum) + C(service, Sum) + C(retail, Sum) + C(apple, Sum) + \
    C(samsung, Sum) + C(google, Sum)'

# fit linear regression model using main effects only (no interaction terms)
main_effects_model_fit = smf.ols(main_effects_model, data = conjoint_data_frame).fit()
main_effects_model_fit.summary()



0,1,2,3
Dep. Variable:,new_rank,R-squared:,0.993
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,10.72
Date:,"Fri, 27 Jan 2023",Prob (F-statistic):,0.235
Time:,16:26:15,Log-Likelihood:,-7.0097
No. Observations:,16,AIC:,44.02
Df Residuals:,1,BIC:,55.61
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.5000,0.375,22.667,0.028,3.735,13.265
"C(brand, Sum)[S.AT&T]",2.0000,0.650,3.079,0.200,-6.253,10.253
"C(brand, Sum)[S.T-mobile]",3.0000,0.650,4.619,0.136,-5.253,11.253
"C(brand, Sum)[S.US Cellular]",-2.7500,0.650,-4.234,0.148,-11.003,5.503
"C(startup, Sum)[S.$100 ]",-2.22e-15,0.650,-3.42e-15,1.000,-8.253,8.253
"C(startup, Sum)[S.$200 ]",-1.5000,0.650,-2.309,0.260,-9.753,6.753
"C(startup, Sum)[S.$300 ]",4.663e-15,0.650,7.18e-15,1.000,-8.253,8.253
"C(monthly, Sum)[S.$100 ]",0.2500,0.650,0.385,0.766,-8.003,8.503
"C(monthly, Sum)[S.$200 ]",4.5000,0.650,6.928,0.091,-3.753,12.753

0,1,2,3
Omnibus:,29.718,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2.667
Skew:,0.0,Prob(JB):,0.264
Kurtosis:,1.0,Cond. No.,2.0


#### Indexing parameter coefficients

In [8]:
# build part-worth information one attribute at a time
level_name = []
part_worth = []
part_worth_range = []
end = 1  # initialize index for coefficient in params
for item in conjoint_attributes:
    nlevels = len(list(unique(conjoint_data_frame[item])))
    level_name.append(list(unique(conjoint_data_frame[item]))) 
    begin = end 
    end = begin + nlevels - 1
    new_part_worth = list(main_effects_model_fit.params[begin:end])
    new_part_worth.append((-1) * sum(new_part_worth))  
    part_worth_range.append(max(new_part_worth) - min(new_part_worth))  
    part_worth.append(new_part_worth)   
    # end set to begin next iteration

pd.DataFrame(part_worth_range)

Unnamed: 0,0
0,5.75
1,3.0
2,8.5
3,2.25
4,1.25
5,3.0
6,1.25
7,0.5


#### Compute and report relative importance

In [9]:
# compute attribute relative importance values from ranges
attribute_importance = []
for item in part_worth_range:
    attribute_importance.append(round(100 * (item / sum(part_worth_range)),2))
    
# user-defined dictionary for printing descriptive attribute names     
effect_name_dict = {'brand' : 'Mobile Service Provider', \
    'startup' : 'Start-up Cost', 'monthly' : 'Monthly Cost', \
    'service' : 'Offers 4G Service', 'retail' : 'Has Nearby Retail Store', \
    'apple' : 'Sells Apple Products', 'samsung' : 'Sells Samsung Products', \
    'google' : 'Sells Google/Nexus Products'}  
 
# report conjoint measures to console 
index = 0  # initialize for use in for-loop
for item in conjoint_attributes:
    print('\nAttribute:', effect_name_dict[item])
    print('    Importance:', attribute_importance[index])
    print('    Level Part-Worths')
    for level in range(len(level_name[index])):
        print('       ',level_name[index][level], part_worth[index][level])       
    index = index + 1


Attribute: Mobile Service Provider
    Importance: 22.55
    Level Part-Worths
        AT&T 2.0000000000000018
        T-mobile 2.999999999999999
        US Cellular -2.7500000000000013
        Verizon -2.2499999999999996

Attribute: Start-up Cost
    Importance: 11.76
    Level Part-Worths
        $100  -2.220446049250313e-15
        $200  -1.5
        $300  4.6629367034256575e-15
        $400  1.4999999999999976

Attribute: Monthly Cost
    Importance: 33.33
    Level Part-Worths
        $100  0.25000000000000056
        $200  4.499999999999998
        $300  -0.7499999999999978
        $400  -4.000000000000002

Attribute: Offers 4G Service
    Importance: 8.82
    Level Part-Worths
        4G NO 1.1250000000000004
        4G YES -1.1250000000000004

Attribute: Has Nearby Retail Store
    Importance: 4.9
    Level Part-Worths
        Retail NO -0.6249999999999996
        Retail YES 0.6249999999999996

Attribute: Sells Apple Products
    Importance: 11.76
    Level Part-Worths
       

#### Interpretation

After shuffling our rankings, we can conclude the most important attributes are: Mobile service provider, monthly cost, start-up cost, and has a nearby retail store. 

Within those main attributes, the most-ranked would be: A Verizon phone start-up cost ~$100 and a monthly fee of around the same, $100.

#### Spine chart attempt

In [10]:
effect_df = pd.concat([pd.Series(effect_name_dict.keys()), pd.Series(attribute_importance)], axis=1)
effect_names = pd.DataFrame(effect_name_dict, index=range(1)).T
effect_names = pd.concat([effect_names, pd.Series(level_name, index=effect_names.index)], axis = 1, ignore_index=True)
effect_names = effect_names.rename(columns={0: 'attribute', 1: 'levels'})

effect_names

Unnamed: 0,attribute,levels
brand,Mobile Service Provider,"[AT&T, T-mobile, US Cellular, Verizon]"
startup,Start-up Cost,"[$100 , $200 , $300 , $400 ]"
monthly,Monthly Cost,"[$100 , $200 , $300 , $400 ]"
service,Offers 4G Service,"[4G NO, 4G YES]"
retail,Has Nearby Retail Store,"[Retail NO, Retail YES]"
apple,Sells Apple Products,"[APPLE NO, APPLE YES]"
samsung,Sells Samsung Products,"[Samsung NO, Samsung YES]"
google,Sells Google/Nexus Products,"[Nexus NO, Nexus YES]"


In [None]:
## Not sure where to go from here??

### Part 2

In [17]:
# import package for analysis and modeling
from rpy2.robjects import r   # interface from Python to R

r('library(arules)')  # association rules
r('library(arulesViz)')  # data visualization of association rules
r('library(RColorBrewer)')  # color palettes for plots

r('data(Groceries)')  # grocery transactions object from arules package

# show the dimensions of the transactions object
r('print(dim(Groceries))')

r('print(dim(Groceries)[1])')  # 9835 market baskets for shopping trips
r('print(dim(Groceries)[2])')  # 169 initial store items  

# examine frequency for each item with support greater than 0.025
r('png(file="fig_market_basket_initial_item_support.png", \
    width = 600, height = 800)')
r('itemFrequencyPlot(Groceries, support = 0.025, \
    cex.names=0.8, xlim = c(0,0.3), \
    type = "relative", horiz = TRUE, col = "dark red", las = 1, \
    xlab = paste("Proportion of Market Baskets Containing Item", \
      "\n(Item Relative Frequency or Support)"))')
r('dev.off()')    

# explore possibilities for combining similar items
r('print(head(itemInfo(Groceries)))') 
r('print(levels(itemInfo(Groceries)[["level1"]]))')  # 10 levels... too few 
r('print(levels(itemInfo(Groceries)[["level2"]]))')  # 55 distinct levels

# aggregate items using the 55 level2 levels for food categories
# to create a more meaningful set of items
r('groceries <- aggregate(Groceries, itemInfo(Groceries)[["level2"]])')  

r('print(dim(groceries)[1])')  # 9835 market baskets for shopping trips
r('print(dim(groceries)[2])')  # 55 final store items (categories)  

r('png(file="fig_market_basket_final_item_support.png", \
      width = 600, height = 800)')
r('itemFrequencyPlot(groceries, support = 0.025, \
       cex.names=1.0, xlim = c(0,0.5),\
       type = "relative", horiz = TRUE, col = "blue", las = 1,\
       xlab = paste("Proportion of Market Baskets Containing Item",\
    "\n(Item Relative Frequency or Support)"))')
r('dev.off()')   

# obtain large set of association rules for items by category and all shoppers
# this is done by setting very low criteria for support and confidence
r('first.rules <- apriori(groceries, \
       parameter = list(support = 0.001, confidence = 0.05))')
r('print(summary(first.rules))')  # yields 69,921 rules... too many

# select association rules using thresholds for support and confidence 
r('second.rules <- apriori(groceries, \
       parameter = list(support = 0.025, confidence = 0.05))')
r('print(summary(second.rules))')  # yields 344 rules
  
# data visualization of association rules in scatter plot
r('png(file="fig_market_basket_rules.png", width = 800, height = 800)')
r('plot(second.rules, \
       control=list(jitter=2, col = rev(brewer.pal(9, "Greens")[4:9])), \
  shading = "lift")')   
r('dev.off()')    
  
# grouped matrix of rules 
r('png(file="fig_market_basket_rules_matrix.png", \
       width = 800, height = 800)')
r('plot(second.rules, method="grouped", \
       control=list(col = rev(brewer.pal(9, "Greens")[4:9])))')
r('dev.off()')    

# select rules with vegetables in consequent (right-hand-side) item subsets
r('fruit.rules <- subset(second.rules, subset = rhs %pin% "fruit")')
r('fruit.rules <- head(sort(fruit.rules, by="confidence", decreasing=TRUE),10)')
r('inspect(fruit.rules)')  # 41 rules

# sort by lift and identify the top 10 rules
r('top.fruit.rules <- head(sort(fruit.rules, \
       decreasing = TRUE, by = "lift"), 10)')
r('inspect(top.fruit.rules)') 

r('png(file="fig_market_basket_farmer_rules.png", width = 800, height = 600)')
r('plot(top.fruit.rules, method="graph", \
       control=list(type="items"), \
       shading = "lift")')
r('dev.off()')  

# Suggestions for the student:
# Suppose your client is someone other than the local farmer,
# a meat producer/butcher, dairy, or brewer perhaps.
# Determine association rules relevant to your client's products
# guided by the market basket model. What recommendations
# would you make about future marketplace actions?

[1] 9835  169
[1] 9835
[1] 169
             labels  level2           level1
1       frankfurter sausage meat and sausage
2           sausage sausage meat and sausage
3        liver loaf sausage meat and sausage
4               ham sausage meat and sausage
5              meat sausage meat and sausage
6 finished products sausage meat and sausage
 [1] "canned food"          "detergent"            "drinks"              
 [4] "fresh products"       "fruit and vegetables" "meat and sausage"    
 [7] "non-food"             "perfumery"            "processed food"      
[10] "snacks and candies"  
 [1] "baby food"                       "bags"                           
 [3] "bakery improver"                 "bathroom cleaner"               
 [5] "beef"                            "beer"                           
 [7] "bread and backed goods"          "candy"                          
 [9] "canned fish"                     "canned fruit/vegetables"        
[11] "cheese"                          

R[write to console]:  Unknown control parameters: type



Available control parameters (with default values):
layout	 =  stress
circular	 =  FALSE
ggraphdots	 =  NULL
edges	 =  <environment>
nodes	 =  <environment>
nodetext	 =  <environment>
colors	 =  c("#EE0000FF", "#EEEEEEFF")
engine	 =  ggplot2
max	 =  100
verbose	 =  FALSE


0
1


#### Interpretation

This was a bit difficult to implement - the visuals did not produce as expected. However, we can conclude that folks who purchase fruit are most likely to have dairy produce, bread/baked goods, vegetables, and non-alcoholic beverages. 

I am including the visuals that compiled without error as well.