# AMC Theatre Analytics

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from patsy import dmatrices

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier

#for validating your classification model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Loading Data

In [9]:
# Read the sample CSV into a DataFrame and preview the first five rows.
df = pd.read_csv("AMCSampleData_InsiderPremier.csv")
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(df.head())

   ISPREMIER  AGE21PLUSINDICATOR  AGE18TO21INDICATOR  AGE13TO18INDICATOR  \
0          1                   1                   0                   0   
1          1                   1                   0                   0   
2          1                   1                   0                   0   
3          0                   1                   0                   0   
4          1                   1                   0                   0   

   GENDERCODE   BIRTHDATE  AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR  \
0           3   1/13/1976                                        0   
1           1    7/6/1983                                        0   
2           3  10/20/1970                                        0   
3           0   7/10/1967                                        0   
4           1    8/1/1994                                        1   

   AMCSTUBSREWARDSEMAILOPTININDICATOR  AMCSTUBSREWARDSMOBILEOPTININDICATOR  \
0                                   0       

In [10]:
# Summarize every column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47168 entries, 0 to 47167
Data columns (total 24 columns):
ISPREMIER                                   47168 non-null int64
AGE21PLUSINDICATOR                          47168 non-null int64
AGE18TO21INDICATOR                          47168 non-null int64
AGE13TO18INDICATOR                          47168 non-null int64
GENDERCODE                                  47168 non-null int64
BIRTHDATE                                   47168 non-null object
AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR     47168 non-null int64
AMCSTUBSREWARDSEMAILOPTININDICATOR          47168 non-null int64
AMCSTUBSREWARDSMOBILEOPTININDICATOR         47168 non-null int64
AMCSTUBSREWARDSSMSOPTININDICATOR            47168 non-null int64
AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR    47168 non-null int64
AMCSTUBSTHEATRESMSOFFEROPTININDICATOR       47168 non-null int64
LIFETIMEAMCSTUBSAWARDSEARNEDAMOUNT          47168 non-null float64
LIFETIMEAMCSTUBSSPENDAMOUNT                

In [11]:
# drop unnecessary columns
# BIRTHDATE is a raw date string (object dtype), so it cannot go into the numeric
# models below as-is.
# errors='ignore' keeps this cell idempotent: re-running it after the column has
# already been removed is a no-op instead of a KeyError.
df = df.drop(['BIRTHDATE'], axis=1, errors='ignore')

In [12]:
# correlation
# Pairwise correlation matrix of the numeric columns (pandas' default is Pearson).
# The ISPREMIER row shows each feature's linear association with the target.
df.corr()

Unnamed: 0,ISPREMIER,AGE21PLUSINDICATOR,AGE18TO21INDICATOR,AGE13TO18INDICATOR,GENDERCODE,AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR,AMCSTUBSREWARDSEMAILOPTININDICATOR,AMCSTUBSREWARDSMOBILEOPTININDICATOR,AMCSTUBSREWARDSSMSOPTININDICATOR,AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR,...,ONLINEFEESWAIVEDAMOUNT,ESTIMATEDINCOMERANGE,ESTIMATEDHOMEMARKETVALUERANGE,CHILDRENPRESENCE,HASFREQUENTVISITTHEATRENUMBER,TICKETQTYSUM,TICKETBALANCEDUESUM,CONCQTYSUM,CONCLOYDISCOUNTAMTSUM,CONCBALANCEDUESUM
ISPREMIER,1.0,0.061112,-0.055834,-0.055062,-0.259107,0.01767,0.016929,0.096219,-0.002351,0.101791,...,0.065082,0.131807,0.136758,-0.109527,-0.095963,0.454074,0.448898,0.33703,-0.336218,0.346307
AGE21PLUSINDICATOR,0.061112,1.0,-0.752175,-0.575614,-0.045974,-0.021198,-0.011316,-0.01891,-0.00826,-0.01847,...,0.014899,0.066996,0.053564,-0.054983,0.03675,0.070456,0.06497,0.048383,-0.030221,0.050361
AGE18TO21INDICATOR,-0.055834,-0.752175,1.0,-0.050069,0.031229,-0.000344,-0.003268,-0.006016,-0.001702,-0.006367,...,-0.014371,-0.049905,-0.038279,0.043359,-0.012676,-0.05217,-0.044477,-0.044913,0.02861,-0.045631
AGE13TO18INDICATOR,-0.055062,-0.575614,-0.050069,1.0,0.038517,0.031067,0.021269,0.031597,-0.001304,0.031809,...,-0.013728,-0.053198,-0.04429,0.044753,-0.039047,-0.066338,-0.062233,-0.039085,0.026098,-0.040643
GENDERCODE,-0.259107,-0.045974,0.031229,0.038517,1.0,0.049828,0.015141,0.288459,0.003246,0.275386,...,-0.033293,-0.211743,-0.243629,0.198341,0.042094,-0.161999,-0.157961,-0.030185,0.097442,-0.02559
AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR,0.01767,-0.021198,-0.000344,0.031067,0.049828,1.0,0.790356,0.417696,0.003084,0.421257,...,-0.031632,-0.062426,-0.085104,0.049842,-0.035628,-0.055879,-0.091406,0.123013,-0.038158,0.127483
AMCSTUBSREWARDSEMAILOPTININDICATOR,0.016929,-0.011316,-0.003268,0.021269,0.015141,0.790356,1.0,0.346564,0.002637,0.350151,...,-0.029093,-0.061998,-0.078715,0.050123,-0.021071,-0.050431,-0.077944,0.101356,-0.02991,0.10516
AMCSTUBSREWARDSMOBILEOPTININDICATOR,0.096219,-0.01891,-0.006016,0.031597,0.288459,0.417696,0.346564,1.0,-4e-05,0.981348,...,-0.033784,-0.201097,-0.220956,0.187706,0.038848,-0.003603,-0.031016,0.216142,-0.06473,0.224316
AMCSTUBSREWARDSSMSOPTININDICATOR,-0.002351,-0.00826,-0.001702,-0.001304,0.003246,0.003084,0.002637,-4e-05,1.0,-0.006617,...,-2e-05,0.015693,0.017376,-0.013487,0.002147,-0.002186,-0.002947,-0.003012,0.001265,-0.003093
AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR,0.101791,-0.01847,-0.006367,0.031809,0.275386,0.421257,0.350151,0.981348,-0.006617,1.0,...,-0.033195,-0.196742,-0.216951,0.183563,0.038253,-0.000434,-0.029483,0.217107,-0.067576,0.224778


# Model Building

In [13]:
# Separate the predictors from the target column.
# X: every column except ISPREMIER; y: the ISPREMIER column itself.
X = df.drop('ISPREMIER', axis=1)
y = df['ISPREMIER']

# Recursive Feature Elimination (RFE)

In [21]:
# Rank the candidate features by recursive feature elimination, using a
# logistic regression as the underlying estimator.
model = LogisticRegression()
# Pass the feature count by keyword: newer scikit-learn releases only accept the
# estimator positionally, so RFE(model, 10) fails there.
rfe = RFE(model, n_features_to_select=10)  # asking for the 10 best attributes
rfe = rfe.fit(X, y)
# summarize the selection of the attributes
print(rfe.support_)  # boolean mask aligned with X.columns: True = feature kept
print(rfe.ranking_)  # rank per feature: 1 = kept, larger = eliminated earlier

[ True  True  True  True  True False  True False  True False False False
 False False False False  True  True False False  True False]
[ 1  1  1  1  1  2  1 11  1 12  9 10 13  5  6  3  1  1  7  4  1  8]


In [22]:
# RFE rank of every feature, listed in X's original column order (not sorted).
# NOTE: the 'importance' column holds rfe.ranking_, so 1 means "selected by RFE"
# and larger numbers mean the feature was eliminated earlier.
pd.DataFrame({'feature':X.columns, 'importance':rfe.ranking_})

Unnamed: 0,feature,importance
0,AGE21PLUSINDICATOR,1
1,AGE18TO21INDICATOR,1
2,AGE13TO18INDICATOR,1
3,GENDERCODE,1
4,AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR,1
5,AMCSTUBSREWARDSEMAILOPTININDICATOR,2
6,AMCSTUBSREWARDSMOBILEOPTININDICATOR,1
7,AMCSTUBSREWARDSSMSOPTININDICATOR,11
8,AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR,1
9,AMCSTUBSTHEATRESMSOFFEROPTININDICATOR,12


In [23]:
# select 7 most significant features only - not selecting age variables
# These are the RFE rank-1 features with the three AGE* indicators left out.
X_logistic = df[[
    'GENDERCODE',
    'AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR',
    'AMCSTUBSREWARDSMOBILEOPTININDICATOR',
    'AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR',
    'HASFREQUENTVISITTHEATRENUMBER',
    'TICKETQTYSUM',
    'CONCLOYDISCOUNTAMTSUM',
]]
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(X_logistic.head())

   GENDERCODE  AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR  \
0           3                                        0   
1           1                                        0   
2           3                                        0   
3           0                                        0   
4           1                                        1   

   AMCSTUBSREWARDSMOBILEOPTININDICATOR  \
0                                    0   
1                                    1   
2                                    0   
3                                    1   
4                                    1   

   AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR  HASFREQUENTVISITTHEATRENUMBER  \
0                                         0                              1   
1                                         1                              0   
2                                         0                              1   
3                                         0                              1   
4       

In [24]:
# Split validation on the 7-feature subset: hold out 30% of the rows for testing;
# random_state=0 pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_logistic, y, test_size=0.3, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Model evaluation
# Predict once and reuse the labels for every metric instead of re-running
# inference four times.
y_pred = lr.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
# NOTE(review): this AUC is computed from hard 0/1 predictions, i.e. a one-point
# ROC curve, which equals (TPR + TNR) / 2. Scoring lr.predict_proba(X_test)[:, 1]
# instead would give the usual threshold-independent AUC. Left unchanged so the
# number stays comparable with the figure already reported for this model.
print(metrics.roc_auc_score(y_test, y_pred))

0.923892304431
[[12287   241]
 [  836   787]]
             precision    recall  f1-score   support

          0       0.94      0.98      0.96     12528
          1       0.77      0.48      0.59      1623

avg / total       0.92      0.92      0.92     14151

0.73283379426


> **92.39% model accuracy**
- **0.733 AUC score; AUC = Area Under Curve**
    - TPR = (true positives/total positives) = 787/1623 = 0.48
    - FPR = (false positives/total negatives) = 241/12528 = 0.02

# Logistic Regression

In [27]:
# evaluate the model by splitting into train and test sets (split validation)
# This fit uses every column of X rather than the 7-feature X_logistic subset.
# NOTE: X_train, X_test, y_train, y_test and lr are rebound here, replacing the
# objects created for the X_logistic model; cells below see only these new values.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
lr = LogisticRegression()
# Last expression of the cell, so the fitted estimator's repr is the cell output.
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# You just did split validation.
# Report the following model evaluation metrics: overall model accuracy, confusion matrix, AUC score, classification report
# Predict once and reuse the labels for every metric instead of re-running
# inference four times.
y_pred = lr.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))
# NOTE(review): this AUC is computed from hard 0/1 predictions, i.e. a one-point
# ROC curve, which equals (TPR + TNR) / 2. Scoring lr.predict_proba(X_test)[:, 1]
# instead would give the usual threshold-independent AUC. Left unchanged so the
# number stays comparable with the figure already reported for this model.
print(metrics.roc_auc_score(y_test, y_pred))

0.927920288319
[[12296   232]
 [  788   835]]
             precision    recall  f1-score   support

          0       0.94      0.98      0.96     12528
          1       0.78      0.51      0.62      1623

avg / total       0.92      0.93      0.92     14151

0.747980420346


> **92.8% model accuracy**
- **0.748 AUC score; AUC = Area Under Curve**
    - TPR = (true positives/total positives) = 835/1623 = 0.51
    - FPR = (false positives/total negatives) = 232/12528 = 0.02

## Logit Model

In [29]:
# instantiate a logit model, and fit it with X and y (the full data set, no train/test split)
# this is like a statistical model, rather than a data mining model ...
# NOTE(review): statsmodels' Logit does not add an intercept by itself and X is
# passed without a constant column here - confirm that is intended.
# NOTE(review): the recorded run stopped after 35 iterations and the summary
# reports "converged: False" with nan standard errors, so the coefficients and
# intervals printed below should be read with caution.
logit = sm.Logit(y, X) 
model_logit = logit.fit()


         Current function value: 0.198089
         Iterations: 35




In [34]:
# In-sample predicted probabilities: predict() with no arguments scores the
# data the model was fitted on.
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(model_logit.predict())

[ 1.          1.          0.99999993 ...,  0.07482706  0.01434115
  0.08737244]


In [30]:
# logit summary: coefficient table plus fit statistics (pseudo R-squared,
# log-likelihood, convergence flag)
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(model_logit.summary())

                           Logit Regression Results                           
Dep. Variable:              ISPREMIER   No. Observations:                47168
Model:                          Logit   Df Residuals:                    47147
Method:                           MLE   Df Model:                           20
Date:                Fri, 07 Jul 2017   Pseudo R-squ.:                  0.4460
Time:                        19:07:12   Log-Likelihood:                -9343.5
converged:                      False   LL-Null:                       -16866.
                                        LLR p-value:                     0.000
                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------
AGE21PLUSINDICATOR                          -2.0198        nan        nan        nan         nan         nan
AGE18TO21INDICATOR                       

  return np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [31]:
# logit pred_table()
# Confusion table on the fitting data: entry [i, j] counts rows whose actual
# label is i and whose predicted label is j (default probability cut-off 0.5),
# so correct predictions lie on the diagonal.
model_logit.pred_table()

array([[ 40943.,    784.],
       [  2639.,   2802.]])

In [32]:
# look at the confidence interval of each coefficient
# (column 0 = lower bound, column 1 = upper bound; statsmodels defaults to 95%)
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(model_logit.conf_int())

                                                     0             1
AGE21PLUSINDICATOR                                 NaN           NaN
AGE18TO21INDICATOR                       -2.746688e+00 -2.609367e+00
AGE13TO18INDICATOR                       -3.345860e+00 -3.110885e+00
GENDERCODE                               -7.689131e-01 -6.984431e-01
AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR  -4.098909e-01 -8.777185e-02
AMCSTUBSREWARDSEMAILOPTININDICATOR       -6.534532e-02  2.951747e-01
AMCSTUBSREWARDSMOBILEOPTININDICATOR       3.582846e-01  9.388133e-01
AMCSTUBSREWARDSSMSOPTININDICATOR         -1.290225e+20  1.290225e+20
AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR  5.993992e-01  1.178814e+00
AMCSTUBSTHEATRESMSOFFEROPTININDICATOR    -1.290225e+20  1.290225e+20
LIFETIMEAMCSTUBSAWARDSEARNEDAMOUNT        7.543243e-02  1.075617e-01
LIFETIMEAMCSTUBSSPENDAMOUNT              -1.109674e-02 -7.716727e-03
ONLINEFEESWAIVEDAMOUNT                   -3.430720e-03 -6.780029e-04
ESTIMATEDINCOMERANGE              

In [33]:
# odds ratios only: exponentiating a logit coefficient gives the multiplicative
# change in the odds for a one-unit increase in that feature
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(np.exp(model_logit.params))

AGE21PLUSINDICATOR                          0.132682
AGE18TO21INDICATOR                          0.068699
AGE13TO18INDICATOR                          0.039622
GENDERCODE                                  0.480140
AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR     0.779711
AMCSTUBSREWARDSEMAILOPTININDICATOR          1.121778
AMCSTUBSREWARDSMOBILEOPTININDICATOR         1.912763
AMCSTUBSREWARDSSMSOPTININDICATOR            0.000118
AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR    2.432955
AMCSTUBSTHEATRESMSOFFEROPTININDICATOR       0.000118
LIFETIMEAMCSTUBSAWARDSEARNEDAMOUNT          1.095814
LIFETIMEAMCSTUBSSPENDAMOUNT                 0.990637
ONLINEFEESWAIVEDAMOUNT                      0.997948
ESTIMATEDINCOMERANGE                        1.007856
ESTIMATEDHOMEMARKETVALUERANGE               0.967235
CHILDRENPRESENCE                            1.042399
HASFREQUENTVISITTHEATRENUMBER               0.061855
TICKETQTYSUM                                1.159409
TICKETBALANCEDUESUM                         1.

## Build Stats Model

In [38]:
#import packages 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#regression packages
import sklearn.linear_model as lm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
import statsmodels.formula.api as sm

#lasso regression
from sklearn import linear_model

#f_regression (feature selection)
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

# recursive feature selection (feature selection)
from sklearn.feature_selection import RFE

In [39]:
# build model with 7 significant variables identified in feature selection
# NOTE: `sm` here is statsmodels.formula.api (rebound by the import cell just
# above), not the statsmodels.api module used for sm.Logit earlier.
# The formula is split across adjacent string literals for readability; they
# concatenate to exactly the same single formula string as before.
reg_model = sm.ols(
    "ISPREMIER~GENDERCODE"
    "+AMCSTUBSLOYALTYEMAILOFFEROPTININDICATOR"
    "+AMCSTUBSREWARDSMOBILEOPTININDICATOR"
    "+AMCSTUBSTHEATREMOBILEOFFEROPTININDICATOR"
    "+HASFREQUENTVISITTHEATRENUMBER"
    "+TICKETQTYSUM"
    "+CONCLOYDISCOUNTAMTSUM",
    df,
)
runs_model = reg_model.fit()
# print(...) with a single argument behaves the same on Python 2 and also runs on Python 3.
print(runs_model.summary())

                            OLS Regression Results                            
Dep. Variable:              ISPREMIER   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     3812.
Date:                Fri, 07 Jul 2017   Prob (F-statistic):               0.00
Time:                        19:24:57   Log-Likelihood:                -2526.2
No. Observations:               47168   AIC:                             5068.
Df Residuals:                   47160   BIC:                             5139.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------