In [3]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
from scipy import stats

for pd_option in ['display.max_rows', 'display.max_colwidth', 'display.max_columns']:
    pd.set_option(pd_option, 500)  # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.set_option.html

In [4]:
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt
import seaborn as sns
from ggplot import *
%matplotlib inline
figsize(16, 6)

In [22]:

# Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, Normalizer, Imputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import RBFSampler
from sklearn.cluster import KMeans

# Models
import statsmodels.api as sm 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier # SGDClassifier(loss="log", penalty="elasticnet")
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.feature_selection import RFECV, chi2, f_classif, SelectFromModel, SelectPercentile, SelectKBest, VarianceThreshold

from sklearn.ensemble.partial_dependence import plot_partial_dependence
#features = [0, 1, (0, 1)]
#fig, axs = plot_partial_dependence(clf, X, features)

In [6]:
#Split values based on the two CSVs
training = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
testing = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")
training.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C


In [8]:
#Convert categorical values to integers
train_mini = training.copy()
train_mini.loc[train_mini['Embarked'] == 'C', 'Embarked'] = 1
train_mini.loc[train_mini['Embarked'] == 'Q', 'Embarked'] = 2
train_mini.loc[train_mini['Embarked'] == 'S', 'Embarked'] = 3

train_mini.loc[train_mini['Sex'] == 'male', 'Sex'] = 1
train_mini.loc[train_mini['Sex'] == 'female', 'Sex'] = 2

In [9]:
train_features = train_mini[['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare']]
target = train_mini['Survived']

In [12]:
train_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Pclass    891 non-null int64
Sex       891 non-null object
SibSp     891 non-null int64
Parch     891 non-null int64
Fare      891 non-null float64
dtypes: float64(1), int64(3), object(1)
memory usage: 34.9+ KB


In [17]:
features = train_features.copy()

In [18]:
logit_model = sm.Logit(target, features)

result = logit_model.fit(maxiter=50) 
print result.summary(), '\n\n'
cm = result.pred_table()
cm_df = pd.DataFrame(data=cm, columns=['churn_pred', 'renew_pred'], index=['churn', 'renew'])
print cm_df, '\n\n', (cm[[0]][0][0] + cm[[1]][0][1] * 1.0) / sum(sum(cm))

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [19]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
print "Training count {0}, testing count {1}".format(str(X_train.shape), str(X_test.shape))
print "Overall renewal rate from data {}, renewal rate in training {}, renewal rate in test {}".format(
    str(int(target.sum() * 1.0/target.shape[0]*100)), 
    str(int(y_train.sum()* 1.0/y_train.shape[0]*100)), 
    str(int(y_test.sum()* 1.0/y_test.shape[0]*100))
)

Training count (596, 5), testing count (295, 5)
Overall renewal rate from data 38, renewal rate in training 37, renewal rate in test 40


In [20]:
def get_model(model, feature_data, target_data):
    X_train, X_test, features = feature_data
    y_train, y_test, target = target_data
    model.fit(X_train, y_train)
    cm = confusion_matrix(y_test, model.predict(X_test))
    cm_df = pd.DataFrame(data=cm, columns=['churn_pred', 'renew_pred'], index=['churn', 'renew'])
    print cm_df, '\n', (cm[[0]][0][0] + cm[[1]][0][1] * 1.0) / sum(sum(cm))
    scores = cross_val_score(model, features, target, cv=5)
    print np.mean(scores), '\n', scores
    return model

In [None]:
#######################################################################################################################
# logistic regression model
print "Logistic Regression Model\n"

log_model = LogisticRegression(class_weight={0.0: 0.6, 1.0: 0.4}, C=1, fit_intercept=True)
log_model = get_model(log_model, [X_train, X_test, features], [y_train, y_test, target])


print '\n--------------------------------------------\n'

#######################################################################################################################
print "KNN Classifier"

knn_model = KNeighborsClassifier(n_neighbors=7, weights='distance', p=2)
knn_model = get_model(knn_model, [X_train, X_test, features], [y_train, y_test, target])

print '\n--------------------------------------------\n'

#######################################################################################################################
print "MLP Classifier"
mlp_model = MLPClassifier(hidden_layer_sizes=(1000, ), activation='logistic')
mlp_model = get_model(mlp_model, [X_train, X_test, features], [y_train, y_test, target])

print '\n--------------------------------------------\n'

#######################################################################################################################

print "RandomForest Classifier"
rf_model = RandomForestClassifier(
    class_weight={0.0: 0.6, 1.0: 0.4},
    max_features=None,
    max_depth=None, 
    min_samples_split=10, 
    min_samples_leaf=10, 
    n_estimators=1000
)
rf_model = get_model(rf_model, [X_train, X_test, features], [y_train, y_test, target])

print '\n--------------------------------------------\n'

#######################################################################################################################

print "Naive Bayes"
naive_model = GaussianNB()
naive_model = get_model(naive_model, [X_train, X_test, features], [y_train, y_test, target])

print '\n--------------------------------------------\n'

#######################################################################################################################

print "SVM Model"
svm_model = SVC(kernel='rbf', probability=True)
svm_model = get_model(svm_model, [X_train, X_test, features], [y_train, y_test, target])

print '\n--------------------------------------------\n'



#######################################################################################################################
print "Voting Classifier\n"
vote_model = VotingClassifier(
    estimators=[('log', log_model), ('knn', knn_model)], 
    voting='soft', 
    weights=[1, 1]
)

vote_model = get_model(vote_model, [X_train, X_test, features], [y_train, y_test, target])

Logistic Regression Model

       churn_pred  renew_pred
churn         170           5
renew          62          58 
0.772881355932
0.781210389295 
[ 0.74301676  0.79329609  0.78089888  0.7752809   0.81355932]

--------------------------------------------

KNN Classifier
       churn_pred  renew_pred
churn         145          30
renew          39          81 
0.766101694915
0.774512401114 
[ 0.73743017  0.73184358  0.81460674  0.80337079  0.78531073]

--------------------------------------------

MLP Classifier
       churn_pred  renew_pred
churn         136          39
renew          21          99 
0.796610169492
0.757683647737 
[ 0.66480447  0.7877095   0.76404494  0.78089888  0.79096045]

--------------------------------------------

RandomForest Classifier


In [None]:
categorical_columns = ['has_expansion', 'has_companygoal', 'has_employeegoal', 'has_subgoal', 'has_scheduled_report',
                       'has_champion', 'using_skilljar', 'has_core_features']
anova_filter = SelectKBest(f_classif, k=10)


feature_pipeline = Pipeline([
    ('imputer', Imputer(strategy='median', axis=0)),
    ('scaler', StandardScaler()),
    ('anova_filter', anova_filter)
])

features = feature_pipeline.fit_transform(X=train_features, y=target)

cols = [retention_features.columns[idx] for idx in feature_pipeline.named_steps['anova_filter'].get_support(indices=True)]
features = pd.DataFrame(features, columns=cols)

# not certain that I need to do this but I think that it makes sense, encode 
for col in cols:
    if col in categorical_columns:
        min_val = features[col].min()
        features[col] = features[col].apply(lambda x: 0 if x == min_val else 1)
        
print features.shape