In [45]:
import os
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from IPython.display import Image
%matplotlib inline
plt.style.use('ggplot')

from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification;

pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 20)
pd.set_option('display.notebook_repr_html', True)

# Loading dataset
df = pd.read_csv(os.path.join('..', 'dataset', 'dataset-13-walget-train.csv'))
# df = df.set_index('business_id')




In [46]:
df.describe

<bound method DataFrame.describe of     AccountHolderImpliedGender AccountHolderAddress  \
0                       Female                 Home   
1                       Female               PO Box   
2                         Male            Apartment   
3                         Male                 Home   
4                       Female               PO Box   
5                       Female                 Home   
6                         Male            Apartment   
7                         Male            Apartment   
8                       Female            Apartment   
9                       Female            Apartment   
..                         ...                  ...   
590                       Male            Apartment   
591                       Male                 Home   
592                       Male            Apartment   
593                       Male                 Home   
594                       Male                 Home   
595                       Mal

Some Notes
Since the model is trying to classify between pregnant and not pregnant, we are planning to use a logistic regresssion
In terms of features, the features that we thought were most relevant are "RecentlyPurchasedPregnancyTest", "RecentlyPurchasedFolicAcidSupplements", "RecentlyPurchasedPrenatalVitamins", "RecentlyPurchasedMaternityClothing"

In [47]:
df.columns

Index([u'AccountHolderImpliedGender', u'AccountHolderAddress',
       u'RecentlyPurchasedPregnancyTest', u'RecentlyPurchasedBirthControl',
       u'RecentlyPurchasedFeminineHygieneProducts',
       u'RecentlyPurchasedFolicAcidSupplements',
       u'RecentlyPurchasedPrenatalVitamins',
       u'RecentlyPurchasedPrenatalYogaDVD', u'RecentlyPurchasedBodyPillow',
       u'RecentlyPurchasedGingerAle', u'RecentlyPurchasedSeaBands',
       u'PurchasedCigarettesRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedCigarettes',
       u'RecentlyPurchasedSmokingCessationProducts',
       u'PurchasedWineRegularlyUntilRecentlyThenStopped',
       u'RecentlyPurchasedWine', u'RecentlyPurchasedMaternityClothing',
       u'IsPregnant'],
      dtype='object')

In [48]:
df2 = df[['RecentlyPurchasedPregnancyTest', 'RecentlyPurchasedFolicAcidSupplements', 'RecentlyPurchasedPrenatalVitamins', 'RecentlyPurchasedMaternityClothing', 'IsPregnant']]
df2

Unnamed: 0,RecentlyPurchasedPregnancyTest,RecentlyPurchasedFolicAcidSupplements,RecentlyPurchasedPrenatalVitamins,RecentlyPurchasedMaternityClothing,IsPregnant
0,False,False,True,False,True
1,False,False,False,False,True
2,False,False,False,False,False
3,False,False,True,False,True
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,True,False,False
7,False,False,True,False,True
8,True,True,True,False,True
9,False,False,False,False,True


In [49]:
# Drop all nulls since is a small number
df2 = df2.dropna()
len(df2)

584

In [50]:
df2=df2*1
df2.head()

Unnamed: 0,RecentlyPurchasedPregnancyTest,RecentlyPurchasedFolicAcidSupplements,RecentlyPurchasedPrenatalVitamins,RecentlyPurchasedMaternityClothing,IsPregnant
0,0,0,1,0,1
1,0,0,0,0,1
2,0,0,0,0,0
3,0,0,1,0,1
4,0,0,0,0,0


In [51]:
df2.sum()

RecentlyPurchasedPregnancyTest            45.0
RecentlyPurchasedFolicAcidSupplements     65.0
RecentlyPurchasedPrenatalVitamins         82.0
RecentlyPurchasedMaternityClothing        77.0
IsPregnant                               293.0
dtype: float64

In [52]:
#Create X and y variables
X = df2.drop("IsPregnant", axis=1)
y = df2.IsPregnant

#Intialize, fit, and score the model
lr = LogisticRegression()

lr.fit(X,y)

score = lr.score(X,y)

print ("The model produces an accuracy score of {:.2f} percent".format(score*100))

The model produces an accuracy score of 75.68 percent


In [53]:
#Step 1
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.4,
                                                    random_state=42)
#Step 2
lr = LogisticRegression()
lr.fit(X_train, y_train)

#Step 3
preds = lr.predict(X_test)

#Step 4
testing_score = accuracy_score(y_test, preds)

print ("The model accurately classified {:.2f} percent of the testing data".format(testing_score*100))

The model accurately classified 76.07 percent of the testing data
