In [113]:
###Import the numpy and pandas 
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

import matplotlib as mpl
import matplotlib.pyplot as plt
##import seaborn as sns

In [35]:
%matplotlib inline

In [47]:
### Assuming that we have the train and the test dataset in the working directory
train_set=pd.read_csv("train_set.csv")
test_set=pd.read_csv("test_set.csv")

In [48]:
### Function which cleans the train and test dataset
def dataset_cleaning(df):
    RemoveColumns=['registrationcontactid','buildingid', 'housenumber', 'streetname', 'boro', 'zip', 'firstname', 'lastname', 'corporationname']

    df=df.drop(RemoveColumns, axis=1)
    ### Replace the NaN to Zero
    df=df.fillna(0)

    ### Create new DataFrames for the Categorical Variables
    registered_dummies=pd.get_dummies(df['registered'])
    recordstatus_dummies=pd.get_dummies(df['recordstatus'])
    contactdescription_dummies=pd.get_dummies(df['contactdescription'])

    ### Give some meaningful names to the dummy columns
    registered_dummies.columns=['No_Registered', 'Registered']
    recordstatus_dummies.columns=['Status_Active', 'Status_Inactive']
    contactdescription_dummies.columns=['Contract_COOP', 'Contract_CONDO']

    ### Remove the columns from the dataset
    df=df.drop(['registered', 'recordstatus', 'contactdescription' ], axis=1)

    ### Add the Dummy Columns
    df=pd.concat([df,registered_dummies, recordstatus_dummies, contactdescription_dummies], axis=1 )
    ### Remove the Reference Level
    df=df.drop(['Status_Inactive', 'No_Registered', 'Contract_COOP' ], axis=1)
    return df
    

In [117]:
### Load the train and test datasets and returning the clean Train and Test Datase using the function dataset_cleaning
train=dataset_cleaning(train_set)
test=dataset_cleaning(test_set)

In [136]:
#Create a Logistic Regression Class object
logreg=LogisticRegression()

### Run the Logistic Regression Model
X_train=train.drop(['churned'], axis=1)
Y_train=train.churned
Y_train=np.ravel(Y_train)  ##TRANSFORM TO AN 1D array
X_test=test.drop(['churned'], axis=1)


#Train the model with the training set
logreg.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [138]:
### Create as an Output the following columns Actual, Predicted and Probability for every row in the dataset
### The predicted is defined as "1" if the probability is higher than the top 5%
### Finally it returns the output in a csv file
Y_pred=logreg.predict(X_test)
Y_probs=DataFrame(logreg.predict_proba(X_test))[1]
Threshold=Y_probs.quantile(0.95)
output=DataFrame({'Predicted':Y_pred, 'Probability':Y_probs, 'Actual': test.churned})
output.loc[output.Probability>Threshold, 'Predicted']=1

In [123]:
output.to_csv("output.csv", sep=";")

In [139]:
###And the Confusion Matrix
cnf_matrix =DataFrame(confusion_matrix(output.Actual, output.Predicted))
cnf_matrix.columns=["Predicted 0", "Predicted 1"]
cnf_matrix.index=["Actual 0", "Actual 1"]
cnf_matrix

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,2145,239
Actual 1,21,2
