In [60]:
# Importing necessary libraries

import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer
import sklearn
import pandas
import random
import matplotlib.pyplot as plt
%matplotlib inline  

In [61]:
# Load the raw contact-level export into a DataFrame.

raw_data = pandas.read_csv(
    'contactid_08_01_18download.csv',
    encoding="utf-8",
)

# Display the frame's dimensions as (number of rows, number of columns).
raw_data.shape

(18265, 36)

In [62]:
# Only rows with a recorded "close" value are usable for training/testing,
# so we keep just those rows in a new DataFrame.

has_close = raw_data['close'].notnull()
dfclean = raw_data.loc[has_close]

# Dimensions of the filtered frame as (number of rows, number of columns).
dfclean.shape

(2869, 36)

In [63]:
# To judge how clean the workable data is, first compute its total number of
# cells (rows x columns); this is the denominator for the missing-cell share.

row_count, column_count = dfclean.shape
num_of_cells = row_count * column_count
num_of_cells

103284

In [64]:
# Percentage of missing (NaN) values in each column, rounded to two decimals
# and formatted as strings such as '9.27%'.

null_fraction = dfclean.isnull().sum() / dfclean.shape[0]
df_null_columns = (null_fraction * 100).round(decimals=2).astype(str) + '%'

# Display the per-column percentages as a Series.
df_null_columns

Subject.Id                                            0.0%
Contact.ID                                            0.0%
cat                                                   0.0%
svy_contact                                           0.0%
empstatus                                            9.27%
jailprison                                          10.46%
close                                                 0.0%
baseline_flag                                         0.0%
dup                                                   0.0%
reinstall_flag1                                       0.0%
reinstall_flag2                                       0.0%
phonesurvey                                           0.0%
Total number of calls                                 0.0%
Total duration of calls                               0.0%
Total incoming calls                                  0.0%
Total outgoing calls                                  0.0%
Total number of missed calls                          0.

In [65]:
# Count the missing (NaN) values in every row; a count of 0 means the row is complete.

df_null_rows = dfclean.isnull().sum(axis='columns')
df_null_rows

2        4
17       4
24       0
28       3
33       2
36       2
46       2
52       2
61       0
67       2
68       2
76       0
87       0
88       0
99       4
101      2
103      2
106      4
107      0
113      0
114      0
116      2
119      2
120      0
121      4
123      4
125      2
126      0
127      0
128      0
        ..
18014    7
18015    7
18017    5
18038    7
18045    5
18058    7
18059    9
18062    9
18063    5
18071    7
18078    7
18095    5
18106    5
18135    9
18136    5
18144    6
18145    5
18169    9
18171    5
18174    8
18177    7
18190    7
18198    5
18205    9
18209    5
18220    8
18233    9
18237    5
18252    5
18254    5
Length: 2869, dtype: int64

In [66]:
# Share of rows that have one or more missing values, as a percentage string.

rows_with_nulls = (df_null_rows != 0).sum()
total_rows = df_null_rows.shape[0]
((rows_with_nulls / total_rows) * 100).round(decimals=2).astype(str) + '%'

'78.81%'

In [67]:
# Finally, the share of missing cells relative to all cells in the filtered data.

null_counts_per_column = dfclean.isnull().sum()
num_of_missing_cells = null_counts_per_column.sum()
missing_cell_pct = (num_of_missing_cells / num_of_cells) * 100
missing_cell_pct.round(decimals=2).astype(str) + '%'

'11.32%'

In [68]:
# Next we train and test models on this data, with the goal of predicting the closeness ("close") column for the original file as accurately as possible

In [69]:
# We begin with two introductory models:
#   - GradientBoostingClassifier
#   - GradientBoostingRegressor
# Before fitting them, the data must be put into a form the models can work
# with: every missing value is replaced with the sentinel -1.
# NOTE: the sentinel is applied to all columns, so non-numeric columns such as
# "empstatus" also receive -1 where they were missing.
dfcleanX = dfclean.fillna(-1)

# Preview only the first rows instead of rendering the whole frame.
dfcleanX.head(10)

Unnamed: 0,Subject.Id,Contact.ID,cat,svy_contact,empstatus,jailprison,close,baseline_flag,dup,reinstall_flag1,...,Number of outgoing calls at night,Any night incoming call,Any night outgoing call,Min days between calls,Max days between calls,Total number of texts,Any night text,Number of texts at night,Min days between texts,Max days between texts
2,1003,a0000022d8b566-682408058,Phone survey,Person,Working Formally,0.0,60.0,1.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,-1.0,4.0,0.0,0.0,-1.0,-1.0
17,1004,a0000022aa5cd6-212065659,Phone survey,Person,Working Formally,0.0,100.0,0.0,0.0,1.0,...,0.0,0.0,0.0,-1.0,-1.0,2.0,0.0,0.0,-1.0,-1.0
24,1004,a0000022aa5cd61447330108,Phone survey,Person,Not Working,0.0,100.0,0.0,1.0,1.0,...,4.0,1.0,1.0,2.0,2.0,14.0,1.0,12.0,1.0,4.0
28,1004,a0000022aa5cd62103159229,Phone survey,Person,-1,1.0,55.0,0.0,0.0,1.0,...,5.0,0.0,1.0,1.0,3.0,2.0,1.0,2.0,-1.0,-1.0
33,1004,a0000022aa5cd6846559228,Phone survey,Person,Working Informally,1.0,100.0,0.0,1.0,1.0,...,11.0,1.0,1.0,1.0,7.0,7.0,1.0,2.0,-1.0,-1.0
36,1004,a0000022d95d9b-1043125280,Phone survey,Person,Not Working,0.0,100.0,0.0,1.0,1.0,...,0.0,0.0,0.0,3.0,3.0,1.0,0.0,0.0,-1.0,-1.0
46,1004,a0000022d95d9b-879869607,Phone survey,Person,Self-employed,0.0,100.0,0.0,0.0,1.0,...,0.0,0.0,0.0,-1.0,-1.0,3.0,1.0,1.0,34.0,34.0
52,1004,a0000022d95d9b17610566,Phone survey,Person,Not Working,0.0,100.0,0.0,0.0,1.0,...,3.0,0.0,1.0,-1.0,-1.0,3.0,1.0,1.0,5.0,5.0
61,1004,a0000022d95d9b846559228,Phone survey,Person,Working Informally,1.0,100.0,0.0,0.0,1.0,...,9.0,1.0,1.0,1.0,15.0,5.0,1.0,2.0,2.0,33.0
67,1005,a0000022aa2754-765626467,Phone survey,Person,Working Formally,1.0,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,7.0,7.0,1.0,0.0,0.0,-1.0,-1.0


In [70]:
# Pick the call and text feature columns we are interested in and convert
# them to a NumPy array for use as model inputs.

feature_columns = [
    'Total number of calls',
    'Total duration of calls',
    'Total incoming calls',
    'Total outgoing calls',
    'Total number of missed calls',
    'Total number of incoming calls, excluding missed',
    'Proportion of incoming calls answered',
    'Proportion of calls incoming',
    'Duration of incoming calls',
    'Duration of outgoing calls',
    'Total number of night calls',
    'Any night call',
    'Number of incoming calls at night',
    'Number of outgoing calls at night',
    'Any night incoming call',
    'Any night outgoing call',
    'Min days between calls',
    'Max days between calls',
    'Total number of texts',
    'Any night text',
    'Number of texts at night',
    'Min days between texts',
    'Max days between texts',
]
xdata = dfcleanX[feature_columns].values

In [72]:
# We implement a random 80/20 split of the data to be used for train/test

# A dedicated, seeded generator makes the shuffle reproducible on every run.
# (Writing `random.seed = 0` would overwrite the seeding function instead of
# calling it, leaving the shuffle unseeded.)
rng = np.random.RandomState(0)

# Shuffle the row positions of xdata; the row count is taken from the data
# itself rather than hard-coded.
indices = rng.permutation(len(xdata))
num_train = int(len(indices)*0.8)
train_indices = indices[:num_train]
# Start the test slice exactly where the training slice ends so no row is dropped.
test_indices = indices[num_train:]

# Every row must land in exactly one of the two sets.
assert len(train_indices) + len(test_indices) == len(indices)

# Naming variables that will be used for training/testing
xtrain = xdata[train_indices]
xtest = xdata[test_indices]

# Target values: the "close" column as a 1-D array, positionally aligned with xdata
y = dfclean['close'].values
ytrain = y[train_indices]
ytest = y[test_indices]

In [None]:
# GradientBoostingClassifier

In [None]:
# GradientBoostingRegressor

In [None]:
# Finally, we look at two more advanced models that help us visualize the data:
# SVR
# MLPRegressor
# Before we begin with these models, we need to put the data into a format they can work with, using:
# Imputer

In [None]:
# Imputer and format data

In [None]:
# SVR

In [None]:
# MLPRegressor

In [None]:
# Finally we write out the DataFrame with the "Predicted Closeness" column.
# NOTE(review): `dfRaw` is not defined in any cell visible here -- presumably it
# is meant to be the raw data with the predictions added; confirm before running.

# `to_csv` returns None when given a path, so its result must not be assigned
# back to `dfRaw` (that would replace the DataFrame with None).
dfRaw.to_csv('contactid_08_01_18Predictions.csv', encoding="utf-8")