In [1]:
# Import the required libraries
import pandas as pd
import numpy as np

import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.stem import PorterStemmer
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import roc_auc_score

from sklearn.cross_validation import cross_val_score
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the train/test CSV files from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Keep the test row index; it is written out as the 'Index' column of the submission.
test_index = test_data.index

# print() with a single argument gives the same output on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(train_data.shape)

(1157, 9)


In [3]:
# Remove the rows of the train data that have no text at all.
# Done in place, so `train_data` remains the same DataFrame object for the cells below.
required_text_columns = ['TRANS_CONV_TEXT']
train_data.dropna(subset=required_text_columns, axis=0, inplace=True)

In [4]:
# Clean the text. Convert it to lowercase, and remove any special characters or digits if any
# The stopwords such as I, me, us etc were not removed because such words are most likely to be associated with a Patient view.

def clean_conversation(text):
    """Lower-case ``text`` and replace every non-letter character with a space.

    Stop words are deliberately left in (see the note above the function).

    Parameters
    ----------
    text : str, None or float NaN
        Raw conversation text. A missing value (``None`` or NaN) is accepted.

    Returns
    -------
    str
        The cleaned text, containing only the characters ``a-z`` and spaces.
        A missing value yields the empty string.
    """
    # pandas represents an empty CSV cell as float NaN, and only the training
    # frame has such rows dropped, so guard here instead of failing on `.lower()`.
    # NaN is the only float for which `x != x` holds.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # After lower-casing no upper-case ASCII letter can remain, so a-z is sufficient.
    return re.sub("[^a-z]", " ", text.lower())

In [5]:
# Clean the conversation text of both frames.
# Applying the function to the column itself avoids building a row object for every
# record (as DataFrame.apply(axis=1) does) and also works when a frame has no rows.
train_data['TRANS_CONV_TEXT'] = train_data['TRANS_CONV_TEXT'].apply(clean_conversation)
test_data['TRANS_CONV_TEXT'] = test_data['TRANS_CONV_TEXT'].apply(clean_conversation)

In [6]:
# Bag-of-words counts, limited to a vocabulary of at most 1000 words.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1000,
)

# Learn the vocabulary from the training text only ...
count_feature = vectorizer.fit_transform(train_data['TRANS_CONV_TEXT'])
# ... and reuse that same vocabulary for the test text. Calling fit_transform here
# would rebuild the vocabulary from the test set, so column j of the test matrix
# would stand for a different word than column j of the matrix the model is trained on.
count_feature_test = vectorizer.transform(test_data['TRANS_CONV_TEXT'])

In [7]:
# TRAIN DATA
# Term-frequency transform without IDF re-weighting (use_idf=False).
tf_transformer = TfidfTransformer(use_idf=False)
X_train = tf_transformer.fit_transform(count_feature)
X_train.shape

(1156, 1000)

In [8]:
# TEST DATA
# NOTE(review): `tf_transformer_test` is fitted on the test counts but is not used in
# the cells shown here; the test features come from the train-fitted `tf_transformer`.
tf_transformer_test = TfidfTransformer(use_idf=False)
tf_transformer_test.fit(count_feature_test)
X_test = tf_transformer.transform(count_feature_test)
X_test.shape

(571, 1000)

In [52]:
# Getting the best parameters
# Exhaustive search over the number of trees and the maximum tree depth.

rf_parameters = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [20, 40, 60, 80],
}
# A fixed seed makes the search return the same answer on every run.
rf_model = RandomForestClassifier(random_state=42)
# Select on ROC AUC, the metric every model in this notebook is evaluated with;
# without `scoring` the search would rank candidates by the estimator's default score.
clf = GridSearchCV(rf_model, rf_parameters, scoring='roc_auc')
clf.fit(X_train, train_data.Patient_Tag)

clf.best_params_

{'max_depth': 60, 'n_estimators': 200}

In [53]:
# Scores noted from earlier runs, written as "max_depth and n_estimators - score"
# (the value pairs match the parameter grid searched above; presumably ROC AUC - TODO confirm):
# 80 and 100 - 0.964
# 20 and 200 - 0.960
# 60 and 150 - 0.9573
# 20 and 50 - 0.9642
# 60 and 200 - 0.9652
# Cross Validation

# random_state pins the randomness of the forest so the fold scores are reproducible.
model = RandomForestClassifier(n_estimators=200, max_depth=60, random_state=42)
# 4 folds over the training rows; no shuffling is requested.
kfold = cross_validation.KFold(X_train.shape[0], n_folds=4)
result = cross_val_score(model, X_train, train_data.Patient_Tag, cv=kfold, scoring='roc_auc')
result

array([ 0.96997052,  0.93800714,  0.96821476,  0.96000728])

In [54]:
# Applying Random Forest Classifier - Best Score
# Hold-out check: fit on the first 750 rows and score ROC AUC on the remaining rows.

# random_state makes the reported score (and the model refitted later) reproducible.
model = RandomForestClassifier(n_estimators=100, max_depth=80, random_state=42)
# .iloc slices the labels by position, matching the positional slicing of X_train
# (the index of train_data has gaps after the rows without text were dropped).
model.fit(X_train[:750], train_data.Patient_Tag.iloc[:750])
result = model.predict_proba(X_train[750:])
# Column 1 of predict_proba is used as the score for the ROC AUC.
roc_auc_score(train_data.Patient_Tag.iloc[750:], result[:, 1])

0.96529700913874272

In [56]:
# Applying model
# Refit the current `model` on all training rows and score the test set.
model.fit(X_train, train_data.Patient_Tag)
prob = model.predict_proba(X_test)

# Submission file: the test row index plus column 1 of the predicted probabilities.
submission = pd.DataFrame(
    {'Index': test_index, 'Output': prob[:, 1]},
    columns=['Index', 'Output'],
)
submission.to_csv('Pavleen_Kaur_19Nov95.csv', index=False)

In [14]:
# Applying Naive Bayes Classifier
# ROC AUC of a multinomial Naive Bayes model over 4 folds of the training rows.

kfold = cross_validation.KFold(X_train.shape[0], n_folds=4)
model = MultinomialNB()
result = cross_val_score(
    model,
    X_train,
    train_data['Patient_Tag'],
    scoring='roc_auc',
    cv=kfold,
)

result

array([ 0.94716286,  0.93979228,  0.96337286,  0.94490539])

In [15]:
# Hold-out check for Naive Bayes: fit on the first 750 rows, score ROC AUC on the rest.
holdout_start = 750
labels = train_data['Patient_Tag']
model = MultinomialNB()
model.fit(X_train[:holdout_start], labels[:holdout_start])
result = model.predict_proba(X_train[holdout_start:])
# Column 1 of predict_proba is used as the score.
holdout_scores = np.array(result[:, 1])
roc_auc_score(labels[holdout_start:], holdout_scores)

0.94354057047909168

In [16]:
# Applying Logistic Regression
# ROC AUC of a default LogisticRegression over 4 folds of the training rows.
kfold = cross_validation.KFold(X_train.shape[0], n_folds=4)
model = LogisticRegression()
result = cross_val_score(
    model,
    X_train,
    train_data['Patient_Tag'],
    scoring='roc_auc',
    cv=kfold,
)

result

array([ 0.95165807,  0.93865628,  0.96587615,  0.95422125])

In [17]:
# Hold-out check for Logistic Regression: fit on the first 750 rows, score ROC AUC on the rest.
holdout_start = 750
labels = train_data['Patient_Tag']
model = LogisticRegression()
model.fit(X_train[:holdout_start], labels[:holdout_start])
result = model.predict_proba(X_train[holdout_start:])
# Column 1 of predict_proba is used as the score.
holdout_scores = np.array(result[:, 1])
roc_auc_score(labels[holdout_start:], holdout_scores)

0.95638327333148709