In [59]:
# see also 06-Natural-Language_processing notebooks. This one focuses only on doc2vec.
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import smart_open
import collections
import scipy.stats as stats

from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
from gensim.test.utils import get_tmpfile

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

from nltk import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV¶
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
import xgboost

SyntaxError: invalid character in identifier (<ipython-input-59-9343757485d1>, line 31)

In [6]:
os.chdir('/Users/patrickrs/Documents/GitLab/patrick-steiner/Exercises')
target = pd.read_csv('data/train_target.csv', index_col = 0)
features = pd.read_csv('data/train_features.csv', index_col = 0)

In [3]:
#TO DO: Clean the columns (removing missing values)
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = re.sub("xxxx", "", sentence)
    sentence = re.sub("xxx", "", sentence)
    sentence = re.sub("xx", "", sentence)
    sentence = re.sub("\s\s+", " ", sentence)
       
    # stemming of words (seems not to affect accuracy, but should make things faster
    porter = PorterStemmer()
    words = word_tokenize(sentence)
    sentence = " ".join([porter.stem(word) for word in words])
      
    sentence = sentence.split(" ")
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
    sentence = " ".join(sentence)
    
    return sentence

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
features['Consumer complaint narrative'] = [clean_sentence(narrative) for narrative in features['Consumer complaint narrative']]

In [155]:
X_train, X_test, y_train, y_test = train_test_split(features['Consumer complaint narrative'], target, test_size=0.2, random_state=42)

In [177]:
train_corpus = []
for i in range(len(X_train)):
    train_corpus.append(TaggedDocument(words = word_tokenize(X_train.iloc[i]), tags = y_train.iloc[i]))
train_corpus[0]

TaggedDocument(words=['world', 'financ', 'corp', 'inaccur', 'report', 'credit', 'thi', 'account', 'reflect', 'open', 'imposs', 'becaus', 'account', 'charg', 'well', 'date', 'last', 'activ', 'report', 'differ', 'transunion', 'date', 'last', 'activ', 'imper', 'credit', 'report', 'becaus', 'directli', 'affect', 'long', 'advers', 'account', 'report', 'credit', 'disput', 'thi', 'account', 'came', 'back', 'report', 'accur', 'thi', 'creditor', 'violat', 'feder', 'state', 'law', 'follow', 'complaint', 'texa', 'consum', 'credit', 'commission', 'texa', 'attorney', 'gener'], tags=Product    Consumer Loan
Name: 19699, dtype: object)

In [171]:
test_corpus = []    
for i in range(len(X_test)):
    test_corpus.append(word_tokenize(X_test.iloc[i]))
test_corpus[:2]

[['wa',
  'lienfreez',
  'put',
  'check',
  'account',
  'district',
  'attorney',
  'offic',
  'call',
  'chase',
  'legal',
  'depart',
  '1014',
  'day',
  'lien',
  'releas',
  'thi',
  'past',
  'monday',
  'call',
  'chase',
  'legal',
  'dept',
  'wa',
  'told',
  'issu',
  'would',
  'expedit',
  'lien',
  'would',
  'releas',
  'within',
  '2448',
  'hour',
  'today',
  'friday',
  'lien',
  'still',
  'account',
  'spoke',
  'da',
  'offic',
  'spoke',
  'chase',
  'legal',
  'dept',
  '2',
  'hour',
  'told',
  'work',
  'fund',
  'held',
  'illeg',
  'thi',
  'point'],
 ['borrow',
  'much',
  'time',
  'parent',
  'could',
  'nt',
  'help',
  'pay',
  'colleg',
  'struggl',
  'loan',
  'went',
  'loan',
  'navient',
  'ask',
  'mani',
  'time',
  'wa',
  'abl',
  'give',
  'sort',
  'lump',
  'sum',
  'payoff',
  'lower',
  'amount',
  'owe',
  'wa',
  'told',
  'could',
  'onli',
  'repay',
  'found',
  'drown',
  'debt',
  'unmanag',
  'monthli',
  'payment',
  'got',
  

In [172]:
  d2v_model = Doc2Vec(vector_size = 50, 
                    window = 2, 
                    min_count = 2,
                    workers = -1,
                    epochs = 50)

In [173]:
d2v_model.build_vocab(train_corpus)
d2v_model.train(train_corpus, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

In [174]:
inferred_vector = d2v_model.infer_vector(train_corpus[0].words)
type(d2v_model.docvecs.most_similar([inferred_vector], topn=1)[0][0])

str

In [181]:
pred = []
for doc_id in range(len(train_corpus)):
    inferred_vector = d2v_model.infer_vector(train_corpus[doc_id].words)
    pred.append(d2v_model.docvecs.most_similar([inferred_vector], topn=1)[0][0])

In [182]:
print(classification_report(y_train, pred))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.08      0.09      0.09      1978
                                                 Checking or savings account       0.08      0.07      0.07      1977
                                                               Consumer Loan       0.08      0.09      0.08      1983
                                                                 Credit card       0.08      0.09      0.09      1988
                                                 Credit card or prepaid card       0.08      0.06      0.07      1983
                                                            Credit reporting       0.08      0.09      0.09      1979
Credit reporting, credit repair services, or other personal consumer reports       0.08      0.07      0.08      1977
                                                       

In [175]:
# testing the model
pred = []
for doc_id in range(len(test_corpus)):
    inferred_vector = d2v_model.infer_vector(test_corpus[doc_id])
    pred.append(d2v_model.docvecs.most_similar([inferred_vector], topn=1)[0][0])

In [176]:
print(classification_report(y_test, pred))

                                                                              precision    recall  f1-score   support

                                                     Bank account or service       0.00      0.00      0.00        22
                                                 Checking or savings account       0.17      0.13      0.15        23
                                                               Consumer Loan       0.11      0.12      0.11        17
                                                                 Credit card       0.00      0.00      0.00        12
                                                 Credit card or prepaid card       0.16      0.24      0.19        17
                                                            Credit reporting       0.18      0.14      0.16        21
Credit reporting, credit repair services, or other personal consumer reports       0.11      0.09      0.10        23
                                                       

In [None]:
list(y_test['Product'])[:5]

In [None]:
pred[:5]

In [None]:
test_features = pd.read_csv('data/test_features.csv', index_col = 0)
X_test2 = test_features['Consumer complaint narrative']

In [None]:
text_clf = Pipeline([
('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', svm.SVC())
])
text_clf.fit(X_train, y_train)

In [None]:
y_pred_test = text_clf.predict(X_test2)

In [None]:
y_pred_test = pd.DataFrame(y_pred_test)

In [None]:
y_pred_test['Id'] = y_pred_test.index

In [None]:
y_pred_test['Product'] = y_pred_test[0]

In [None]:
y_pred_test = y_pred_test.drop(columns = 0)

In [None]:
y_pred_test.head()

In [None]:
y_pred_test.to_csv("data/consumer_pred2", index = False)