# SEE ROBERTA NEAR THE BOTTOM OF THIS PYTHON NOTEBOOK 

In [0]:
!pip install pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 2.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 67.9MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 24.6MB/s 
Collecting regex
[?25l  Downloading https://files.pythonhosted.org/packages/e3/8e/cbf2295643d7265e7883326fb4654e643bfc93b3a8a8274d8010a39d8804/regex-2019.11.1-cp36-cp36m-manylinux1_x86_64.whl (643kB)
[K     |█

In [0]:
%%writefile utils.py 

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT classification fine-tuning: utilities to work with GLUE tasks """

from __future__ import absolute_import, division, print_function

import csv
import logging
import os
import sys
from io import open

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

from multiprocessing import Pool, cpu_count
from tqdm import tqdm

logger = logging.getLogger(__name__)
csv.field_size_limit(2147483647)

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.

        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
                if sys.version_info[0] == 2:
                    line = list(unicode(cell, 'utf-8') for cell in line)
                lines.append(line)
            return lines


class BinaryProcessor(DataProcessor):
    """Processor for the binary data sets"""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_labels(self):
        """See base class."""
        return [str(x) for x in range(19)]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


def convert_example_to_feature(example_row, pad_token=0,
sequence_a_segment_id=0, sequence_b_segment_id=1,
cls_token_segment_id=1, pad_token_segment_id=0,
mask_padding_with_zero=True):
    example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id = example_row

    tokens_a = tokenizer.tokenize(example.text_a)

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids:   0   0   0   0  0     0   0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    if tokens_b:
        tokens += tokens_b + [sep_token]
        segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

    if cls_token_at_end:
        tokens = tokens + [cls_token]
        segment_ids = segment_ids + [cls_token_segment_id]
    else:
        tokens = [cls_token] + tokens
        segment_ids = [cls_token_segment_id] + segment_ids

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
        segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if output_mode == "classification":
        label_id = label_map[example.label]
    elif output_mode == "regression":
        label_id = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_id)
    

def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode,
                                 cls_token_at_end=False, pad_on_left=False,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=1, pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 process_count=cpu_count() - 2):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label : i for i, label in enumerate(label_list)}

    examples = [(example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id) for example in examples]

    with Pool(process_count) as p:
        features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=100), total=len(examples)))

    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


processors = {
    "multi-class": BinaryProcessor
}

output_modes = {
    "multi-class": "classification"
}

GLUE_TASKS_NUM_LABELS = {
    "multi-class": 19
}


Writing utils.py


In [0]:
!mkdir data
#!wget https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz -O data/data.tgz

In [0]:
#!tar -xvzf data/data.tgz -C data/
#!mv data/yelp_review_polarity_csv/* data/
#!rm -r data/yelp_review_polarity_csv/
#!rm data/data.tgz

In [0]:
import pandas as pd
from tqdm import tqdm_notebook


In [0]:
#read csv file from google drive
import pandas as pd
import numpy as np
import spacy
import en_core_web_sm
from sklearn.decomposition import PCA 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans


In [0]:
from google.colab import drive
drive.mount('/content/drive')
consumer_complaints = pd.read_csv("/content/drive/My Drive/Con_Complaints.csv") #header=None)


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
consumer = consumer_complaints 
from ast import literal_eval
df = consumer['tokenized']
a= []
for each in df: 
  a.append(literal_eval(each))
  
nlp = spacy.load("en_core_web_sm")
words = a

lengths = np.cumsum([0] + list(map(len, words)))
print(lengths)


flat_words = [item for sublist in words for item in sublist]
doc = spacy.tokens.Doc(nlp.vocab, words=flat_words)

lemmatized = []
# Iterate starting with 1
for index in range(1, len(lengths)):
    # Slice doc as described in the first point
    span = doc[lengths[index - 1] : lengths[index]]
    # Add lemmatized tokens as list to the list
    lemmatized.append([token.lemma_ for token in span])

In [0]:
consumer_complaints['Lemmatized'] = lemmatized

In [0]:
cc = consumer_complaints.sample(n=15000)
cc_train = cc

In [0]:
cc_train.head()

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Complaint,Response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Complaint_lower,Complaint_clean,Complaint_nostop,tokenized
185441,432328,02/05/2018,Debt collection,I do not know,Took or threatened to take negative or legal a...,Threatened or suggested your credit would be d...,I received a phone call from XXXX ( XXXX ) XXX...,Company believes the complaint is the result o...,Weinberg Mediation Group LLC,MO,633XX,,Consent provided,Web,02/05/2018,Closed with explanation,Yes,,2804704,i received a phone call from xxxx ( xxxx ) xxx...,i received a phone call from which went ...,received phone call went cell phone voicemail ...,"['received', 'phone', 'call', 'went', 'cell', ..."
128869,309017,07/20/2018,Mortgage,Conventional home mortgage,Applying for a mortgage or refinancing an exis...,,We applied for a loan with City Creek Mortgage...,,CITY CREEK MORTGAGE CORP.,UT,840XX,Servicemember,Consent provided,Web,08/16/2018,Closed with explanation,Yes,,2968234,we applied for a loan with city creek mortgage...,we applied for a loan with city creek mortgage...,applied loan city creek mortgage corp never re...,"['applied', 'loan', 'city', 'creek', 'mortgage..."
117896,284805,08/24/2018,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,XXXX lease was closed out 2015 A/c number XXXX...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,NJ,077XX,,Consent provided,Web,08/24/2018,Closed with explanation,Yes,,3001314,xxxx lease was closed out 2015 a/c number xxxx...,lease was closed out a c number experian has...,lease closed c number experian balance due mon...,"['lease', 'closed', 'c', 'number', 'experian',..."
437823,1007228,05/03/2015,Credit reporting,,Incorrect information on credit report,Account status,Experian continues to report a delinquency fro...,Company chooses not to provide a public response,Experian Information Solutions Inc.,TX,780XX,,Consent provided,Web,05/03/2015,Closed with explanation,Yes,Yes,1358118,experian continues to report a delinquency fro...,experian continues to report a delinquency fro...,experian continues report delinquency although...,"['experian', 'continues', 'report', 'delinquen..."
24416,78278,06/18/2019,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,"Hello, XXXX showed up on my credit report at a...",Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,322XX,,Consent provided,Web,06/18/2019,Closed with explanation,Yes,,3278286,"hello, xxxx showed up on my credit report at a...",hello showed up on my credit report at a time...,hello showed credit report time even country g...,"['hello', 'showed', 'credit', 'report', 'time'..."


In [0]:
cc_train = cc[["Product", "Complaint"]]
#cc_test = cc_test[["Product", "Complaint"]]
cc_train.to_csv("/content/data/cc_train.csv")
#cc_test.to_csv("/content/data/cc_test.csv")



In [0]:
#Small sample of entire data set - please disregard for run time of Roberta 
#cc_all= pd.read_csv("/content/data/cc_train.csv" ,header=None)
cc_get = cc
cc_get.to_csv("/content/data/cc_get.csv")



In [0]:
cc_train = pd.read_csv("/content/data/cc_train.csv" ,header=None)
#cc_test = pd.read_csv("/content/data/cc_test.csv" ,header=None)

In [0]:
cc_train = cc_train[[1,2]]
#cc_test = cc_test[[1,2]]

In [0]:
cc_train


Unnamed: 0,1,2
0,Product,Complaint
1,Debt collection,They are trying to get me to pay a debt I do n...
2,Credit card or prepaid card,XX/XX/18 - I received information regarding a ...
3,Debt collection,I opened a equity line of credit in XXXX but d...
4,Debt collection,I called a XXXX in which I had a credit card w...
...,...,...
14996,Credit reporting,Account is not mine. I disagree with the inves...
14997,"Payday loan, title loan, or personal loan",In the contract and statements it says after t...
14998,Mortgage,I was given Ocwen as a mortgage company and ha...
14999,Debt collection,XXXX XXXX XXXX has badgered my client about a ...


In [0]:
cc_train.shape

(15001, 2)

In [0]:

#cc_train[1] = (cc_train[1] == 2).astype(int)
#cc_test[1] = (cc_test[1] == 2).astype(int)


In [0]:
train_df = pd.DataFrame({
    'id':range(len(cc_train)),
    'label':cc_train[1],
    'alpha':['a']*cc_train.shape[0],
    'text': cc_train[2].replace(r'\n', ' ', regex=True)
})
train_df.head()

Unnamed: 0,id,label,alpha,text
0,0,Product,a,Complaint
1,1,Debt collection,a,They are trying to get me to pay a debt I do n...
2,2,Credit card or prepaid card,a,XX/XX/18 - I received information regarding a ...
3,3,Debt collection,a,I opened a equity line of credit in XXXX but d...
4,4,Debt collection,a,I called a XXXX in which I had a credit card w...


In [0]:
dev_df = pd.DataFrame({
    'id':range(len(cc_test)),
    'label':cc_test[1],
    'alpha':['a']*cc_test.shape[0],
    'text': cc_test[2].replace(r'\n', ' ', regex=True)
})

dev_df.head()

Unnamed: 0,id,label,alpha,text
0,0,Product,a,Complaint
1,1,Debt collection,a,my identity was stolen in 2017 and my credit h...
2,2,Consumer Loan,a,"Background : I filed Chapter XXXX BK in XXXX, ..."
3,3,Credit card or prepaid card,a,I have made payment every month well over my m...
4,4,"Credit reporting, credit repair services, or o...",a,I can not get my credit report from equifax


In [0]:
#train_df['label'] = train_df['label'].astype("category")
#cat_columns = train_df.select_dtypes(['category']).columns
#train_df[cat_columns] = train_df[cat_columns].apply(lambda x: x.cat.codes)

dev_df['label'] = dev_df['label'].astype("category")
cat_columns = dev_df.select_dtypes(['category']).columns
dev_df[cat_columns] = dev_df[cat_columns].apply(lambda x: x.cat.codes)

In [0]:
import numpy

In [0]:
train_df['label'] = train_df['label'].astype(numpy.int64)
dev_df['label'] = dev_df['label'].astype(numpy.int64)



In [0]:
train_df.to_csv("data/train.tsv", sep='\t', index=False, header=False, columns=train_df.columns)
#dev_df.to_csv('data/dev.tsv', sep='\t', index=False, header=False, columns=dev_df.columns)

#Data Prep ^

#Multi Class

#MultiClass ^

# ROBERTA

In [0]:
import pytorch_transformers

In [0]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4



In [0]:
## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [0]:
print(torch.cuda.is_available())

True


In [0]:
train_df.head()

Unnamed: 0,id,label,alpha,text
0,0,Product,a,Complaint
1,1,Debt collection,a,They are trying to get me to pay a debt I do n...
2,2,Credit card or prepaid card,a,XX/XX/18 - I received information regarding a ...
3,3,Debt collection,a,I opened a equity line of credit in XXXX but d...
4,4,Debt collection,a,I called a XXXX in which I had a credit card w...


In [0]:
train_df = train_df[['label','text']]
train_df.label.unique()

array(['Product', 'Debt collection', 'Credit card or prepaid card',
       'Credit card', 'Bank account or service',
       'Credit reporting, credit repair services, or other personal consumer reports',
       'Credit reporting', 'Student loan', 'Consumer Loan', 'Mortgage',
       'Payday loan, title loan, or personal loan',
       'Checking or savings account', 'Vehicle loan or lease',
       'Money transfer, virtual currency, or money service',
       'Payday loan', 'Money transfers', 'Prepaid card',
       'Other financial service'], dtype=object)

In [0]:
label_to_ix = {}
for label in train_df.label:
        if label not in label_to_ix:
            label_to_ix[label]=len(label_to_ix)
label_to_ix

{'Bank account or service': 4,
 'Checking or savings account': 11,
 'Consumer Loan': 8,
 'Credit card': 3,
 'Credit card or prepaid card': 2,
 'Credit reporting': 6,
 'Credit reporting, credit repair services, or other personal consumer reports': 5,
 'Debt collection': 1,
 'Money transfer, virtual currency, or money service': 13,
 'Money transfers': 15,
 'Mortgage': 9,
 'Other financial service': 17,
 'Payday loan': 14,
 'Payday loan, title loan, or personal loan': 10,
 'Prepaid card': 16,
 'Product': 0,
 'Student loan': 7,
 'Vehicle loan or lease': 12}

In [0]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

100%|██████████| 473/473 [00:00<00:00, 366846.49B/s]


{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 18,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [0]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

100%|██████████| 898823/898823 [00:00<00:00, 11255928.58B/s]
100%|██████████| 456318/456318 [00:00<00:00, 6916959.08B/s]


In [0]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [0]:
msg = "My dog is cute!"
prepare_features(msg)



(tensor([[    0,  1308,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [0]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        text = self.data.text[index]
        label = self.data.label[index]
        X, _  = prepare_features(text)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
train_size = 0.8
train_dataset=train_df.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=train_df.drop(train_dataset.index).reset_index(drop=True)

In [0]:
print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (15001, 2)
TRAIN Dataset: (12001, 2)
TEST Dataset: (3000, 2)


In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'drop_last': False,
          'num_workers': 1}

In [0]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [0]:
max_epochs = 2
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 2.8632616996765137. Accuracy: 20.066666666666666%
Iteration: 100. Loss: 1.7736173868179321. Accuracy: 27.6%
Iteration: 200. Loss: 2.2238197326660156. Accuracy: 27.6%
Iteration: 300. Loss: 3.974430799484253. Accuracy: 20.066666666666666%
Iteration: 400. Loss: 2.894787311553955. Accuracy: 23.866666666666667%
Iteration: 500. Loss: 2.4232242107391357. Accuracy: 27.5%
Iteration: 600. Loss: 2.4755992889404297. Accuracy: 20.033333333333335%
Iteration: 700. Loss: 2.3641247749328613. Accuracy: 20.1%
Iteration: 800. Loss: 0.9782440662384033. Accuracy: 27.5%
Iteration: 900. Loss: 1.568459153175354. Accuracy: 26.6%
Iteration: 1000. Loss: 1.4331138134002686. Accuracy: 24.766666666666666%
Iteration: 1100. Loss: 1.646653175354004. Accuracy: 21.933333333333334%
Iteration: 1200. Loss: 2.416796922683716. Accuracy: 25.366666666666667%
Iteration: 1300. Loss: 2.121382236480713. Accuracy: 21.933333333333334%
Iteration: 1400. Loss: 3.5293238162994385. Accuracy: 27.6333333333333

In [0]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction

In [0]:
get_reply("I noticed that I had an inquiry from this company on my credit report. I contacted the company and they couldn't not provide any information via phone, so I dated a letter to Transunion requesting the inquiry be removed or validation be provided. I received a letter from XXXX XXXX dated XX/XX/XXXX stating that they has validated the inquiry but DID NOT provide a contract, application nor a signature giving them ANY authorization to do a hard a inquiry on my credit on XX/XX/XXXX. I called XXXX XXXX  on XX/XX/XXXX and spoke with a representative  XXXX Employee ID XXXX and she confirmed that the inquiry was inaccurate and would be removed and I would receive a letter within 5-7 business days as they COULD NOT find an account, file not contact that I signed for credit and the inquiry would be removed within 30 days. That has yet to happen. On XX/XX/XXXX I sent a SECOND request to TransUnion and XXXX XXXX. I received yet another response from XXXX XXXX stating that I had applied for credit but they had failed to provide proof of such information. There wasn't a response from TransUnion.")

'Credit reporting, credit repair services, or other personal consumer reports'

In [0]:
get_reply("I have faxed several mortgage modifications with financials to my mortgage company Penney Mac. My income was reduced when I became XXXX, Penney Mac doesn't want to work with me to reduce my mortgage payment. My attorney has filed and injunction to stop them from foreclosing on my home. A foreclosure property sale is set for my property XX/XX/2018 and there is a restraining order in place to stop the property foreclosure sale. Penney Mac is discriminating against me and they refuse to respect consumer rights and federal guidelines")

'Mortgage'

In [0]:
get_reply("lost job late mortgage payments want work suitable solution bank bank sent packet completed returned approximately later received notice submission incomplete needed resubmit certain documents faxed docs number listed another weeks passed received document taped front door saying home scheduled sheriffs sale looking opportunity solve issue include selling home necessary need speak someone ability delay sale work resolution good faith thank")

'Mortgage'

In [0]:
get_reply("submitted dispute regarding old account appearing past statue limitations fcra law requires accounts past years deleted rpeoterd asked companies delete illegal accounts gotten response besides general working disputing account many months defeated getting bureaus ignoring prolonging removal account credit reports patient contacting attorney dispute goes ignored attached derogatory account ink question please see disputes response make sense accounts never indicated disputed plus days patiently waiting going please respond delete account immediately")

'Credit reporting, credit repair services, or other personal consumer reports'

In [0]:
get_reply("judgement made county court common pleas collection agency involved reimer law company working behalf amount stated original order currently stated wage garnishment order felt strange wage garnishment would take effect five years went court system looked found cfpb proposed consent judgement currently wages garnished garnishment decided three years prior cfpbs opinion would like matter looked outside source")

'Debt collection'

In [0]:
get_reply("Wells Fargo Bank is still trying to collected debt that pass the statute of limitation. Wells Fargo is putting false information on my credit report. I am requesting Fargo for name of the person who claim to open account face to face? I feel like Wells Fargo is being discriminated against me because I am XXXX XXXX.")

'Debt collection'

In [0]:
get_reply("I'm participating in the Public Debt Forgiveness Program and submitted my first certification form in XX/XX/XXXX. It certified that I've been working that the XXXX XXXX XXXX XXXX, an qualifying XXXX, since XX/XX/XXXX. My loan was transferred from XXXX to XXXX XXXX shortly thereafter. However, my first statement from XXXX indicates that Ive only made 3 qualifying payments toward the PDF program. Before working at this XXXX, I worked for the XXXX XXXX XXXX, and Im working on obtaining certification of that as well. That is a total of 7-8 years of public service. My account should certainly reflect more than 3 qualifying payments.")

'Student loan'

In [0]:
get_reply("Basically I was given a private student loan from Discover of around {$37000.00} for about XXXX semesters ( XXXX summer plus a full year ) of film school with the XXXX XXXX XXXX University. When it came time to get another loan for my second full year of XXXX, Discover informed me that they were no longer doing business with my school and that they would not be able to give me another loan. Because of this, my Dad had to go into debt by getting a parent plus loan for my second year and after that I was unable to continue attending the school. I was able to earn an XXXX degree but not the full XXXX degree. Now I am barely able to find work and I have had to get in even more debt pursuing other degrees to keep from paying back unimaginable loan debt. I do n't mind paying back my federal loans, but my private loan has now soared to more than {$53000.00} ( For ONE year of XXXX ) with current payments of {$570.00} a month. I 'm not sure who ( other than myself ) is more responsible for my current debt, Discover or the XXXX XXXX XXXX University but either way I was duped ( at age XXXX ) into an enormous amount of debt that will be impossible for me to pay off. My biggest concern with Discover is that had my Dad not taken out that additional loan, I would not have been able to do even more than one year of XXXX and how would I be able to pay back anyone without the benefit of an actual degree? This does not seem right and I would simply like a review of both my loan and my former school to determine if any fraud was applied to my situation. Thank you for taking the time to assist me with this matter.")

'Student loan'