In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('https://raw.githubusercontent.com/olexandryermilov/deeploma/master/datasets/messages.csv')
data.head

<bound method NDFrame.head of                                message_text  message_type
0                            Hi, I am Sasha          fact
1             Where is the nearest library?      question
2                                   I am 21          fact
3                      She works in library          fact
4                  it is quite unbeliavable         trash
5             book me a table in restaurant       request
6   remind me to drink water in few minutes       request
7             how to get to the city centre      question
8                       i come from ukraine          fact
9                                  ho ho ho         trash
10                                    hello         trash
11                          my name is Kate          fact
12                             good morning         trash
13                          she is not here          fact
14                         my hair is white          fact
15                    when is your birthda

In [2]:
X_train = data['message_text'].values
Y_train = data['message_type'].values
Y_train

array([' fact', ' question', ' fact', ' fact', ' trash', ' request',
       ' request', ' question', ' fact', ' trash', ' trash', ' fact',
       ' trash', ' fact', ' fact', ' question', ' fact', ' fact',
       ' question', ' question', ' question', ' trash'], dtype=object)

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()# lowercase text
    text = re.sub(REPLACE_BY_SPACE_RE," ", text)# replace REPLACE_BY_SPACE_RE symbols by space in text
    text = re.sub(BAD_SYMBOLS_RE,"", text)# delete symbols which are in BAD_SYMBOLS_RE from text
    text = re.sub('  ', ' ', text)
    text = re.sub('  ', ' ', text)
    #text = ' '.join(list(filter(lambda x: x not in STOPWORDS, text.split(" ")))) # delete stopwords from text
    return text

In [0]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for x in text_prepare(text).split(" "):
      if(x in words_to_index):
        result_vector.itemset(words_to_index[x], 1)
    return result_vector

In [0]:
def countQuestions(text):
  x = 0
  for char in text:
    if char == '?':
      x = x + 1
  return x    

In [0]:
def count_wh_words(text):
  x = 0
  for word in text_prepare(text).split(' '):
    if(word == 'when' or word == 'who' or word == 'why' or word =='whom' or word == 'where'):
      x = x + 1
  return x     

In [0]:
words = [x for item in X_train for x in text_prepare(item).split(' ') ]
unique, counts = np.unique(words, return_counts=True)
words_counts= dict(zip(unique, counts))
DICT_SIZE = 5000
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {j:i for i,j in enumerate(words_counts)}
INDEX_TO_WORDS = {i:j for i,j in enumerate(words_counts)}
ALL_WORDS = WORDS_TO_INDEX.keys()

In [9]:
unique_t, counts_t = np.unique([x for x in Y_train], return_counts=True)
tags_counts = dict(zip(unique_t, counts_t))
tags_counts.keys()

dict_keys([' fact', ' question', ' request', ' trash'])

In [0]:
def prepare_text_for_model(text):
  bow = my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)
  questions = np.array([countQuestions(text)])
  wh_words = np.array([count_wh_words(text)])
  return sp_sparse.csr_matrix(np.concatenate((bow, np.concatenate((questions, wh_words)))))

In [11]:
from scipy import sparse as sp_sparse
X_train_mybag = sp_sparse.vstack([prepare_text_for_model(text) for text in X_train])
X_train_mybag.shape

X_train shape  (22, 5002)


In [0]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
TAGS_FOR_INDEX = {j:i for i,j in enumerate(tags_counts)}
INDEX_TO_TAGS = {i:j for i,j in enumerate(tags_counts)}
y_train = [TAGS_FOR_INDEX[x] for x in Y_train]

In [0]:
from sklearn.tree import DecisionTreeClassifier 
dtree_model = DecisionTreeClassifier(max_depth = 2).fit(X_train_mybag, y_train) 

In [14]:
INDEX_TO_TAGS[dtree_model.predict(prepare_text_for_model('Hi, I am 2304123'))[0]]

' fact'