In [1]:
import re
import numpy as np
import pandas as pd


import nltk
from nltk.corpus import words


nltk.download('words')
from sklearn.feature_extraction.text import CountVectorizer

from data_utils import DataUtils
from nlp_utils import NLPUtils

# Project helper instances. nlpUtils is reused by later cells
# (nlpUtils.tokenize in get_matrix, nlpUtils.create_vector_model further down).
dataUtils = DataUtils()
nlpUtils = NLPUtils()

# Load the disaster-messages CSV; the path is relative to the working directory.
dataset = pd.read_csv('data/disaster_messages.csv')

# Preview the first rows as the cell output.
dataset.head()

[nltk_data] Downloading package words to /home/mrugeles/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/mrugeles/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mrugeles/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mrugeles/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [2]:
# Top words
def get_matrix(data):
    """Build a token-count table for a collection of texts.

    A fresh ``CountVectorizer`` (tokenized with ``nlpUtils.tokenize``) is
    fitted on ``data`` on every call, so the resulting columns reflect only
    the vocabulary found in ``data`` itself.

    Parameters
    ----------
    data : iterable of str
        Documents to count; each document becomes one row.

    Returns
    -------
    pandas.DataFrame
        Dense count matrix with one row per document and one column per
        vocabulary entry learned from ``data``.
    """
    count_vect = CountVectorizer(tokenizer=nlpUtils.tokenize)
    vectorized = count_vect.fit_transform(data)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the replacement and fall back only on older installations.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out()
    else:
        feature_names = count_vect.get_feature_names()

    return pd.DataFrame(vectorized.toarray(), columns=feature_names)


# Toy two-document corpus used to eyeball the count matrix.
data = [
    'The house is white',
    'My car is red'
]

# Fit a vectorizer on the toy corpus and tabulate the counts.
matrix = get_matrix(data)

# Last expression: rendered as the cell output.
matrix

Unnamed: 0,car,house,red,white
0,0,1,0,1
1,1,0,1,0


In [3]:
# Single-document "query" corpus.
query = ['I have a blue house']

# get_matrix fits a new vectorizer on whatever it is given, so these columns
# come from the query alone, not from the corpus counted in the previous cell.
matrix_query = get_matrix(query)
matrix_query

Unnamed: 0,blue,house
0,1,1


In [5]:
from data_utils import DataUtils
from model_utils import ModelUtils
from nlp_utils import NLPUtils

# Same two sentences as the earlier toy corpus.
X = [
    'The house is white',
    'My car is red'
]

# create_vector_model is a project helper whose definition is not visible here.
# NOTE(review): the second argument is presumably the pickle path it writes to,
# which vectorize_query later loads -- confirm against nlp_utils.
# X is rebound from the list of sentences to the helper's return value.
X = nlpUtils.create_vector_model(X, 'count_vectorizer.p')
# The return value exposes toarray(); show the dense shape as the cell output.
X.toarray().shape

count vector features: 4
matrix.shape: (2, 4)
TfidfTransformer features: (2, 4)
Vectorizing time: 0.043463945388793945


(2, 4)

In [7]:
# Feature names stored on disk.
# NOTE(review): presumably written by the vectorizing step above -- confirm.
model_features = pd.read_csv('model_features.csv')
# Show any rows whose feature name was parsed as missing (NaN).
display(model_features.loc[model_features['feature'].isna()])
# Last expression: the full feature table as the cell output.
model_features

Unnamed: 0,feature


Unnamed: 0,feature
0,car
1,house
2,red
3,white


In [12]:
import pickle
from textblob import TextBlob
from scipy.sparse import csr_matrix


# Example query text; note this rebinds `query`, which an earlier cell bound to a list.
query = 'I have a blue house'

def vectorize_query(query):
    """Turn a free-text query into a vector in the stored model's feature space.

    Steps: spell-correct the text with TextBlob, count its tokens with
    ``get_matrix``, align the counts to the feature names listed in
    ``model_features.csv`` (tokens unknown to the model are dropped, model
    features absent from the query are zero-filled, columns are put in sorted
    order), then pass the aligned sparse matrix through the object pickled in
    ``count_vectorizer.p``.

    Parameters
    ----------
    query : str
        Raw query text.

    Returns
    -------
    The result of calling ``transform`` on the unpickled object with the
    aligned sparse count matrix.
    """
    query = TextBlob(query).correct().string
    matrix_query = get_matrix([query])

    # Kept from the original: renders the raw query counts in the cell output.
    display(matrix_query)

    # Read the names strictly as text. With pandas' default NA handling,
    # tokens such as "null" or "nan" are parsed as NaN, which cannot be matched
    # against the string column names below.
    model_features = pd.read_csv(
        'model_features.csv', dtype={'feature': str}, keep_default_na=False
    )['feature']

    # A single reindex drops unknown tokens, zero-fills missing features and
    # fixes the column order. Sorted order is what the original code produced.
    # NOTE(review): assumes the pickled object expects sorted feature order.
    feature_order = sorted(set(model_features))
    aligned = matrix_query.reindex(columns=feature_order, fill_value=0)

    # NOTE(review): pickle.load can execute arbitrary code; only load a file
    # this project produced itself.
    # The context manager closes the handle, which the original left open.
    with open('count_vectorizer.p', 'rb') as handle:
        vectorizer = pickle.load(handle)

    return vectorizer.transform(csr_matrix(aligned.values))
    
# Time a single call (IPython line magic) and keep the result in query_vector.
%time query_vector = vectorize_query(query)

Unnamed: 0,blue,house
0,1,1


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.31 ms


In [None]:
# Scratch check of set difference in both directions, the same operation
# vectorize_query uses to decide which features to add and which to drop.
set1 = set({'blue', 'car', 'house', 'country', 'document'})
set2 = set({'blue', 'river'})

only_in_set1 = set1 - set2
only_in_set2 = set2 - set1

print(only_in_set1)
print(only_in_set2)


In [13]:
# Hard-coded set of feature tokens pasted in for inspection.
# NOTE(review): this rebinds `model_features`, which an earlier cell bound to a
# DataFrame read from model_features.csv; from here on the name refers to this set.
model_features = {'come', 'level', 'diameter', 'mortar', 'record', 'cry', 'start', 'set', 'case', 'south', 'in', 'rout', 'tail', 'hungry', 'addition', 'mamma', 'salute', 'seasonal', 'link', 'spore', 'clean', 'sing', 'pop', 'facility', 'later', 'resistance', 'acute', 'large', 'governorate', 'of', 'life', 'cause', 'heavy', 'prevent', 'project', 'men', 'para', 'impact', 'daily', 'arm', 'metal', 'own', 'act', 'per', 'also', 'unit', 'algebra', 'privately', 'long', 'crop', 'amount', 'provide', 'productivity', 'another', 'country', 'often', 'association', 'la', 'soon', 'undertake', 'an', 'dis', 'detail', 'use', 'delay', 'fish', 'truncated', 'forget', 'hair', 'distribute', 'keep', 'lose', 'southwest', 'disrupt', 'field', 'ruby', 'household', 'station', 'remain', 'approximately', 'receive', 'ha', 'entire', 'detain', 'sorghum', 'medius', 'eradication', 'national', 'accept', 'bitch', 'bower', 'smell', 'news', 'age', 'possibly', 'type', 'part', 'mobility', 'maintenance', 'strength', 'damage', 'month', 'army', 'surge', 'no', 'able', 'leishmaniasis', 'global', 'tropical', 'vassal', 'road', 'invite', 'potential', 'replace', 'cash', 'float', 'hour', 'political', 'two', 'donate', 'happy', 'north', 'white', 'worry', 'prevention', 'rehydration', 'morning', 'accompany', 'de', 'song', 'relief', 'mission', 'clinically', 'share', 'maturity', 'bridge', 'predict', 'train', 'crisis', 'carry', 'order', 'specially', 'collapse', 'cell', 'pass', 'want', 'justice', 'family', 'thirsty', 'average', 'torrential', 'anyone', 'snub', 'survive', 'war', 'office', 'border', 'kit', 'people', 'chair', 'incident', 'seventh', 'grenade', 'well', 'consider', 'point', 'heart', 'health', 'information', 'naturally', 'rep', 'hospital', 'protect', 'market', 'vagina', 'eastern', 'civilian', 'ing', 'international', 'saw', 'parliamentary', 'breed', 'speed', 'disease', 'thunderstorm', 'city', 'premier', 'medial', 'briefly', 'cow', 'outbreak', 'alive', 'section', 'chain', 'county', 'day', 'home', 'look', 'harvest', 
'ache', 'agency', 'easily', 'death', 'digital', 'terrible', 'catastrophe', 'farm', 'care', 'considerably', 'roof', 'log', 'coast', 'folio', 'nothing', 'hazardous', 'galvanic', 'hay', 'observe', 'reportedly', 'mozambique', 'treat', 'inactive', 'fast', 'plan', 'pad', 'extreme', 'work', 'ginger', 'demonstrate', 'climate', 'take', 'tank', 'th', 'sustain', 'production', 'blanket', 'vaccine', 'dark', 'essential', 'berg', 'rash', 'put', 'rust', 'lone', 'role', 'phone', 'hand', 'length', 'across', 'government', 'chin', 'neither', 'make', 'problem', 'un', 'man', 'prince', 'recovery', 'vice', 'busy', 'china', 'federation', 'seed', 'as', 'editor', 'minimize', 'hate', 'sponsor', 'mother', 'mutiny', 'comfort', 'create', 'commune', 'wound', 'underground', 'price', 'word', 'furniture', 'pain', 'leg', 'bird', 'destroy', 'drug', 'crowd', 'i', 'express', 'contain', 'mine', 'labour', 'seedless', 'etiology', 'scheme', 'design', 'millions', 'afternoon', 'media', 'haunt', 'current', 'hundred', 'illegal', 'red', 'free', 'goods', 'agreement', 'erratic', 'beach', 'tell', 'decrease', 'emergency', 'port', 'better', 'back', 'mi', 'lot', 'board', 'arrive', 'sell', 'tireless', 'melon', 'yesterday', 'said', 'disaster', 'medical', 'convention', 'like', 'tackle', 'value', 'accord', 'year', 'locate', 'milder', 'patrol', 'increase', 'staff', 'supervision', 'southeast', 'first', 'exacerbate', 'short', 'fertile', 'debris', 'barry', 'hostel', 'resort', 'joint', 'party', 'add', 'ran', 'good', 'brave', 'spread', 'head', 'incidence', 'employ', 'structure', 'immigration', 'downy', 'rescue', 'flood', 'infrastructure', 'jam', 'force', 'mind', 'access', 'develop', 'st', 'underwater', 'gear', 'revenue', 'u', 'bottom', 'never', 'salt', 'exorbitant', 'yet', 'flash', 'baring', 'maybe', 'rain', 'indicate', 'everyone', 'know', 'straightforward', 'western', 'renovate', 'may', 'source', 'effective', 'include', 'oral', 'windfall', 'trend', 'peaceful', 'result', 'protein', 'construct', 'double', 'thank', 'energy', 
'promise', 'village', 'state', 'cycle', 'among', 'tornado', 'particularly', 'several', 'supply', 'for', 'allay', 'dictate', 'hurricane', 'our', 'advice', 'arrest', 'capability', 'difficult', 'last', 'shelter', 'center', 'leave', 'situation', 'mountainous', 'da', 'already', 'personnel', 'beer', 'risk', 'graphics', 'high', 'tore', 'coal', 'collect', 'moment', 'assess', 'bar', 'f', 'income', 'management', 'god', 'learn', 'area', 'irrigation', 'geophysics', 'thou', 'litigation', 'palace', 'house', 'livelihood', 'paralytic', 'program', 'recession', 'tumbler', 'violent', 'understand', 'august', 'charge', 'ravage', 'call', 'important', 'eye', 'go', 'transitional', 'right', 'medium', 'forecast', 'mode', 'daddy', 'violate', 'destiny', 'highly', 'meet', 'dehydration', 'czar', 'intense', 'cut', 'l', 'draughts', 'longer', 'message', 'dispatch', 'assistance', 'talk', 'volcanic', 'court', 'panic', 'big', 'tested', 'province', 'cross', 'streets', 'stool', 'channel', 'dura', 'happiness', 'deadly', 'post', 'dramatically', 'even', 'due', 'status', 'saline', 'term', 'violence', 'especially', 'open', 'public', 'wife', 'could', 'imagine', 'security', 'soap', 'spend', 'event', 'phase', 'search', 'deploy', 'ask', 'seal', 'live', 'school', 'help', 'historical', 'presidential', 'poor', 'police', 'expect', 'base', 'much', 'service', 'allegedly', 'garage', 'embankment', 'science', 'wind', 'container', 'possible', 'towards', 'widen', 'x', 'militant', 'report', 'ratify', 'surveillance', 'threaten', 'unable', 'however', 'flee', 'majority', 'plain', 'thatch', 'time', 'ay', 'chile', 'fusion', 'since', 'rehabilitation', 'counter', 'support', 'gold', 'thermal', 'tin', 'suddenly', 'place', 'organization', 'financial', 'sequential', 'fight', 'convert', 'love', 'partner', 'clear', 'medicine', 'responsibility', 'sot', 'maize', 'visceral', 'something', 'network', 'prosecute', 'estimate', 'sandy', 'volunteer', 'power', 'sleep', 'hit', 'march', 'one', 'officer', 'see', 'night', 'satisfaction', 'strong', 
'normally', 'miller', 'still', 'east', 'whose', 'major', 'strategy', 'zone', 'canada', 'cool', 'bouquet', 'independent', 'local', 'student', 'maximum', 'hurt', 'rue', 'sister', 'bed', 'launch', 'gulp', 'urge', 'abundance', 'was', 'fire', 'note', 'malady', 'ly', 'response', 'would', 'water', 'target', 'name', 'street', 'bijou', 'visit', 'powder', 'corn', 'landslide', 'growth', 'immediate', 'hard', 'going', 'play', 'storm', 'drive', 'did', 'stop', 'mud', 'currently', 'finish', 'fund', 'give', 'regional', 'round', 'season', 'environment', 'fully', 'cabman', 'my', 'committee', 'pa', 'although', 'drain', 'earthquake', 'end', 'together', 'data', 'quick', 'rice', 'plant', 're', 'mary', 'answer', 'kill', 'show', 'wheat', 'transport', 'delegation', 'peasant', 'deposit', 'minimum', 'directly', 'tent', 'group', 'malaise', 'close', 'council', 'available', 'tap', 'task', 'dedicate', 'please', 'beverage', 'break', 'younger', 'treatment', 'little', 'or', 'suffer', 'ensure', 'three', 'martin', 'operation', 'tongue', 'team', 'coupon', 'piece', 'livestock', 'sensitive', 'assist', 'many', 'establish', 'get', 'efficiency', 'food', 'logistician', 'enter', 'construction', 'capital', 'condition', 'four', 'new', 'milk', 'us', 'line', 'advance', 'economic', 'crater', 'beyond', 'recent', 'land', 'postal', 'added', 'faculty', 'nephew', 'child', 'near', 'ventilator', 'sheet', 'walk', 'publish', 'injectable', 'prepare', 'otherwise', 'serious', 'today', 'foot', 'na', 'pride', 'need', 'blow', 'cycling', 'say', 'coalition', 'calculate', 'clothe', 'world', 'dog', 'initial', 'injure', 'dust', 'fell', 'backwashing', 'river', 'prison', 'extremist', 'reporter', 'number', 'distribution', 'affect', 'colleague', 'cupcake'}
# Small example set of query tokens; not all of them appear in model_features above.
query_features = {'even', 'call', 'signal', 'body', 'earthquake'}

In [14]:
# Number of distinct tokens in the hard-coded model_features set.
len(model_features)

766