# Imports

In [1]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api
from gensim.models import word2vec

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

In [2]:
def binary_encoder(col_name, col):
    """Binary-encode the column ``col_name`` with category_encoders.

    Args:
        col_name: name of the column to encode.
        col: data handed to ``BinaryEncoder.fit_transform``
            (presumably a DataFrame containing ``col_name`` -- TODO confirm,
            no call site is visible in this notebook).

    Returns:
        The result of ``fit_transform``; ``drop_invariant=True`` is passed to
        the encoder.
    """
    return ce.BinaryEncoder(cols=[col_name], drop_invariant=True).fit_transform(col)

## text

###  tf-idf

In [3]:
def tf_idf_encoder(col_text):
    """Fit a unigram TF-IDF vectorizer on ``col_text``.

    Documents are tokenized with ``nltk.word_tokenize`` and IDF weighting is
    applied without smoothing.

    Args:
        col_text: iterable of documents (strings).

    Returns:
        tuple ``(weights, vocabulary)`` where ``weights`` is the dense
        document-term array and ``vocabulary`` maps each term to its column
        index in that array.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=nltk.word_tokenize,
        ngram_range=(1, 1),
        use_idf=True,
        smooth_idf=False,
    )
    # The sparse matrix is densified so callers can index it row by row.
    dense_weights = vectorizer.fit_transform(col_text).toarray()
    return dense_weights, vectorizer.vocabulary_

In [4]:
def calculate_values(col, tf_idf_array, vocabulary):
    """Summarise, per row, the TF-IDF weights of the words found in the vocabulary.

    Each row of ``col`` is split on whitespace; every word present in
    ``vocabulary`` contributes the weight stored at
    ``tf_idf_array[row][vocabulary[word]]`` (repeated words contribute once
    per occurrence).

    Args:
        col: pandas Series of strings, read positionally with ``.iloc``.
        tf_idf_array: row-indexable array of TF-IDF weights, one row per entry
            of ``col``.
        vocabulary: mapping from word to column index in ``tf_idf_array``.

    Returns:
        dict with keys 'sum', 'min', 'max', 'mean' and 'median'; each value is
        a list with one entry per row. Rows with no known word get 0 for every
        statistic.
    """
    stats = {'sum': [], 'min': [], 'max': [], 'mean': [], 'median': []}

    for row_idx in range(len(col)):
        weights = [
            tf_idf_array[row_idx][vocabulary[token]]
            for token in col.iloc[row_idx].split()
            if token in vocabulary
        ]

        if not weights:
            # No token of this row is in the vocabulary: every statistic is 0.
            for bucket in stats.values():
                bucket.append(0)
            continue

        stats['sum'].append(sum(weights))
        stats['min'].append(min(weights))
        stats['max'].append(max(weights))
        stats['mean'].append(mean(weights))
        stats['median'].append(median(weights))

    return stats

### word2vec

In [5]:
# Load the "word2vec-google-news-300" vectors through gensim's downloader API.
# The result is used below as a word -> vector lookup (`word in model`, `model[word]`).
model_google_news = gensim_api.load("word2vec-google-news-300")

In [6]:
def get_w2v_model(list_of_words):
    """Build and train a gensim Word2Vec model on ``list_of_words``.

    Args:
        list_of_words: iterable of sentences, each a list of string tokens.
            It is iterated twice (vocabulary pass, then training pass).

    Returns:
        The trained ``Word2Vec`` model. Tokens seen fewer than twice are
        dropped from the vocabulary (``min_count=2``).
    """
    w2v_model = word2vec.Word2Vec(min_count=2)
    w2v_model.build_vocab(list_of_words)
    w2v_model.train(
        list_of_words,
        epochs=w2v_model.epochs,
        total_examples=w2v_model.corpus_count,
    )
    return w2v_model

In [7]:
def cos(x, y):
    """Cosine similarity between two vectors.

    Args:
        x: 1-D array-like of numbers.
        y: 1-D array-like of numbers, same length as ``x``.

    Returns:
        ``dot(x, y) / (|x| * |y|)``. When either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of dividing by zero
        (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

In [8]:
def calculate_values_w2v(col, w2v):
    """Summarise the pairwise cosine similarities of each row's known words.

    Each row of ``col`` is split on whitespace, every word that has an
    embedding is looked up, and ``cos`` is evaluated on every unordered pair
    of those embeddings.

    Args:
        col: pandas Series of strings, read positionally with ``.iloc``.
        w2v: either a word -> vector lookup supporting ``in`` and ``[]``
            (e.g. gensim ``KeyedVectors`` or a dict), or a trained gensim
            ``Word2Vec`` model, whose vectors are read from its ``wv``
            attribute.

    Returns:
        dict with keys 'sum', 'min', 'max', 'mean' and 'median'; each value is
        a list with one entry per row. Rows with fewer than two known words
        have no pair to compare and get 0 for every statistic.
    """
    # `word in model` / `model[word]` on a Word2Vec model is deprecated in
    # gensim 3 and removed in gensim 4; the vectors live in `model.wv`.
    # Objects without a `wv` attribute are used as the lookup directly.
    vectors = getattr(w2v, 'wv', w2v)

    sum_values = []
    min_values = []
    max_values = []
    mean_values = []
    median_values = []

    for i in range(len(col)):
        embeddings = [vectors[word] for word in col.iloc[i].split() if word in vectors]
        count = len(embeddings)
        # Every unordered pair (j < k) of embeddings in this row.
        similarities = [
            cos(embeddings[j], embeddings[k])
            for j in range(count)
            for k in range(j + 1, count)
        ]

        if similarities:
            sum_values.append(sum(similarities))
            min_values.append(min(similarities))
            max_values.append(max(similarities))
            mean_values.append(mean(similarities))
            median_values.append(median(similarities))
        else:
            sum_values.append(0)
            min_values.append(0)
            max_values.append(0)
            mean_values.append(0)
            median_values.append(0)

    return {'sum': sum_values, 'min': min_values, 'max': max_values,
            'mean': mean_values, 'median': median_values}

In [9]:
def calculate_sum_of_norms(col, w2v, num_n_gram=1):
    """Per row, add up the Euclidean norms of the embeddings of its known n-grams.

    Each row of ``col`` is split on whitespace, turned into n-grams of
    ``num_n_gram`` tokens joined by a single space, and every n-gram that has
    an embedding contributes the norm of that embedding.

    Args:
        col: pandas Series of strings, read positionally with ``.iloc``.
        w2v: either a token -> vector lookup supporting ``in`` and ``[]``
            (e.g. gensim ``KeyedVectors`` or a dict), or a trained gensim
            ``Word2Vec`` model, whose vectors are read from its ``wv``
            attribute.
        num_n_gram: number of tokens per n-gram (default 1).

    Returns:
        dict with the single key 'sum', holding one value per row; rows with
        no known n-gram get 0.
    """
    # `token in model` / `model[token]` on a Word2Vec model is deprecated in
    # gensim 3 and removed in gensim 4; the vectors live in `model.wv`.
    # Objects without a `wv` attribute are used as the lookup directly.
    vectors = getattr(w2v, 'wv', w2v)

    sum_values = []
    for i in range(len(col)):
        tokens = [' '.join(gram) for gram in ngrams(col.iloc[i].split(), num_n_gram)]
        norms = [np.linalg.norm(vectors[token]) for token in tokens if token in vectors]
        # sum([]) is 0, which is the value used for rows without known n-grams.
        sum_values.append(sum(norms))

    return {'sum': sum_values}

In [10]:
def calculate_norm_of_sum(col, w2v, num_n_gram=1):
    """Per row, take the Euclidean norm of the sum of its known n-gram embeddings.

    Each row of ``col`` is split on whitespace, turned into n-grams of
    ``num_n_gram`` tokens joined by a single space, and the embeddings of the
    n-grams that have one are added element-wise before taking the norm.

    Args:
        col: pandas Series of strings, read positionally with ``.iloc``.
        w2v: either a token -> vector lookup supporting ``in`` and ``[]``
            (e.g. gensim ``KeyedVectors`` or a dict), or a trained gensim
            ``Word2Vec`` model, whose vectors are read from its ``wv``
            attribute.
        num_n_gram: number of tokens per n-gram (default 1).

    Returns:
        list with one norm per row; rows with no known n-gram get 0.0.
    """
    # `token in model` / `model[token]` on a Word2Vec model is deprecated in
    # gensim 3 and removed in gensim 4; the vectors live in `model.wv`.
    # Objects without a `wv` attribute are used as the lookup directly.
    vectors = getattr(w2v, 'wv', w2v)

    norms = []
    for i in range(len(col)):
        tokens = [' '.join(gram) for gram in ngrams(col.iloc[i].split(), num_n_gram)]
        embeddings = [vectors[token] for token in tokens if token in vectors]
        # np.sum over an empty list is 0.0, so a row without embeddings yields 0.0.
        norms.append(np.linalg.norm(np.sum(embeddings, axis=0)))

    return norms

## len

In [11]:
def calculate_len(col):
    """Return the length of every entry of ``col``, counting missing values as 0.

    Args:
        col: pandas Series (or other iterable) whose entries are strings or
            missing values (``None``, NaN, ``pd.NA``).

    Returns:
        list of int with one length per entry, in order.
    """
    # pd.isna recognises None, pd.NA and any NaN object. An identity check
    # against np.nan only matches that one object, so None or a NaN read from
    # a float column would reach len() and raise TypeError.
    return [0 if pd.isna(value) else len(value) for value in col]

## n_gram

In [12]:
def get_n_gram_list(col, num_n_gram=1):
    """Collect every n-gram of every row as its own single-element list.

    Each row of ``col`` is split on whitespace and turned into n-grams of
    ``num_n_gram`` tokens; the tokens of an n-gram are joined with one space.

    Args:
        col: pandas Series of strings, read positionally with ``.iloc``.
        num_n_gram: number of tokens per n-gram (default 1).

    Returns:
        Flat list, in row order, of one-element lists ``[ngram_string]``.
        NOTE(review): when this is fed to Word2Vec, every "sentence" therefore
        holds exactly one token -- confirm that is intended.
    """
    return [
        [' '.join(gram)]
        for row_idx in range(len(col))
        for gram in ngrams(col.iloc[row_idx].split(), num_n_gram)
    ]

# Train

In [122]:
# Read the training data (the file name suggests it is the cleaned set) from a
# path relative to the notebook's working directory.
train = pd.read_csv('train/train_limpio.csv')

## keyword

#### Sin stemming

In [123]:
# Per row: sum of the Google News embedding norms of the unstemmed keyword's tokens.
# NOTE(review): astype('str') turns a missing keyword into the literal token
# 'nan'; the frame displayed further down shows the same value on every row
# whose keyword is empty -- confirm this is intended.
column_values = calculate_sum_of_norms(
    train['keyword_sin_stemming'].astype('str'),
    model_google_news,
)
train['keyword_value_sin_stemming'] = list(column_values['sum'])

#### Con stemming

In [124]:
# Fit a Word2Vec model on the stemmed keywords (unigrams), then store per row
# the summed norms of the vectors that model holds for the row's tokens.
keyword_as_list = get_n_gram_list(train['keyword_con_stemming'].astype('str'))
model_keyword = get_w2v_model(keyword_as_list)

column_values = calculate_sum_of_norms(
    train['keyword_con_stemming'].astype('str'),
    model_keyword,
)
train["keyword_value_con_stemming"] = list(column_values['sum'])

  if word in w2v:
  array_values.append(w2v[word])


In [125]:
# Interaction features between the two keyword scores: product and absolute difference.
train['keyword_value_mult'] = train['keyword_value_sin_stemming'].mul(train['keyword_value_con_stemming'])
train['keyword_value_diff'] = train['keyword_value_sin_stemming'].sub(train['keyword_value_con_stemming']).abs()

In [126]:
# TF-IDF weights of the stemmed keyword, summarised per row.
tf_idf_array, vocabulary = tf_idf_encoder(train['keyword_con_stemming'].astype('str'))
column_values = calculate_values(train['keyword_con_stemming'].astype('str'), tf_idf_array, vocabulary)

# One column per statistic, added in this order.
for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['keyword_' + stat] = list(column_values[stat])

In [127]:
# Place the keyword length right after the stemmed keyword column.
# DataFrame.insert raises if 'len_keyword' already exists, so this cell cannot
# be re-run on the same frame.
pos_col_keyword = train.columns.get_loc('keyword_con_stemming') + 1
len_keyword = calculate_len(train['keyword_con_stemming'])
train.insert(pos_col_keyword, 'len_keyword', len_keyword)

## location

##### word2vec

In [128]:
# For 1-, 2- and 3-grams of the location: fit a Word2Vec model on the n-grams,
# then add the sum-of-norms and norm-of-sum features computed with that model.
# After the loop `model_location` is the model fitted for i == 3.
for i in range(1, 4):
    location_as_list = get_n_gram_list(train['location'].astype('str'), i)
    model_location = get_w2v_model(location_as_list)

    column_name = "location_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(train['location'].astype('str'), model_location, i)
    train[column_name] = list(column_values['sum'])

    column_name = "location_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(train['location'].astype('str'), model_location, i)
    train[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [129]:
# Re-fit the location model on unigrams: the loop above leaves `model_location`
# fitted on 3-grams, while the next cell looks up single words.
location_as_list = get_n_gram_list(train['location'].astype('str'), 1)
model_location = get_w2v_model(location_as_list)

In [130]:
# Pairwise cosine-similarity statistics between the words of each location.
column_values = calculate_values_w2v(train['location'].astype('str'), model_location)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['location_' + stat + '_w2v'] = list(column_values[stat])

  if word in w2v:
  array_values.append(w2v[word])


##### tf-idf

In [131]:
# TF-IDF weights of the location, summarised per row.
tf_idf_array, vocabulary = tf_idf_encoder(train['location'].astype('str'))
column_values = calculate_values(train['location'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['location_' + stat + '_tf-idf'] = list(column_values[stat])

In [132]:
# Place the location length right after the location column.
pos_col_location = train.columns.get_loc('location') + 1
len_location = calculate_len(train['location'])
train.insert(pos_col_location, 'len_location', len_location)

## text

#### tf-idf

##### Con stemming

In [133]:
# TF-IDF weights of the stemmed text, summarised per row.
tf_idf_array, vocabulary = tf_idf_encoder(train['text_con_stemming'])
column_values = calculate_values(train['text_con_stemming'], tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['text_' + stat + '_tf-idf'] = list(column_values[stat])

#### word2vec

##### Sin stemming

In [134]:
# Per row: sum of the Google News embedding norms of the unstemmed text's words.
column_values = calculate_sum_of_norms(
    train['text_sin_stemming'],
    model_google_news,
)
train['text_value'] = list(column_values['sum'])

In [135]:
# Per row: norm of the summed Google News embeddings of the unstemmed text's words.
column_values = calculate_norm_of_sum(
    train['text_sin_stemming'],
    model_google_news,
)
train['text_norm_value'] = list(column_values)

In [136]:
# Pairwise cosine-similarity statistics between the words of each unstemmed text.
column_values = calculate_values_w2v(train['text_sin_stemming'], model_google_news)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['text_' + stat + '_w2v'] = list(column_values[stat])

##### Con stemming

In [137]:
# For 1- to 4-grams of the stemmed text: fit a Word2Vec model on the n-grams,
# then add the sum-of-norms and norm-of-sum features computed with that model.
for i in range(1, 5):
    text_as_list = get_n_gram_list(train['text_con_stemming'].astype('str'), i)
    model_text = get_w2v_model(text_as_list)

    column_name = "text_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(train['text_con_stemming'].astype('str'), model_text, i)
    train[column_name] = list(column_values['sum'])

    column_name = "text_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(train['text_con_stemming'].astype('str'), model_text, i)
    train[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [138]:
# Interaction features between the Google News scores and the 1-gram scores
# from the model fitted on the stemmed text: product and absolute difference.
train['text_value_mult'] = train['text_value'].mul(train['text_value_1_gram'])
train['text_value_diff'] = train['text_value'].sub(train['text_value_1_gram']).abs()
train['text_norm_value_mult'] = train['text_norm_value'].mul(train['text_norm_value_1_gram'])
train['text_norm_value_diff'] = train['text_norm_value'].sub(train['text_norm_value_1_gram']).abs()

In [139]:
# Display the training frame with the feature columns added so far.
train

Unnamed: 0,location,len_location,keyword_sin_stemming,keyword_con_stemming,len_keyword,text_con_stemming,text_sin_stemming,target,keyword_value_sin_stemming,keyword_value_con_stemming,...,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff
0,,0,,,0,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1,3.267254,0.03008,...,0.057406,0.039129,0.000000,0.000000,0.000000,0.000000,6.328488,27.200895,1.034131,12.431300
1,,0,,,0,forest fire near la rong sask canada,forest fire near la ronge sask canada,1,3.267254,0.03008,...,0.058172,0.039828,0.000000,0.000000,0.000000,0.000000,2.183374,14.948923,0.482472,7.746799
2,,0,,,0,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1,3.267254,0.03008,...,0.058736,0.058736,0.000000,0.000000,0.000000,0.000000,15.902513,50.867533,2.458198,21.204076
3,,0,,,0,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1,3.267254,0.03008,...,0.058909,0.040829,0.000000,0.000000,0.000000,0.000000,4.867260,24.192476,0.804374,11.435785
4,,0,,,0,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1,3.267254,0.03008,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.930953,38.388048,1.239480,15.629609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,,0,,,0,two giant crane hold bridg colaps nearbi home ...,two giant cranes holding a bridge colapse into...,1,3.267254,0.03008,...,0.229595,0.081231,0.200066,0.076401,0.175942,0.074368,6.251264,23.519079,0.907812,10.779492
7609,,0,,,0,ariahrari thetawniest control wild fire califo...,ariahrary thetawniest the out of control wild ...,1,3.267254,0.03008,...,0.290137,0.085929,0.255415,0.090223,0.228933,0.070228,10.658496,33.534944,1.475837,15.424894
7610,,0,,,0,number number number number utc number km volc...,m number number number number utc numbe...,1,3.267254,0.03008,...,0.256325,0.112936,0.238182,0.086771,0.206868,0.077030,8.823204,30.397755,2.377698,15.240896
7611,,0,,,0,polic investig e bike colid car littl portug e...,police investigating after an e bike colided w...,1,3.267254,0.03008,...,0.462007,0.111862,0.433861,0.109249,0.406871,0.112182,25.300413,51.636186,2.709726,21.319579


### Movemos target al final

In [140]:
# Reorder the columns so that 'target' is the last one; no column is dropped.
cols = train.columns.tolist()
cols.remove('target')
cols.append('target')
train = train[cols]

## len_text

In [141]:
# Place the length of the stemmed text right after the stemmed text column.
pos_col_text = train.columns.get_loc('text_con_stemming') + 1
len_text = calculate_len(train['text_con_stemming'])
train.insert(pos_col_text, 'len_text', len_text)
train.head(10)

Unnamed: 0,location,len_location,keyword_sin_stemming,keyword_con_stemming,len_keyword,text_con_stemming,len_text,text_sin_stemming,keyword_value_sin_stemming,keyword_value_con_stemming,...,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff,target
0,,0,,,0,deed reason earthquak may alah forgiv us al,43,our deeds are the reason of this earthquake ma...,3.267254,0.03008,...,0.039129,0.0,0.0,0.0,0.0,6.328488,27.200895,1.034131,12.4313,1
1,,0,,,0,forest fire near la rong sask canada,36,forest fire near la ronge sask canada,3.267254,0.03008,...,0.039828,0.0,0.0,0.0,0.0,2.183374,14.948923,0.482472,7.746799,1
2,,0,,,0,al resid ask shelter place notifi offic evacu ...,72,al residents asked to shelter in place are bei...,3.267254,0.03008,...,0.058736,0.0,0.0,0.0,0.0,15.902513,50.867533,2.458198,21.204076,1
3,,0,,,0,number peopl receiv wildfir evacu order califo...,50,number people receive wildfires evacuation o...,3.267254,0.03008,...,0.040829,0.0,0.0,0.0,0.0,4.86726,24.192476,0.804374,11.435785,1
4,,0,,,0,got sent photo rubi alaska smoke wildfir pour ...,52,just got sent this photo from ruby alaska as s...,3.267254,0.03008,...,0.0,0.0,0.0,0.0,0.0,9.930953,38.388048,1.23948,15.629609,1
5,,0,,,0,rockyfir updat california hwi number close dir...,84,rockyfire update california hwy number clo...,3.267254,0.03008,...,0.048779,0.0,0.0,0.0,0.0,14.091701,36.97988,1.732342,15.621549,1
6,,0,,,0,flood disast heavi rain caus flash flood stree...,74,flood disaster heavy rain causes flash floodin...,3.267254,0.03008,...,0.056748,0.0,0.0,0.0,0.0,10.891867,34.133985,1.789968,16.191501,1
7,,0,,,0,top hil see fire wood,21,i am on top of the hil and i can see a fire in...,3.267254,0.03008,...,0.030275,0.0,0.0,0.0,0.0,4.093972,26.802238,0.806041,12.256102,1
8,,0,,,0,emerg evacu happen build across street,38,there is an emergency evacuation happening now...,3.267254,0.03008,...,0.028947,0.0,0.0,0.0,0.0,4.706527,27.632688,0.9612,13.417604,1
9,,0,,,0,afraid tornado come area,24,i am afraid that the tornado is coming to our ...,3.267254,0.03008,...,0.0,0.0,0.0,0.0,0.0,2.6973,22.948983,0.702365,11.044126,1


## len_text_original

In [142]:
# Read the training file whose 'text' column is used below to measure the
# original text lengths (the file name suggests it is the unprocessed data).
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [143]:
# Place the length of the original text right after the 'text' column.
# `len_text` is rebound here to the original-text lengths.
pos_col_text = train_original.columns.get_loc('text') + 1
len_text = calculate_len(train_original['text'])
train_original.insert(pos_col_text, 'len_text_original', len_text)
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## Juntamos todo

In [144]:
# Keep only the engineered columns: take out the raw text-like columns, and
# move 'target' to the end. list.remove raises ValueError if a name is absent.
cols = train.columns.tolist()
for dropped in ('location', 'text_con_stemming', 'text_sin_stemming',
                'keyword_con_stemming', 'keyword_sin_stemming', 'target'):
    cols.remove(dropped)
cols.append('target')
train = train[cols]

In [145]:
# Build the output frame and put the original text length next to 'len_text'.
# The Series from `train_original` is aligned on the index
# (assumes both frames share the same row index -- TODO confirm).
train_encoded = train[cols]

pos_col_text = train_encoded.columns.get_loc('len_text') + 1
train_encoded.insert(pos_col_text, 'len_text_original', train_original['len_text_original'])

In [146]:
# Absolute difference between original and stemmed text length, placed right
# after 'len_text_original'.
train_encoded.insert(
    pos_col_text + 1,
    'diff_len_text',
    train_encoded['len_text_original'].sub(train_encoded['len_text']).abs(),
)

## Resultado

In [147]:
# Display the final encoded training frame.
train_encoded

Unnamed: 0,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value_sin_stemming,keyword_value_con_stemming,keyword_value_mult,keyword_value_diff,keyword_sum,...,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff,target
0,0,0,43,69,26,3.267254,0.03008,0.098278,3.237175,1.0,...,0.039129,0.000000,0.000000,0.000000,0.000000,6.328488,27.200895,1.034131,12.431300,1
1,0,0,36,38,2,3.267254,0.03008,0.098278,3.237175,1.0,...,0.039828,0.000000,0.000000,0.000000,0.000000,2.183374,14.948923,0.482472,7.746799,1
2,0,0,72,133,61,3.267254,0.03008,0.098278,3.237175,1.0,...,0.058736,0.000000,0.000000,0.000000,0.000000,15.902513,50.867533,2.458198,21.204076,1
3,0,0,50,65,15,3.267254,0.03008,0.098278,3.237175,1.0,...,0.040829,0.000000,0.000000,0.000000,0.000000,4.867260,24.192476,0.804374,11.435785,1
4,0,0,52,88,36,3.267254,0.03008,0.098278,3.237175,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,9.930953,38.388048,1.239480,15.629609,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,3.267254,0.03008,0.098278,3.237175,1.0,...,0.081231,0.200066,0.076401,0.175942,0.074368,6.251264,23.519079,0.907812,10.779492,1
7609,0,0,82,125,43,3.267254,0.03008,0.098278,3.237175,1.0,...,0.085929,0.255415,0.090223,0.228933,0.070228,10.658496,33.534944,1.475837,15.424894,1
7610,0,0,60,65,5,3.267254,0.03008,0.098278,3.237175,1.0,...,0.112936,0.238182,0.086771,0.206868,0.077030,8.823204,30.397755,2.377698,15.240896,1
7611,0,0,96,137,41,3.267254,0.03008,0.098278,3.237175,1.0,...,0.111862,0.433861,0.109249,0.406871,0.112182,25.300413,51.636186,2.709726,21.319579,1


## Guardado del dataframe

In [148]:
# Write the encoded training frame to CSV without the row index.
train_encoded.to_csv('train/train_encoded.csv', index=False)

# Test

In [149]:
# Read the test data (the file name suggests it is the cleaned set) from a
# path relative to the notebook's working directory.
test = pd.read_csv('test/test_limpio.csv')

## keyword

#### Sin stemming

In [150]:
# Per row: sum of the Google News embedding norms of the unstemmed keyword's tokens.
# NOTE(review): astype('str') turns a missing keyword into the literal token 'nan'.
column_values = calculate_sum_of_norms(
    test['keyword_sin_stemming'].astype('str'),
    model_google_news,
)
test['keyword_value_sin_stemming'] = list(column_values['sum'])

#### Con stemming

In [151]:
# Fit a Word2Vec model on the stemmed test keywords (unigrams), then store per
# row the summed norms of the vectors that model holds for the row's tokens.
# NOTE(review): `model_keyword` is re-fitted on the test data here, so train
# and test scores come from different models -- confirm this is intended.
keyword_as_list = get_n_gram_list(test['keyword_con_stemming'].astype('str'), 1)
model_keyword = get_w2v_model(keyword_as_list)

column_values = calculate_sum_of_norms(
    test['keyword_con_stemming'].astype('str'),
    model_keyword,
    1,
)
test["keyword_value_con_stemming"] = list(column_values['sum'])

  if word in w2v:
  array_values.append(w2v[word])


In [152]:
# Interaction features between the two keyword scores: product and absolute difference.
test['keyword_value_mult'] = test['keyword_value_sin_stemming'].mul(test['keyword_value_con_stemming'])
test['keyword_value_diff'] = test['keyword_value_sin_stemming'].sub(test['keyword_value_con_stemming']).abs()

In [153]:
# TF-IDF weights of the stemmed keyword, summarised per row.
# NOTE(review): the vectorizer is fitted on the test data itself, not reused
# from the train section -- confirm this is intended.
tf_idf_array, vocabulary = tf_idf_encoder(test['keyword_con_stemming'].astype('str'))
column_values = calculate_values(test['keyword_con_stemming'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['keyword_' + stat] = list(column_values[stat])

In [154]:
# Place the keyword length right after the stemmed keyword column.
pos_col_keyword = test.columns.get_loc('keyword_con_stemming') + 1
len_keyword = calculate_len(test['keyword_con_stemming'])
test.insert(pos_col_keyword, 'len_keyword', len_keyword)

## location

##### word2vec

In [155]:
# For 1-, 2- and 3-grams of the location: fit a Word2Vec model on the test
# n-grams, then add the sum-of-norms and norm-of-sum features computed with it.
# After the loop `model_location` is the model fitted for i == 3.
for i in range(1, 4):
    location_as_list = get_n_gram_list(test['location'].astype('str'), i)
    model_location = get_w2v_model(location_as_list)

    column_name = "location_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(test['location'].astype('str'), model_location, i)
    test[column_name] = list(column_values['sum'])

    column_name = "location_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(test['location'].astype('str'), model_location, i)
    test[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [156]:
# Re-fit the location model on unigrams before computing the pairwise
# similarities, as the train section does. Without this, `model_location` is
# the 3-gram model left behind by the preceding loop: its tokens always contain
# spaces, so the single words looked up by calculate_values_w2v can never match
# and every statistic would silently be 0.
location_as_list = get_n_gram_list(test['location'].astype('str'), 1)
model_location = get_w2v_model(location_as_list)

# Pairwise cosine-similarity statistics between the words of each location.
column_values = calculate_values_w2v(test['location'].astype('str'), model_location)

test['location_sum_w2v'] = list(column_values['sum'])
test['location_min_w2v'] = list(column_values['min'])
test['location_max_w2v'] = list(column_values['max'])
test['location_mean_w2v'] = list(column_values['mean'])
test['location_median_w2v'] = list(column_values['median'])

  if word in w2v:


##### tf-idf

In [157]:
# TF-IDF weights of the location, summarised per row.
tf_idf_array, vocabulary = tf_idf_encoder(test['location'].astype('str'))
column_values = calculate_values(test['location'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['location_' + stat + '_tf-idf'] = list(column_values[stat])

In [158]:
# Place the location length right after the location column.
pos_col_location = test.columns.get_loc('location') + 1
len_location = calculate_len(test['location'])
test.insert(pos_col_location, 'len_location', len_location)

## text

#### tf-idf

##### Con stemming

In [159]:
# Force both text columns to strings so the `.split()` calls below cannot hit a
# non-string value. NOTE(review): a missing text becomes the literal 'nan'; the
# train section applies no such cast to its text columns -- confirm.
test['text_con_stemming'] = test['text_con_stemming'].astype('str')
test['text_sin_stemming'] = test['text_sin_stemming'].astype('str')

In [160]:
# TF-IDF weights of the stemmed text, summarised per row.
tf_idf_array, vocabulary = tf_idf_encoder(test['text_con_stemming'])
column_values = calculate_values(test['text_con_stemming'], tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['text_' + stat + '_tf-idf'] = list(column_values[stat])

#### word2vec

##### Sin stemming

In [161]:
# Per row: sum of the Google News embedding norms of the unstemmed text's words.
column_values = calculate_sum_of_norms(
    test['text_sin_stemming'],
    model_google_news,
)
test['text_value'] = list(column_values['sum'])

In [162]:
# Per row: norm of the summed Google News embeddings of the unstemmed text's words.
column_values = calculate_norm_of_sum(
    test['text_sin_stemming'],
    model_google_news,
)
test['text_norm_value'] = list(column_values)

In [163]:
# Pairwise cosine-similarity statistics between the words of each unstemmed text.
column_values = calculate_values_w2v(test['text_sin_stemming'], model_google_news)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['text_' + stat + '_w2v'] = list(column_values[stat])

##### Con stemming

In [164]:
# For 1- to 4-grams of the stemmed text: fit a Word2Vec model on the test
# n-grams, then add the sum-of-norms and norm-of-sum features computed with it.
for i in range(1, 5):
    text_as_list = get_n_gram_list(test['text_con_stemming'].astype('str'), i)
    model_text = get_w2v_model(text_as_list)

    column_name = "text_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(test['text_con_stemming'].astype('str'), model_text, i)
    test[column_name] = list(column_values['sum'])

    column_name = "text_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(test['text_con_stemming'].astype('str'), model_text, i)
    test[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [165]:
# Interaction features between the Google News scores and the 1-gram scores
# from the model fitted on the stemmed text: product and absolute difference.
test['text_value_mult'] = test['text_value'].mul(test['text_value_1_gram'])
test['text_value_diff'] = test['text_value'].sub(test['text_value_1_gram']).abs()
test['text_norm_value_mult'] = test['text_norm_value'].mul(test['text_norm_value_1_gram'])
test['text_norm_value_diff'] = test['text_norm_value'].sub(test['text_norm_value_1_gram']).abs()

## len_text

In [166]:
# Place the length of the stemmed text right after the stemmed text column.
pos_col_text = test.columns.get_loc('text_con_stemming') + 1
len_text = calculate_len(test['text_con_stemming'])
test.insert(pos_col_text, 'len_text', len_text)
test.head(10)

Unnamed: 0,id,location,len_location,keyword_sin_stemming,keyword_con_stemming,len_keyword,text_con_stemming,len_text,text_sin_stemming,keyword_value_sin_stemming,...,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff
0,0,,0,,,0,happen teribl car crash,23,just happened a terible car crash,3.267254,...,0.02789,0.02789,0.0,0.0,0.0,0.0,1.305677,11.145168,0.433205,7.178612
1,2,,0,,,0,heard earthquak differ citi stay safe everyon,45,heard about earthquake is different cities sta...,3.267254,...,0.060072,0.046347,0.027337,0.027337,0.0,0.0,4.531304,23.119374,0.751209,10.944429
2,3,,0,,,0,forest fire spot pond gees flee across street ...,53,there is a forest fire at spot pond geese are ...,3.267254,...,0.054311,0.037508,0.0,0.0,0.0,0.0,10.661704,45.098771,1.461376,17.442445
3,9,,0,,,0,apocalyps light spokan wildfir,30,apocalypse lighting spokane wildfires,3.267254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.961869,11.282421,0.32802,7.421708
4,11,,0,,,0,typhoon soudelor kil number china taiwan,40,typhoon soudelor kils number in china and ta...,3.267254,...,0.08669,0.05294,0.0,0.0,0.0,0.0,2.802071,15.976074,0.59375,8.324345
5,12,,0,,,0,shake earthquak,15,we are shaking it is an earthquake,3.267254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.474218,17.046467,0.244964,8.79217
6,21,,0,,,0,would probabl stil show life arsen yesterday e...,50,they would probably stil show more life than a...,3.267254,...,0.029976,0.029976,0.0,0.0,0.0,0.0,7.704609,29.44421,1.23131,14.676575
7,22,,0,,,0,hey,3,hey how are you,3.267254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.278601,9.27705,0.185952,6.182016
8,27,,0,,,0,nice hat,8,what a nice hat,3.267254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.441108,7.601308,0.199966,5.171657
9,29,,0,,,0,fuck,4,fuck off,3.267254,...,0.0,0.0,0.0,0.0,0.0,0.0,0.160437,5.178335,0.124176,4.000983


## len_text_original

In [167]:
# Read the test file whose 'text' column is used below to measure the original
# text lengths (the file name suggests it is the unprocessed data).
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


## Juntamos todo

In [168]:
# Keep only the engineered columns by taking out the raw text-like ones.
# list.remove raises ValueError if a name is absent.
cols = test.columns.tolist()
for dropped in ('location', 'text_con_stemming', 'text_sin_stemming',
                'keyword_con_stemming', 'keyword_sin_stemming'):
    cols.remove(dropped)
test_encoded = test[cols]

In [169]:
# Place the length of the original text right after the 'text' column.
# `len_text` is rebound here to the original-text lengths.
pos_col_text = test_original.columns.get_loc('text') + 1
len_text = calculate_len(test_original['text'])
test_original.insert(pos_col_text, 'len_text_original', len_text)
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [170]:
# Put the original text length next to 'len_text', followed by the absolute
# difference between the two lengths. The Series from `test_original` is
# aligned on the index (assumes both frames share the same row index -- TODO confirm).
pos_col_text = test_encoded.columns.get_loc('len_text') + 1
test_encoded.insert(pos_col_text, 'len_text_original', test_original['len_text_original'])

test_encoded.insert(
    pos_col_text + 1,
    'diff_len_text',
    test_encoded['len_text_original'].sub(test_encoded['len_text']).abs(),
)

## Resultado

In [171]:
# Display the final encoded test frame.
test_encoded

Unnamed: 0,id,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value_sin_stemming,keyword_value_con_stemming,keyword_value_mult,keyword_value_diff,...,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,text_value_mult,text_value_diff,text_norm_value_mult,text_norm_value_diff
0,0,0,0,23,34,11,3.267254,0.03008,0.098278,3.237175,...,0.027890,0.027890,0.000000,0.000000,0.000000,0.000000,1.305677,11.145168,0.433205,7.178612
1,2,0,0,45,64,19,3.267254,0.03008,0.098278,3.237175,...,0.060072,0.046347,0.027337,0.027337,0.000000,0.000000,4.531304,23.119374,0.751209,10.944429
2,3,0,0,53,96,43,3.267254,0.03008,0.098278,3.237175,...,0.054311,0.037508,0.000000,0.000000,0.000000,0.000000,10.661704,45.098771,1.461376,17.442445
3,9,0,0,30,40,10,3.267254,0.03008,0.098278,3.237175,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.961869,11.282421,0.328020,7.421708
4,11,0,0,40,45,5,3.267254,0.03008,0.098278,3.237175,...,0.086690,0.052940,0.000000,0.000000,0.000000,0.000000,2.802071,15.976074,0.593750,8.324345
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0,0,44,55,11,3.267254,0.03008,0.098278,3.237175,...,0.026343,0.026343,0.000000,0.000000,0.000000,0.000000,2.791938,19.243501,0.832196,10.592598
3259,10865,0,0,114,139,25,3.267254,0.03008,0.098278,3.237175,...,0.142852,0.066315,0.030143,0.030143,0.000000,0.000000,33.175360,59.897514,2.622387,23.467875
3260,10868,0,0,30,55,25,3.267254,0.03008,0.098278,3.237175,...,0.059477,0.039728,0.030895,0.030895,0.000000,0.000000,2.446438,16.745660,0.504244,8.611057
3261,10874,0,0,40,65,25,3.267254,0.03008,0.098278,3.237175,...,0.141421,0.070943,0.112221,0.059870,0.086311,0.045653,4.107835,20.084189,0.686354,9.431351


## Guardado del dataframe

In [172]:
# Write the encoded test frame to CSV without the row index.
test_encoded.to_csv('test/test_encoded.csv', index=False)