# Imports

In [1]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api
from gensim.models import word2vec

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

In [2]:
def binary_encoder(col_name, col):
    """Binary-encode the column ``col_name`` of ``col``.

    A ``ce.BinaryEncoder`` limited to ``[col_name]`` (with
    ``drop_invariant=True``) is fitted on ``col`` and the transformed
    result of ``fit_transform`` is returned.
    """
    encoder = ce.BinaryEncoder(drop_invariant=True, cols=[col_name])
    encoded = encoder.fit_transform(col)
    return encoded

## text

###  tf-idf

In [3]:
def tf_idf_encoder(col_text):
    """Fit a unigram TF-IDF vectorizer on ``col_text``.

    Tokens come from ``nltk.word_tokenize``; IDF is used without smoothing.

    Returns
    -------
    tuple
        ``(tf_idf_array, vocabulary)``: the dense document-term matrix and
        the fitted vectorizer's ``vocabulary_`` mapping.
    """
    vectorizer = TfidfVectorizer(
        tokenizer=nltk.word_tokenize,
        ngram_range=(1, 1),
        use_idf=True,
        smooth_idf=False,
    )
    dense_matrix = vectorizer.fit_transform(col_text).toarray()
    return dense_matrix, vectorizer.vocabulary_

In [4]:
def calculate_values(col, tf_idf_array, vocabulary):
    """Summarise, per row, the TF-IDF weights of the row's known words.

    Row ``i`` of ``col`` is split on whitespace; every word found in
    ``vocabulary`` contributes ``tf_idf_array[i][vocabulary[word]]`` (a
    repeated word contributes once per occurrence).

    Returns
    -------
    dict
        Keys 'sum', 'min', 'max', 'mean' and 'median', each a list with one
        entry per row. A row without any known word gets 0 for every key.
    """
    reducers = {'sum': sum, 'min': min, 'max': max, 'mean': mean, 'median': median}
    stats = {key: [] for key in reducers}

    for row_idx, text in enumerate(col):
        weights = [
            tf_idf_array[row_idx][vocabulary[token]]
            for token in text.split()
            if token in vocabulary
        ]
        for key, reducer in reducers.items():
            stats[key].append(reducer(weights) if weights else 0)

    return stats

### word2vec

In [5]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API. NOTE(review): this presumably downloads the model on first
# use, which makes a fresh "Run All" slow — confirm and consider caching.
model_google_news = gensim_api.load("word2vec-google-news-300")

# =====================

In [None]:
# Scratch: toy Series whose two rows are lists of short strings.
aux = pd.Series([['hola que tal', 'como esta'], ['no sé qué decir', 'jejeje']])

In [None]:
# Scratch: whitespace-split the first string of the first row of `aux`.
# Requires `aux` to still be the Series; a later scratch cell rebinds it to a str.
a = aux.iloc[0][0].split()

In [None]:
#w = {'Hola que': 2, 'tal como': 3}

# Scratch: wrap every 1-gram of a toy sentence in its own single-item list.
aux = "Hola que tal como va"
n_grams = ngrams(aux.split(), 1)
c = [[' '.join(grams)] for grams in n_grams]

c

In [None]:
# Scratch: train a throwaway Word2Vec model on the toy corpus `c` and look at
# its vocabulary.
# NOTE(review): `model.wv.vocab` looks like the gensim 3.x API — confirm it
# exists in the installed gensim version.
model = word2vec.Word2Vec(min_count=1)
model.build_vocab(c)
model.train(c, total_examples=model.corpus_count, epochs=model.epochs)
model.wv.vocab

In [None]:
# Scratch: hand-built tokenised corpus; `a` is the token list produced by the
# earlier scratch cell that splits `aux.iloc[0][0]`.
b = [a, ['no', 'sé', 'qué', 'decir', 'jejeje', 'no']]
b

# =====================

In [6]:
def get_w2v_model(list_of_words):
    """Train a gensim Word2Vec model on ``list_of_words`` and return it.

    ``list_of_words`` is used as the corpus for both ``build_vocab`` and
    ``train``. The model is created with ``min_count=2``; no other
    constructor argument is passed.
    """
    w2v_model = word2vec.Word2Vec(min_count=2)
    w2v_model.build_vocab(list_of_words)
    w2v_model.train(
        list_of_words,
        epochs=w2v_model.epochs,
        total_examples=w2v_model.corpus_count,
    )
    return w2v_model

In [7]:
def cos(x, y):
    """Return the cosine similarity between vectors ``x`` and ``y``."""
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    dot_product = np.dot(x, y)
    return dot_product / norm_product

In [8]:
def calculate_values_w2v(col, w2v):
    """Summarise, per row, the pairwise cosine similarities of its word vectors.

    Each row of ``col`` is split on whitespace and every word known to
    ``w2v`` is looked up. The cosine similarity (``cos``) of every unordered
    pair of looked-up vectors is computed and reduced per row.

    Parameters
    ----------
    col : pandas.Series of str
        One text per row.
    w2v : gensim KeyedVectors or Word2Vec model
        Anything supporting ``word in w2v`` and ``w2v[word]``. When a full
        ``Word2Vec`` model is given, its ``.wv`` vectors are used.

    Returns
    -------
    dict
        Keys 'sum', 'min', 'max', 'mean' and 'median', each a list with one
        entry per row. Rows with fewer than two known words get 0 everywhere.
    """
    # Membership tests and item lookups directly on a Word2Vec model are
    # deprecated in gensim 3.x and removed in gensim 4; the vectors live in
    # ``model.wv``. KeyedVectors (e.g. the Google News model) pass through.
    vectors = w2v.wv if isinstance(w2v, word2vec.Word2Vec) else w2v

    reducers = {'sum': sum, 'min': min, 'max': max, 'mean': mean, 'median': median}
    stats = {key: [] for key in reducers}

    for i in range(len(col)):
        embeddings = [vectors[word] for word in col.iloc[i].split() if word in vectors]
        n_vectors = len(embeddings)
        # One similarity per unordered pair (j < k) of known words in the row.
        values = [
            cos(embeddings[j], embeddings[k])
            for j in range(n_vectors)
            for k in range(j + 1, n_vectors)
        ]
        for key, reducer in reducers.items():
            stats[key].append(reducer(values) if values else 0)

    return stats

In [9]:
def calculate_sum_of_norms(col, w2v, num_n_gram=1):
    """Sum, per row, the Euclidean norms of the vectors of its known n-grams.

    Each row of ``col`` is split on whitespace and turned into n-grams of
    ``num_n_gram`` words (joined with a single space). Every n-gram known to
    ``w2v`` contributes the norm of its vector.

    Parameters
    ----------
    col : pandas.Series of str
        One text per row.
    w2v : gensim KeyedVectors or Word2Vec model
        Anything supporting ``token in w2v`` and ``w2v[token]``. When a full
        ``Word2Vec`` model is given, its ``.wv`` vectors are used.
    num_n_gram : int, default 1
        Number of words per n-gram.

    Returns
    -------
    dict
        ``{'sum': [...]}`` with one value per row; 0 when no n-gram is known.
    """
    # Membership tests and item lookups directly on a Word2Vec model are
    # deprecated in gensim 3.x and removed in gensim 4; the vectors live in
    # ``model.wv``. KeyedVectors (e.g. the Google News model) pass through.
    vectors = w2v.wv if isinstance(w2v, word2vec.Word2Vec) else w2v

    sum_values = []
    for i in range(len(col)):
        tokens = [' '.join(grams) for grams in ngrams(col.iloc[i].split(), num_n_gram)]
        norms = [np.linalg.norm(vectors[token]) for token in tokens if token in vectors]
        # Builtin sum of an empty list is 0, matching the "no known n-gram" case.
        sum_values.append(sum(norms))

    return {'sum': sum_values}

In [10]:
def calculate_norm_of_sum(col, w2v, num_n_gram=1):
    """Return, per row, the Euclidean norm of the summed vectors of its n-grams.

    Each row of ``col`` is split on whitespace and turned into n-grams of
    ``num_n_gram`` words (joined with a single space). The vectors of all
    n-grams known to ``w2v`` are added together and the norm of that sum is
    recorded.

    Parameters
    ----------
    col : pandas.Series of str
        One text per row.
    w2v : gensim KeyedVectors or Word2Vec model
        Anything supporting ``token in w2v`` and ``w2v[token]``. When a full
        ``Word2Vec`` model is given, its ``.wv`` vectors are used.
    num_n_gram : int, default 1
        Number of words per n-gram.

    Returns
    -------
    list
        One norm per row (unlike ``calculate_sum_of_norms`` this is a plain
        list, not a dict).
    """
    # Membership tests and item lookups directly on a Word2Vec model are
    # deprecated in gensim 3.x and removed in gensim 4; the vectors live in
    # ``model.wv``. KeyedVectors (e.g. the Google News model) pass through.
    vectors = w2v.wv if isinstance(w2v, word2vec.Word2Vec) else w2v

    norms = []
    for i in range(len(col)):
        tokens = [' '.join(grams) for grams in ngrams(col.iloc[i].split(), num_n_gram)]
        array_values = [vectors[token] for token in tokens if token in vectors]
        # With no known n-gram, np.sum([], axis=0) is 0.0 and so is its norm.
        norms.append(np.linalg.norm(np.sum(array_values, axis=0)))

    return norms

## len

In [11]:
def calculate_len(col):
    """Return the length of every entry of ``col``, counting missing values as 0.

    Parameters
    ----------
    col : pandas.Series
        Entries are expected to be sized objects (typically str) or missing.

    Returns
    -------
    list of int
        ``len(value)`` per row, or 0 where the value is missing.
    """
    # Series.isna() recognises None and every NaN object, not only the
    # ``np.nan`` singleton that an ``is not np.nan`` identity check would match.
    return [0 if missing else len(value) for value, missing in zip(col, col.isna())]

## n_gram

In [12]:
def get_n_gram_list(col, num_n_gram=1):
    """Collect every n-gram of every row of ``col`` as a one-item list.

    Each row is split on whitespace, its n-grams of ``num_n_gram`` words are
    joined with a single space, and each joined n-gram is wrapped in its own
    list. The result is a flat list of those one-item lists, in row order.

    NOTE(review): when this is fed to Word2Vec, every "sentence" holds one
    token, so the model presumably never sees co-occurrence context —
    confirm this is intended.
    """
    return [
        [' '.join(gram)]
        for text in col
        for gram in ngrams(text.split(), num_n_gram)
    ]

# Train

In [33]:
# Load the cleaned training set (path is relative to the notebook's working directory).
train = pd.read_csv('train/train_limpio.csv')

## keyword

#### Sin stemming

In [34]:
# Sum of Google News vector norms for the words of the unstemmed keyword.
# NOTE(review): astype('str') turns a missing keyword into the literal string
# 'nan', which is then scored like a real word (rows with len_keyword == 0 show
# keyword_value == 3.267254). Consider fillna('') here AND in the matching
# test cell so both sets stay consistent.
column_values = calculate_sum_of_norms(
    train['keyword_sin_stemming'].astype('str'), model_google_news, num_n_gram=1
)

train['keyword_value'] = list(column_values['sum'])

#### Con stemming

In [35]:
# TF-IDF statistics (sum/min/max/mean/median) of the stemmed keyword.
tf_idf_array, vocabulary = tf_idf_encoder(train['keyword_con_stemming'].astype('str'))

column_values = calculate_values(train['keyword_con_stemming'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['keyword_' + stat] = list(column_values[stat])

In [36]:
# Length of each stemmed keyword, inserted right after its source column.
len_keyword = calculate_len(train['keyword_con_stemming'])

pos_col_keyword = 1 + train.columns.get_loc('keyword_con_stemming')
train.insert(pos_col_keyword, 'len_keyword', len_keyword)

## location

##### word2vec

In [37]:
# For n = 1..3: train a Word2Vec model on the location n-grams, then store the
# sum of vector norms and the norm of the vector sum for every row.
for i in range(1, 4):
    location_as_list = get_n_gram_list(train['location'].astype('str'), i)
    model_location = get_w2v_model(location_as_list)

    column_name = "location_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(train['location'].astype('str'), model_location, i)
    train[column_name] = list(column_values['sum'])

    column_name = "location_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(train['location'].astype('str'), model_location, i)
    train[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [38]:
# Retrain on single words (1-grams): after the loop above, `model_location`
# holds the model from the last iteration (3-grams).
location_as_list = get_n_gram_list(train['location'].astype('str'), 1)
model_location = get_w2v_model(location_as_list)

In [39]:
# Pairwise cosine-similarity statistics between the words of each location.
column_values = calculate_values_w2v(train['location'].astype('str'), model_location)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['location_' + stat + '_w2v'] = list(column_values[stat])

  if word in w2v:
  array_values.append(w2v[word])


##### tf-idf

In [40]:
# TF-IDF statistics (sum/min/max/mean/median) of the location text.
tf_idf_array, vocabulary = tf_idf_encoder(train['location'].astype('str'))

column_values = calculate_values(train['location'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['location_' + stat + '_tf-idf'] = list(column_values[stat])

In [41]:
# Length of each location string, inserted right after the location column.
len_location = calculate_len(train['location'])

pos_col_location = 1 + train.columns.get_loc('location')
train.insert(pos_col_location, 'len_location', len_location)

## text

#### tf-idf

##### Con stemming

In [42]:
# TF-IDF statistics (sum/min/max/mean/median) of the stemmed tweet text.
tf_idf_array, vocabulary = tf_idf_encoder(train['text_con_stemming'])

column_values = calculate_values(train['text_con_stemming'], tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['text_' + stat + '_tf-idf'] = list(column_values[stat])

#### word2vec

##### Sin stemming

In [43]:
# Sum of Google News vector norms over the words of the unstemmed text.
column_values = calculate_sum_of_norms(train['text_sin_stemming'], model_google_news, num_n_gram=1)

train['text_value'] = list(column_values['sum'])

In [44]:
# Norm of the summed Google News vectors of the unstemmed text.
column_values = calculate_norm_of_sum(train['text_sin_stemming'], model_google_news, num_n_gram=1)

train['text_norm_value'] = list(column_values)

In [45]:
# Pairwise cosine-similarity statistics between the words of the unstemmed text.
column_values = calculate_values_w2v(train['text_sin_stemming'], model_google_news)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    train['text_' + stat + '_w2v'] = list(column_values[stat])

##### Con stemming

In [46]:
# For n = 1..4: train a Word2Vec model on the stemmed-text n-grams, then store
# the sum of vector norms and the norm of the vector sum for every row.
for i in range(1, 5):
    text_as_list = get_n_gram_list(train['text_con_stemming'].astype('str'), i)
    model_text = get_w2v_model(text_as_list)

    column_name = "text_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(train['text_con_stemming'].astype('str'), model_text, i)
    train[column_name] = list(column_values['sum'])

    column_name = "text_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(train['text_con_stemming'].astype('str'), model_text, i)
    train[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [47]:
# Move 'target' to the last column.
cols = train.columns.tolist()
cols.append(cols.pop(cols.index('target')))
train = train[cols]

## len_text

In [48]:
# Length of each stemmed text, inserted right after its source column.
len_text = calculate_len(train['text_con_stemming'])

pos_col_text = 1 + train.columns.get_loc('text_con_stemming')
train.insert(pos_col_text, 'len_text', len_text)
train.head(10)

Unnamed: 0,location,len_location,keyword_sin_stemming,keyword_con_stemming,len_keyword,text_con_stemming,len_text,text_sin_stemming,keyword_value,keyword_sum,...,text_median_w2v,text_value_1_gram,text_norm_value_1_gram,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,target
0,,0,,,0,deed reason earthquak may alah forgiv us al,43,our deeds are the reason of this earthquake ma...,3.267254,1.0,...,0.121727,0.227191,0.077967,0.056394,0.040931,0.0,0.0,0.0,0.0,1
1,,0,,,0,forest fire near la rong sask canada,36,forest fire near la ronge sask canada,3.267254,1.0,...,0.055061,0.143954,0.061261,0.056181,0.042664,0.0,0.0,0.0,0.0,1
2,,0,,,0,al resid ask shelter place notifi offic evacu ...,72,al residents asked to shelter in place are bei...,3.267254,1.0,...,0.095317,0.313103,0.11757,0.055537,0.055537,0.0,0.0,0.0,0.0,1
3,,0,,,0,number peopl receiv wildfir evacu order califo...,50,number people receive wildfires evacuation o...,3.267254,1.0,...,0.082558,0.197538,0.078924,0.059248,0.043484,0.0,0.0,0.0,0.0,1
4,,0,,,0,got sent photo rubi alaska smoke wildfir pour ...,52,just got sent this photo from ruby alaska as s...,3.267254,1.0,...,0.086994,0.265268,0.083965,0.0,0.0,0.0,0.0,0.0,0.0,1
5,,0,,,0,rockyfir updat california hwi number close dir...,84,rockyfire update california hwy number clo...,3.267254,1.0,...,0.080587,0.375873,0.104406,0.087474,0.056522,0.0,0.0,0.0,0.0,1
6,,0,,,0,flood disast heavi rain caus flash flood stree...,74,flood disaster heavy rain causes flash floodin...,3.267254,1.0,...,0.109943,0.318629,0.104069,0.115915,0.060897,0.0,0.0,0.0,0.0,1
7,,0,,,0,top hil see fire wood,21,i am on top of the hil and i can see a fire in...,3.267254,1.0,...,0.112546,0.140337,0.058818,0.028337,0.028337,0.0,0.0,0.0,0.0,1
8,,0,,,0,emerg evacu happen build across street,38,there is an emergency evacuation happening now...,3.267254,1.0,...,0.158737,0.171633,0.07333,0.029165,0.029165,0.0,0.0,0.0,0.0,1
9,,0,,,0,afraid tornado come area,24,i am afraid that the tornado is coming to our ...,3.267254,1.0,...,0.136866,0.115179,0.054023,0.0,0.0,0.0,0.0,0.0,0.0,1


## len_text_original

In [49]:
# Load the original training set; only the length of its 'text' column is used below.
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [50]:
# Length of each original tweet, inserted right after the 'text' column.
len_text = calculate_len(train_original['text'])

pos_col_text = 1 + train_original.columns.get_loc('text')
train_original.insert(pos_col_text, 'len_text_original', len_text)
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## Juntamos todo

In [51]:
# Drop the raw text columns and keep 'target' as the last column.
cols = train.columns.tolist()
for dropped in ('location', 'text_con_stemming', 'text_sin_stemming',
                'keyword_con_stemming', 'keyword_sin_stemming', 'target'):
    cols.remove(dropped)
cols.append('target')
train = train[cols]

In [53]:
# Build the encoded frame and add the original-text length next to 'len_text'.
# The inserted Series comes from `train_original`; pandas aligns it on the index.
train_encoded = train[cols]

pos_col_text = 1 + train_encoded.columns.get_loc('len_text')
train_encoded.insert(pos_col_text, 'len_text_original', train_original['len_text_original'])

In [54]:
# Length difference between the original and the stemmed text.
train_encoded.insert(pos_col_text + 1, 'diff_len_text',
                     train_encoded['len_text_original'].sub(train_encoded['len_text']))

## Resultado

In [55]:
# Display the final encoded training frame.
train_encoded

Unnamed: 0,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_median_w2v,text_value_1_gram,text_norm_value_1_gram,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram,target
0,0,0,43,69,26,3.267254,1.0,1.0,1.0,1.0,...,0.121727,0.227191,0.077967,0.056394,0.040931,0.000000,0.000000,0.000000,0.000000,1
1,0,0,36,38,2,3.267254,1.0,1.0,1.0,1.0,...,0.055061,0.143954,0.061261,0.056181,0.042664,0.000000,0.000000,0.000000,0.000000,1
2,0,0,72,133,61,3.267254,1.0,1.0,1.0,1.0,...,0.095317,0.313103,0.117570,0.055537,0.055537,0.000000,0.000000,0.000000,0.000000,1
3,0,0,50,65,15,3.267254,1.0,1.0,1.0,1.0,...,0.082558,0.197538,0.078924,0.059248,0.043484,0.000000,0.000000,0.000000,0.000000,1
4,0,0,52,88,36,3.267254,1.0,1.0,1.0,1.0,...,0.086994,0.265268,0.083965,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,3.267254,1.0,1.0,1.0,1.0,...,0.090719,0.252617,0.087472,0.230477,0.095556,0.200416,0.077877,0.179269,0.062804,1
7609,0,0,82,125,43,3.267254,1.0,1.0,1.0,1.0,...,0.158652,0.317482,0.090036,0.290268,0.092334,0.254335,0.087856,0.226830,0.074986,1
7610,0,0,60,65,5,3.267254,1.0,1.0,1.0,1.0,...,0.046951,0.290424,0.150312,0.259849,0.107001,0.231129,0.087631,0.200353,0.075999,1
7611,0,0,96,137,41,3.267254,1.0,1.0,1.0,1.0,...,0.085732,0.489610,0.136034,0.463238,0.137259,0.440136,0.121416,0.403074,0.118529,1


## Guardado del dataframe

In [56]:
# Persist the encoded training set without writing the DataFrame index.
train_encoded.to_csv('train/train_encoded.csv', index=False)

# Test

In [57]:
# Load the cleaned test set (path is relative to the notebook's working directory).
# NOTE(review): every TF-IDF / Word2Vec model below is re-fitted on the test
# data rather than reusing the ones fitted on train — confirm this is intended.
test = pd.read_csv('test/test_limpio.csv')

## keyword

#### Sin stemming

In [58]:
# Sum of Google News vector norms for the words of the unstemmed keyword.
# NOTE(review): astype('str') turns a missing keyword into the literal string
# 'nan', which is then scored like a real word (rows with len_keyword == 0 show
# keyword_value == 3.267254). Consider fillna('') here AND in the matching
# train cell so both sets stay consistent.
column_values = calculate_sum_of_norms(
    test['keyword_sin_stemming'].astype('str'), model_google_news, num_n_gram=1
)
test['keyword_value'] = list(column_values['sum'])

#### Con stemming

In [59]:
# TF-IDF statistics (sum/min/max/mean/median) of the stemmed keyword.
tf_idf_array, vocabulary = tf_idf_encoder(test['keyword_con_stemming'].astype('str'))

column_values = calculate_values(test['keyword_con_stemming'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['keyword_' + stat] = list(column_values[stat])

In [60]:
# Length of each stemmed keyword, inserted right after its source column.
len_keyword = calculate_len(test['keyword_con_stemming'])

pos_col_keyword = 1 + test.columns.get_loc('keyword_con_stemming')
test.insert(pos_col_keyword, 'len_keyword', len_keyword)

## location

##### word2vec

In [61]:
# For n = 1..3: train a Word2Vec model on the location n-grams, then store the
# sum of vector norms and the norm of the vector sum for every row.
for i in range(1, 4):
    location_as_list = get_n_gram_list(test['location'].astype('str'), i)
    model_location = get_w2v_model(location_as_list)

    column_name = "location_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(test['location'].astype('str'), model_location, i)
    test[column_name] = list(column_values['sum'])

    column_name = "location_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(test['location'].astype('str'), model_location, i)
    test[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


In [62]:
# Retrain on single words (1-grams) before scoring, as the train section does.
# Without this, `model_location` is the 3-gram model left by the loop above,
# single words are never found in it, and every location_*_w2v value is 0.
location_as_list = get_n_gram_list(test['location'].astype('str'), 1)
model_location = get_w2v_model(location_as_list)

# Pairwise cosine-similarity statistics between the words of each location.
column_values = calculate_values_w2v(test['location'].astype('str'), model_location)

test['location_sum_w2v'] = [value for value in column_values['sum']]
test['location_min_w2v'] = [value for value in column_values['min']]
test['location_max_w2v'] = [value for value in column_values['max']]
test['location_mean_w2v'] = [value for value in column_values['mean']]
test['location_median_w2v'] = [value for value in column_values['median']]

  if word in w2v:


##### tf-idf

In [63]:
# TF-IDF statistics (sum/min/max/mean/median) of the location text.
tf_idf_array, vocabulary = tf_idf_encoder(test['location'].astype('str'))

column_values = calculate_values(test['location'].astype('str'), tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['location_' + stat + '_tf-idf'] = list(column_values[stat])

In [64]:
# Length of each location string, inserted right after the location column.
len_location = calculate_len(test['location'])

pos_col_location = 1 + test.columns.get_loc('location')
test.insert(pos_col_location, 'len_location', len_location)

## text

#### tf-idf

##### Con stemming

In [65]:
# Force both text columns to str so the text helpers can call .split() on every row.
for text_col in ('text_con_stemming', 'text_sin_stemming'):
    test[text_col] = test[text_col].astype('str')

In [66]:
# TF-IDF statistics (sum/min/max/mean/median) of the stemmed tweet text.
tf_idf_array, vocabulary = tf_idf_encoder(test['text_con_stemming'])

column_values = calculate_values(test['text_con_stemming'], tf_idf_array, vocabulary)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['text_' + stat + '_tf-idf'] = list(column_values[stat])

#### word2vec

##### Sin stemming

In [67]:
# Sum of Google News vector norms over the words of the unstemmed text.
column_values = calculate_sum_of_norms(test['text_sin_stemming'], model_google_news, num_n_gram=1)

test['text_value'] = list(column_values['sum'])

In [68]:
# Norm of the summed Google News vectors of the unstemmed text.
column_values = calculate_norm_of_sum(test['text_sin_stemming'], model_google_news, num_n_gram=1)

test['text_norm_value'] = list(column_values)

In [69]:
# Pairwise cosine-similarity statistics between the words of the unstemmed text.
column_values = calculate_values_w2v(test['text_sin_stemming'], model_google_news)

for stat in ('sum', 'min', 'max', 'mean', 'median'):
    test['text_' + stat + '_w2v'] = list(column_values[stat])

##### Con stemming

In [70]:
# For n = 1..4: train a Word2Vec model on the stemmed-text n-grams, then store
# the sum of vector norms and the norm of the vector sum for every row.
for i in range(1, 5):
    text_as_list = get_n_gram_list(test['text_con_stemming'].astype('str'), i)
    model_text = get_w2v_model(text_as_list)

    column_name = "text_value_{}_gram".format(i)
    column_values = calculate_sum_of_norms(test['text_con_stemming'].astype('str'), model_text, i)
    test[column_name] = list(column_values['sum'])

    column_name = "text_norm_value_{}_gram".format(i)
    column_values = calculate_norm_of_sum(test['text_con_stemming'].astype('str'), model_text, i)
    test[column_name] = list(column_values)

  if word in w2v:
  array_values.append(w2v[word])


## len_text

In [71]:
# Length of each stemmed text, inserted right after its source column.
len_text = calculate_len(test['text_con_stemming'])

pos_col_text = 1 + test.columns.get_loc('text_con_stemming')
test.insert(pos_col_text, 'len_text', len_text)
test.head(10)

Unnamed: 0,id,location,len_location,keyword_sin_stemming,keyword_con_stemming,len_keyword,text_con_stemming,len_text,text_sin_stemming,keyword_value,...,text_mean_w2v,text_median_w2v,text_value_1_gram,text_norm_value_1_gram,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram
0,0,,0,,,0,happen teribl car crash,23,just happened a terible car crash,3.267254,...,0.233879,0.217761,0.114147,0.049523,0.027622,0.027622,0.0,0.0,0.0,0.0
1,2,,0,,,0,heard earthquak differ citi stay safe everyon,45,heard about earthquake is different cities sta...,3.267254,...,0.127819,0.10381,0.211091,0.084403,0.053241,0.032063,0.028167,0.028167,0.0,0.0
2,3,,0,,,0,forest fire spot pond gees flee across street ...,53,there is a forest fire at spot pond geese are ...,3.267254,...,0.106425,0.095932,0.225314,0.06885,0.054941,0.042697,0.0,0.0,0.0,0.0
3,9,,0,,,0,apocalyps light spokan wildfir,30,apocalypse lighting spokane wildfires,3.267254,...,0.130179,0.148877,0.085165,0.051382,0.0,0.0,0.0,0.0,0.0,0.0
4,11,,0,,,0,typhoon soudelor kil number china taiwan,40,typhoon soudelor kils number in china and ta...,3.267254,...,0.071147,0.041079,0.170321,0.064146,0.084029,0.04668,0.0,0.0,0.0,0.0
5,12,,0,,,0,shake earthquak,15,we are shaking it is an earthquake,3.267254,...,0.155288,0.093425,0.028448,0.028448,0.0,0.0,0.0,0.0,0.0,0.0
6,21,,0,,,0,would probabl stil show life arsen yesterday e...,50,they would probably stil show more life than a...,3.267254,...,0.1853,0.137811,0.258151,0.0844,0.027384,0.027384,0.0,0.0,0.0,0.0
7,22,,0,,,0,hey,3,hey how are you,3.267254,...,0.257048,0.215696,0.028073,0.028073,0.0,0.0,0.0,0.0,0.0,0.0
8,27,,0,,,0,nice hat,8,what a nice hat,3.267254,...,0.183303,0.189516,0.057468,0.040742,0.0,0.0,0.0,0.0,0.0,0.0
9,29,,0,,,0,fuck,4,fuck off,3.267254,...,0.164787,0.164787,0.028153,0.028153,0.0,0.0,0.0,0.0,0.0,0.0


## len_text_original

In [72]:
# Load the original test set; only the length of its 'text' column is used below.
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


## Juntamos todo

In [73]:
# Drop the raw text columns; the remaining ones form the encoded test frame.
cols = test.columns.tolist()
for dropped in ('location', 'text_con_stemming', 'text_sin_stemming',
                'keyword_con_stemming', 'keyword_sin_stemming'):
    cols.remove(dropped)
test_encoded = test[cols]

In [74]:
# Length of each original tweet, inserted right after the 'text' column.
len_text = calculate_len(test_original['text'])

pos_col_text = 1 + test_original.columns.get_loc('text')
test_original.insert(pos_col_text, 'len_text_original', len_text)
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [75]:
# Add the original-text length next to 'len_text', then their difference.
# The inserted Series comes from `test_original`; pandas aligns it on the index.
pos_col_text = 1 + test_encoded.columns.get_loc('len_text')
test_encoded.insert(pos_col_text, 'len_text_original', test_original['len_text_original'])

test_encoded.insert(pos_col_text + 1, 'diff_len_text',
                    test_encoded['len_text_original'].sub(test_encoded['len_text']))

## Resultado

In [76]:
# Display the final encoded test frame.
test_encoded

Unnamed: 0,id,len_location,len_keyword,len_text,len_text_original,diff_len_text,keyword_value,keyword_sum,keyword_min,keyword_max,...,text_mean_w2v,text_median_w2v,text_value_1_gram,text_norm_value_1_gram,text_value_2_gram,text_norm_value_2_gram,text_value_3_gram,text_norm_value_3_gram,text_value_4_gram,text_norm_value_4_gram
0,0,0,0,23,34,11,3.267254,1.0,1.0,1.0,...,0.233879,0.217761,0.114147,0.049523,0.027622,0.027622,0.000000,0.000000,0.000000,0.000000
1,2,0,0,45,64,19,3.267254,1.0,1.0,1.0,...,0.127819,0.103810,0.211091,0.084403,0.053241,0.032063,0.028167,0.028167,0.000000,0.000000
2,3,0,0,53,96,43,3.267254,1.0,1.0,1.0,...,0.106425,0.095932,0.225314,0.068850,0.054941,0.042697,0.000000,0.000000,0.000000,0.000000
3,9,0,0,30,40,10,3.267254,1.0,1.0,1.0,...,0.130179,0.148877,0.085165,0.051382,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,11,0,0,40,45,5,3.267254,1.0,1.0,1.0,...,0.071147,0.041079,0.170321,0.064146,0.084029,0.046680,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0,0,44,55,11,3.267254,1.0,1.0,1.0,...,0.163409,0.117514,0.145139,0.068771,0.030796,0.030796,0.000000,0.000000,0.000000,0.000000
3259,10865,0,0,114,139,25,3.267254,1.0,1.0,1.0,...,0.129440,0.112207,0.533441,0.120481,0.145621,0.068241,0.028989,0.028989,0.000000,0.000000
3260,10868,0,0,30,55,25,3.267254,1.0,1.0,1.0,...,0.102561,0.098677,0.145668,0.061666,0.056492,0.040043,0.031140,0.031140,0.000000,0.000000
3261,10874,0,0,40,65,25,3.267254,1.0,1.0,1.0,...,0.082816,0.077447,0.207636,0.079030,0.142495,0.062806,0.116332,0.053774,0.082745,0.053332


## Guardado del dataframe

In [77]:
# Persist the encoded test set without writing the DataFrame index.
test_encoded.to_csv('test/test_encoded.csv', index=False)