# Imports

In [1]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

## keyword

In [2]:
def binary_encoder(col_name, col):
    """Binary-encode a single categorical column with ``category_encoders``.

    Parameters
    ----------
    col_name : label of the column to encode; forwarded as ``cols=[col_name]``.
    col : the data passed to ``fit_transform`` (must contain ``col_name``).

    Returns
    -------
    Whatever ``BinaryEncoder.fit_transform`` returns for ``col``.

    Notes
    -----
    ``drop_invariant=True`` is forwarded to the encoder; per the library docs
    this presumably removes constant output columns (TODO confirm).
    """
    encoder = ce.BinaryEncoder(drop_invariant=True, cols=[col_name])
    encoded = encoder.fit_transform(col)
    return encoded

## location

## text

### tf-idf

In [3]:
def tf_idf_encoder(col_text):
    """Fit a unigram TF-IDF vectorizer on ``col_text`` and return dense weights.

    The vectorizer tokenizes with ``nltk.word_tokenize`` and is configured
    with ``use_idf=True`` and ``smooth_idf=False``.

    Parameters
    ----------
    col_text : iterable of documents handed to ``fit_transform``.

    Returns
    -------
    tuple ``(tf_idf_array, vocabulary)`` where ``tf_idf_array`` is the dense
    result of ``.toarray()`` on the fitted matrix and ``vocabulary`` is the
    vectorizer's ``vocabulary_`` mapping (token -> column index).
    """
    vectorizer = TfidfVectorizer(
        tokenizer=nltk.word_tokenize,
        ngram_range=(1, 1),
        use_idf=True,
        smooth_idf=False,
    )
    dense_weights = vectorizer.fit_transform(col_text).toarray()
    return dense_weights, vectorizer.vocabulary_

In [4]:
def calculate_values(col, tf_idf_array, vocabulary):
    """Summarise, per row, the TF-IDF weights of the row's own tokens.

    Each text in ``col`` is split on whitespace; every token present in
    ``vocabulary`` contributes ``tf_idf_array[row][vocabulary[token]]``
    (repeated tokens contribute once per occurrence).

    Parameters
    ----------
    col : pandas Series of strings (accessed positionally through ``.iloc``).
    tf_idf_array : row-indexable 2-D container of weights, one row per text.
    vocabulary : mapping token -> column position in ``tf_idf_array``.

    Returns
    -------
    dict with keys ``'sum'``, ``'min'``, ``'max'``, ``'mean'``, ``'median'``;
    each value is a list with one entry per row. Rows without any known token
    get ``0`` for every statistic.
    """
    reducers = (('sum', sum), ('min', min), ('max', max),
                ('mean', mean), ('median', median))
    column_values = {label: [] for label, _ in reducers}

    for row_idx in range(len(col)):
        weights = [tf_idf_array[row_idx][vocabulary[token]]
                   for token in col.iloc[row_idx].split()
                   if token in vocabulary]
        for label, reduce_fn in reducers:
            # Empty rows fall back to 0 so the reducers never see an empty list.
            column_values[label].append(reduce_fn(weights) if weights else 0)

    return column_values

### word2vec

In [5]:
# Load the pretrained "word2vec-google-news-300" embeddings through gensim's
# downloader API. `w2v` is read as a global by calculate_values_w2v, which
# uses both the membership test (`word in w2v`) and the lookup (`w2v[word]`).
w2v = gensim_api.load("word2vec-google-news-300")

In [6]:
def get_w2v_model(col):
    """Train a 300-dimensional Word2Vec model on ``col``.

    ``col`` is passed straight to the ``Word2Vec`` constructor as the training
    corpus, with ``window=8``, ``min_count=1`` and 30 training iterations.

    NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword names;
    gensim 4 renamed them to ``vector_size`` and ``epochs`` - confirm the
    installed gensim version before relying on this helper. It is not called
    in the cells visible here.
    """
    word2vec_cls = gensim.models.word2vec.Word2Vec
    return word2vec_cls(col, size=300, window=8, min_count=1, iter=30)

In [7]:
def cos(x, y):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    x, y : array-likes of equal length.

    Returns
    -------
    ``dot(x, y) / (||x|| * ||y||)``. When either vector has zero norm the
    similarity is undefined; ``0.0`` is returned in that case instead of
    dividing by zero (which would yield ``nan`` plus a RuntimeWarning and
    poison downstream sum/min/max/mean aggregates).
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

In [8]:
def calculate_values_w2v(col):
    """Summarise, per row, the pairwise cosine similarities of its word vectors.

    Each text in ``col`` is split on whitespace and every token found in the
    global ``w2v`` is mapped to its vector. ``cos`` is then evaluated for every
    unordered pair of those vectors (first-to-last order). Repeated tokens are
    kept, so a word occurring twice is also paired with itself.

    Parameters
    ----------
    col : pandas Series of strings (accessed positionally through ``.iloc``).

    Returns
    -------
    dict with keys ``'sum'``, ``'min'``, ``'max'``, ``'mean'``, ``'median'``;
    each value is a list with one entry per row. Rows with fewer than two
    known tokens have no pairs and get ``0`` for every statistic.
    """
    reducers = (('sum', sum), ('min', min), ('max', max),
                ('mean', mean), ('median', median))
    column_values = {label: [] for label, _ in reducers}

    for row_idx in range(len(col)):
        vectors = [w2v[token]
                   for token in col.iloc[row_idx].split()
                   if token in w2v]
        similarities = [cos(first, second)
                        for pos, first in enumerate(vectors)
                        for second in vectors[pos + 1:]]
        for label, reduce_fn in reducers:
            column_values[label].append(reduce_fn(similarities) if similarities else 0)

    return column_values

# Train

In [40]:
# Load the training set. NOTE(review): "limpio" is Spanish for "clean", so this
# is presumably the already-cleaned version of the tweets - confirm upstream.
train = pd.read_csv('train/train_limpio.csv')

## keyword

In [41]:
# Fit TF-IDF on the keyword column. astype('str') turns missing keywords into
# the literal string 'nan', so they are vectorized like any other keyword.
tf_idf_array, vocabulary = tf_idf_encoder(train['keyword'].astype('str'))

In [42]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the keyword tokens.
column_values = calculate_values(train['keyword'].astype('str'), tf_idf_array, vocabulary)

In [43]:
# Attach the five per-row keyword statistics as new columns of train.
train['keyword_sum'] = list(column_values['sum'])
train['keyword_min'] = list(column_values['min'])
train['keyword_max'] = list(column_values['max'])
train['keyword_mean'] = list(column_values['mean'])
train['keyword_median'] = list(column_values['median'])

In [44]:
# Insert the keyword length immediately to the right of the 'keyword' column.
# .str.len() yields NaN for rows whose keyword is missing.
pos_col_keyword = 1 + train.columns.get_loc('keyword')
train.insert(pos_col_keyword, 'len_keyword', train['keyword'].str.len())

In [45]:
# Preview the first 10 rows with the keyword features added.
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median
0,1,,,,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,1.0,1.0
1,4,,,,forest fire near la ronge sask canada,1,1.0,1.0,1.0,1.0,1.0
2,5,,,,all residents asked to shelter in place are be...,1,1.0,1.0,1.0,1.0,1.0
3,6,,,,13000 people receive wildfires evacuation orde...,1,1.0,1.0,1.0,1.0,1.0
4,7,,,,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,1.0,1.0
5,8,,,,rockyfire update california hwy 20 closed in...,1,1.0,1.0,1.0,1.0,1.0
6,10,,,,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,1.0,1.0
7,13,,,,i am on top of the hill and i can see a fire i...,1,1.0,1.0,1.0,1.0,1.0
8,14,,,,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,1.0,1.0
9,15,,,,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,1.0,1.0


## location

In [46]:
# Fit TF-IDF on the location column. astype('str') turns missing locations
# into the literal string 'nan'. This overwrites the keyword tf_idf_array/vocabulary.
tf_idf_array, vocabulary = tf_idf_encoder(train['location'].astype('str'))

In [47]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the location tokens.
column_values = calculate_values(train['location'].astype('str'), tf_idf_array, vocabulary)

In [48]:
# Attach the five per-row location statistics as new columns of train.
train['location_sum'] = list(column_values['sum'])
train['location_min'] = list(column_values['min'])
train['location_max'] = list(column_values['max'])
train['location_mean'] = list(column_values['mean'])
train['location_median'] = list(column_values['median'])

In [49]:
# Insert the location length immediately to the right of the 'location' column.
# .str.len() yields NaN for rows whose location is missing.
pos_col_location = 1 + train.columns.get_loc('location')
train.insert(pos_col_location, 'len_location', train['location'].str.len())

In [50]:
# Preview the first 10 rows with the location features added.
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,location_min,location_max,location_mean,location_median
0,1,,,,,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,4,,,,,forest fire near la ronge sask canada,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,5,,,,,all residents asked to shelter in place are be...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,6,,,,,13000 people receive wildfires evacuation orde...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,7,,,,,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,8,,,,,rockyfire update california hwy 20 closed in...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,10,,,,,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,13,,,,,i am on top of the hill and i can see a fire i...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,14,,,,,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,15,,,,,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## text

### tf-idf

In [51]:
# Fit TF-IDF on the tweet text (no astype('str') here, so every value must already be a string).
tf_idf_array, vocabulary = tf_idf_encoder(train['text'])

In [52]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the text tokens.
column_values = calculate_values(train['text'], tf_idf_array, vocabulary)

In [53]:
# Attach the five per-row text TF-IDF statistics as new columns of train.
train['text_sum_tf-idf'] = list(column_values['sum'])
train['text_min_tf-idf'] = list(column_values['min'])
train['text_max_tf-idf'] = list(column_values['max'])
train['text_mean_tf-idf'] = list(column_values['mean'])
train['text_median_tf-idf'] = list(column_values['median'])

In [54]:
# Preview the first 10 rows with the text TF-IDF features added.
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,target,keyword_sum,keyword_min,keyword_max,...,location_sum,location_min,location_max,location_mean,location_median,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf
0,1,,,,,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.350943,0.099866,0.430347,0.257765,0.249176
1,4,,,,,forest fire near la ronge sask canada,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.555363,0.22782,0.504613,0.365052,0.347656
2,5,,,,,all residents asked to shelter in place are be...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,5.104839,0.074545,0.497279,0.232038,0.219919
3,6,,,,,13000 people receive wildfires evacuation orde...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.686376,0.12841,0.484991,0.335797,0.356367
4,7,,,,,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.973898,0.096273,0.390788,0.248369,0.241596
5,8,,,,,rockyfire update california hwy 20 closed in...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.820973,0.089499,0.363373,0.238811,0.245703
6,10,,,,,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.576452,0.096857,0.393245,0.255461,0.272769
7,13,,,,,i am on top of the hill and i can see a fire i...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.940281,0.128963,0.487079,0.246268,0.228644
8,14,,,,,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.542458,0.131362,0.399979,0.272497,0.25494
9,15,,,,,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.050753,0.130843,0.507948,0.277341,0.259876


### word2vec

In [55]:
# Per-row statistics of the pairwise cosine similarities between the word2vec
# vectors of each tweet's tokens (uses the global `w2v`).
column_values = calculate_values_w2v(train['text'])

In [56]:
# Sanity check: number of rows in train, to compare against the number of
# word2vec statistics computed above.
len(train)

7613

In [57]:
# Sanity check: one word2vec statistic per row is expected.
len(column_values['sum'])

7613

In [58]:
# Attach the five per-row word2vec similarity statistics as new columns of train.
train['text_sum_w2v'] = list(column_values['sum'])
train['text_min_w2v'] = list(column_values['min'])
train['text_max_w2v'] = list(column_values['max'])
train['text_mean_w2v'] = list(column_values['mean'])
train['text_median_w2v'] = list(column_values['median'])

In [83]:
train.drop(['keyword', 'location', 'text'], axis=1, inplace=True)
train

Unnamed: 0,id,target,text_sum,text_min,text_max,text_mean,text_median
0,1,1,1.100006,0.004267,0.261477,0.110001,0.108159
1,4,1,1.312549,-0.103024,0.308946,0.087503,0.093828
2,5,1,6.537887,-0.029605,1.000000,0.145286,0.099503
3,6,1,1.225229,0.001371,0.300471,0.122523,0.077621
4,7,1,2.055524,-0.059800,0.332519,0.097882,0.074613
...,...,...,...,...,...,...,...
7608,10869,1,0.893350,0.008029,0.207554,0.089335,0.059363
7609,10870,1,3.022382,0.000419,0.282508,0.107942,0.109766
7610,10871,1,10.551354,-0.082743,1.000000,0.293093,0.017382
7611,10872,1,13.362095,-0.060975,1.000000,0.146836,0.092740


In [86]:
cols = list(train.columns)
cols.remove('target')
cols.append('target')
train = train[cols]

In [87]:
train.to_csv('train/text_encoded_w2v.csv', index=False)

## len_text

In [59]:
# Insert the length of the cleaned text immediately to the right of 'text'
# (requires 'text' to still be a column of train), then preview the result.
pos_col_text = 1 + train.columns.get_loc('text')
train.insert(pos_col_text, 'len_text', train['text'].str.len())
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,target,keyword_sum,keyword_min,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,1,,,,,our deeds are the reason of this earthquake ma...,68,1,1.0,1.0,...,3.350943,0.099866,0.430347,0.257765,0.249176,11.683651,-0.018116,0.615265,0.177025,0.150866
1,4,,,,,forest fire near la ronge sask canada,38,1,1.0,1.0,...,2.555363,0.22782,0.504613,0.365052,0.347656,0.790154,-0.103024,0.308946,0.079015,0.055061
2,5,,,,,all residents asked to shelter in place are be...,131,1,1.0,1.0,...,5.104839,0.074545,0.497279,0.232038,0.219919,32.468316,-0.044882,1.0,0.154611,0.11367
3,6,,,,,13000 people receive wildfires evacuation orde...,63,1,1.0,1.0,...,2.686376,0.12841,0.484991,0.335797,0.356367,1.943116,-0.039545,0.317247,0.092529,0.078856
4,7,,,,,just got sent this photo from ruby alaska as s...,86,1,1.0,1.0,...,3.973898,0.096273,0.390788,0.248369,0.241596,13.153737,-0.041156,1.0,0.125274,0.086994
5,8,,,,,rockyfire update california hwy 20 closed in...,105,1,1.0,1.0,...,3.820973,0.089499,0.363373,0.238811,0.245703,6.677405,-0.093557,0.478663,0.101173,0.080587
6,10,,,,,flood disaster heavy rain causes flash floodin...,92,1,1.0,1.0,...,3.576452,0.096857,0.393245,0.255461,0.272769,9.341228,-0.047981,0.788929,0.141534,0.109943
7,13,,,,,i am on top of the hill and i can see a fire i...,60,1,1.0,1.0,...,3.940281,0.128963,0.487079,0.246268,0.228644,12.705849,-0.018808,1.0,0.162896,0.129083
8,14,,,,,there is an emergency evacuation happening now...,80,1,1.0,1.0,...,3.542458,0.131362,0.399979,0.272497,0.25494,15.609978,-0.033696,1.0,0.200128,0.158737
9,15,,,,,i am afraid that the tornado is coming to our ...,53,1,1.0,1.0,...,3.050753,0.130843,0.507948,0.277341,0.259876,7.238668,0.004347,0.526257,0.160859,0.136866


## len_text_original

In [60]:
# Load 'train/train_original.csv' (presumably the uncleaned tweets - TODO confirm)
# so the original text length can be compared with the cleaned one.
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [61]:
# Insert the length of the original text immediately to the right of 'text'
# in train_original, then preview the result.
pos_col_text = 1 + train_original.columns.get_loc('text')
train_original.insert(pos_col_text, 'len_text_original', train_original['text'].str.len())
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## join

In [62]:
# Remove the raw string columns in place; only their derived numeric features are kept.
train.drop(columns=['keyword', 'location', 'text'], inplace=True)

In [63]:
# Replace every remaining NaN with 0, in place.
# NOTE(review): presumably this targets len_keyword / len_location, which are
# NaN for rows without a keyword/location - confirm no other column relies on NaN.
train.fillna(0, inplace=True)

In [64]:
# Display train after dropping the raw columns and filling NaNs.
train

Unnamed: 0,id,len_keyword,len_location,len_text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,1,0.0,0.0,68,1,1.0,1.0,1.0,1.0,1.0,...,3.350943,0.099866,0.430347,0.257765,0.249176,11.683651,-0.018116,0.615265,0.177025,0.150866
1,4,0.0,0.0,38,1,1.0,1.0,1.0,1.0,1.0,...,2.555363,0.227820,0.504613,0.365052,0.347656,0.790154,-0.103024,0.308946,0.079015,0.055061
2,5,0.0,0.0,131,1,1.0,1.0,1.0,1.0,1.0,...,5.104839,0.074545,0.497279,0.232038,0.219919,32.468316,-0.044882,1.000000,0.154611,0.113670
3,6,0.0,0.0,63,1,1.0,1.0,1.0,1.0,1.0,...,2.686376,0.128410,0.484991,0.335797,0.356367,1.943116,-0.039545,0.317247,0.092529,0.078856
4,7,0.0,0.0,86,1,1.0,1.0,1.0,1.0,1.0,...,3.973898,0.096273,0.390788,0.248369,0.241596,13.153737,-0.041156,1.000000,0.125274,0.086994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,61,1,1.0,1.0,1.0,1.0,1.0,...,3.076768,0.128512,0.391212,0.307677,0.318035,4.244816,-0.026625,0.312744,0.117912,0.114414
7609,10870,0.0,0.0,122,1,1.0,1.0,1.0,1.0,1.0,...,4.762887,0.166141,0.352438,0.238144,0.229751,25.374036,-0.006511,1.000000,0.211450,0.158652
7610,10871,0.0,0.0,40,1,1.0,1.0,1.0,1.0,1.0,...,2.895685,0.117186,0.427936,0.321743,0.353433,0.795194,-0.016643,0.215985,0.132532,0.165923
7611,10872,0.0,0.0,137,1,1.0,1.0,1.0,1.0,1.0,...,4.898221,0.070977,0.495946,0.222646,0.194716,24.563957,-0.091922,1.000000,0.116971,0.085238


In [65]:
# Column order for the final frame: same as train, with 'target' moved to the end.
cols = list(train.columns)
cols.append(cols.pop(cols.index('target')))

In [66]:
# Reorder the columns (target last) into the final frame.
train_encoded = train[cols]

# Place the original text length right after 'len_text'. The Series is aligned
# on the row index, not on 'id'.
# NOTE(review): assumes train and train_original share the same row order/index - confirm.
pos_col_text = 1 + train_encoded.columns.get_loc('len_text')
train_encoded.insert(pos_col_text, 'len_text_original', train_original['len_text_original'])

In [67]:
# Display the frame with len_text_original added.
train_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,1,0.0,0.0,68,69,1.0,1.0,1.0,1.0,1.0,...,0.099866,0.430347,0.257765,0.249176,11.683651,-0.018116,0.615265,0.177025,0.150866,1
1,4,0.0,0.0,38,38,1.0,1.0,1.0,1.0,1.0,...,0.227820,0.504613,0.365052,0.347656,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,5,0.0,0.0,131,133,1.0,1.0,1.0,1.0,1.0,...,0.074545,0.497279,0.232038,0.219919,32.468316,-0.044882,1.000000,0.154611,0.113670,1
3,6,0.0,0.0,63,65,1.0,1.0,1.0,1.0,1.0,...,0.128410,0.484991,0.335797,0.356367,1.943116,-0.039545,0.317247,0.092529,0.078856,1
4,7,0.0,0.0,86,88,1.0,1.0,1.0,1.0,1.0,...,0.096273,0.390788,0.248369,0.241596,13.153737,-0.041156,1.000000,0.125274,0.086994,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,61,83,1.0,1.0,1.0,1.0,1.0,...,0.128512,0.391212,0.307677,0.318035,4.244816,-0.026625,0.312744,0.117912,0.114414,1
7609,10870,0.0,0.0,122,125,1.0,1.0,1.0,1.0,1.0,...,0.166141,0.352438,0.238144,0.229751,25.374036,-0.006511,1.000000,0.211450,0.158652,1
7610,10871,0.0,0.0,40,65,1.0,1.0,1.0,1.0,1.0,...,0.117186,0.427936,0.321743,0.353433,0.795194,-0.016643,0.215985,0.132532,0.165923,1
7611,10872,0.0,0.0,137,137,1.0,1.0,1.0,1.0,1.0,...,0.070977,0.495946,0.222646,0.194716,24.563957,-0.091922,1.000000,0.116971,0.085238,1


In [68]:
# Number of characters removed by cleaning (original length minus cleaned length),
# inserted right after 'len_text_original'.
train_encoded.insert(
    loc=pos_col_text + 1,
    column='diff_len_text',
    value=train_encoded['len_text_original'].sub(train_encoded['len_text']),
)

## Resultado

In [69]:
# Final training feature frame.
train_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,1,0.0,0.0,68,69,1,1.0,1.0,1.0,1.0,...,0.099866,0.430347,0.257765,0.249176,11.683651,-0.018116,0.615265,0.177025,0.150866,1
1,4,0.0,0.0,38,38,0,1.0,1.0,1.0,1.0,...,0.227820,0.504613,0.365052,0.347656,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,5,0.0,0.0,131,133,2,1.0,1.0,1.0,1.0,...,0.074545,0.497279,0.232038,0.219919,32.468316,-0.044882,1.000000,0.154611,0.113670,1
3,6,0.0,0.0,63,65,2,1.0,1.0,1.0,1.0,...,0.128410,0.484991,0.335797,0.356367,1.943116,-0.039545,0.317247,0.092529,0.078856,1
4,7,0.0,0.0,86,88,2,1.0,1.0,1.0,1.0,...,0.096273,0.390788,0.248369,0.241596,13.153737,-0.041156,1.000000,0.125274,0.086994,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,61,83,22,1.0,1.0,1.0,1.0,...,0.128512,0.391212,0.307677,0.318035,4.244816,-0.026625,0.312744,0.117912,0.114414,1
7609,10870,0.0,0.0,122,125,3,1.0,1.0,1.0,1.0,...,0.166141,0.352438,0.238144,0.229751,25.374036,-0.006511,1.000000,0.211450,0.158652,1
7610,10871,0.0,0.0,40,65,25,1.0,1.0,1.0,1.0,...,0.117186,0.427936,0.321743,0.353433,0.795194,-0.016643,0.215985,0.132532,0.165923,1
7611,10872,0.0,0.0,137,137,0,1.0,1.0,1.0,1.0,...,0.070977,0.495946,0.222646,0.194716,24.563957,-0.091922,1.000000,0.116971,0.085238,1


## Guardado del dataframe

In [70]:
# Persist the encoded training features; index=False keeps the row index out of the CSV.
train_encoded.to_csv('train/train_encoded.csv', index=False)

# Test

In [71]:
# Load the test set. NOTE(review): "limpio" is Spanish for "clean", so this is
# presumably the already-cleaned version of the tweets - confirm upstream.
test = pd.read_csv('test/test_limpio.csv')

## keyword

In [72]:
# Fit TF-IDF on the test keyword column (missing keywords become the string 'nan').
# NOTE(review): tf_idf_encoder calls fit_transform, so the vocabulary and IDF
# weights are re-learned on the test data rather than reused from train -
# confirm this is intended, since train/test features are then not on the same scale.
tf_idf_array, vocabulary = tf_idf_encoder(test['keyword'].astype('str'))

In [73]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the keyword tokens.
column_values = calculate_values(test['keyword'].astype('str'), tf_idf_array, vocabulary)

In [74]:
# Attach the five per-row keyword statistics as new columns of test.
test['keyword_sum'] = list(column_values['sum'])
test['keyword_min'] = list(column_values['min'])
test['keyword_max'] = list(column_values['max'])
test['keyword_mean'] = list(column_values['mean'])
test['keyword_median'] = list(column_values['median'])

In [75]:
# Insert the keyword length immediately to the right of the 'keyword' column.
# .str.len() yields NaN for rows whose keyword is missing.
pos_col_keyword = 1 + test.columns.get_loc('keyword')
test.insert(pos_col_keyword, 'len_keyword', test['keyword'].str.len())

## location

In [76]:
# Fit TF-IDF on the test location column (missing locations become the string 'nan').
# NOTE(review): re-fitted on test data, not reused from train - confirm intended.
tf_idf_array, vocabulary = tf_idf_encoder(test['location'].astype('str'))

In [77]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the location tokens.
column_values = calculate_values(test['location'].astype('str'), tf_idf_array, vocabulary)

In [78]:
# Attach the five per-row location statistics as new columns of test.
test['location_sum'] = list(column_values['sum'])
test['location_min'] = list(column_values['min'])
test['location_max'] = list(column_values['max'])
test['location_mean'] = list(column_values['mean'])
test['location_median'] = list(column_values['median'])

In [79]:
# Insert the location length immediately to the right of the 'location' column.
# .str.len() yields NaN for rows whose location is missing.
pos_col_location = 1 + test.columns.get_loc('location')
test.insert(pos_col_location, 'len_location', test['location'].str.len())

## text

### tf-idf

In [80]:
# Force the text column to str so the string operations in the next cells work on every row.
# NOTE(review): presumably some test texts are not strings (e.g. missing) - confirm.
test['text'] = test['text'].astype('str')

In [81]:
# Inspect the text column after the cast.
test.text

0                      just happened a terrible car crash
1       heard about earthquake is different cities sta...
2       there is a forest fire at spot pond geese are ...
3                  apocalypse lighting  spokane wildfires
4           typhoon soudelor kills 28 in china and taiwan
                              ...                        
3258    earthquake safety los angeles   safety fastene...
3259    storm in ri worse than last hurricane  my city...
3260                    green line derailment in chicago 
3261            meg issues hazardous weather outlook hwo 
3262    cityofcalgary has activated its municipal emer...
Name: text, Length: 3263, dtype: object

In [82]:
# Fit TF-IDF on the test tweet text.
# NOTE(review): re-fitted on test data, not reused from train - confirm intended.
tf_idf_array, vocabulary = tf_idf_encoder(test['text'])

In [83]:
# Per-row sum/min/max/mean/median of the TF-IDF weights of the text tokens.
column_values = calculate_values(test['text'], tf_idf_array, vocabulary)

In [84]:
# Attach the five per-row text TF-IDF statistics as new columns of test.
test['text_sum_tf-idf'] = list(column_values['sum'])
test['text_min_tf-idf'] = list(column_values['min'])
test['text_max_tf-idf'] = list(column_values['max'])
test['text_mean_tf-idf'] = list(column_values['mean'])
test['text_median_tf-idf'] = list(column_values['median'])

In [85]:
# Display test with the keyword, location and text TF-IDF features added.
test

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,location_sum,location_min,location_max,location_mean,location_median,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf
0,0,,,,,just happened a terrible car crash,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.350852,0.188729,0.539327,0.391809,0.405420
1,2,,,,,heard about earthquake is different cities sta...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.907499,0.134557,0.424909,0.323055,0.335907
2,3,,,,,there is a forest fire at spot pond geese are ...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.829908,0.087343,0.366435,0.212773,0.202162
3,9,,,,,apocalypse lighting spokane wildfires,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.977999,0.387866,0.583183,0.494500,0.503475
4,11,,,,,typhoon soudelor kills 28 in china and taiwan,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.676265,0.145303,0.467023,0.334533,0.357812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,,,earthquake safety los angeles safety fastene...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.967565,0.266603,0.585509,0.423938,0.397647
3259,10865,,,,,storm in ri worse than last hurricane my city...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.740478,0.081635,0.298454,0.206108,0.205435
3260,10868,,,,,green line derailment in chicago,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.146034,0.182383,0.524052,0.429207,0.490899
3261,10874,,,,,meg issues hazardous weather outlook hwo,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.431532,0.342742,0.468289,0.405255,0.409449


### word2vec

In [86]:
# Per-row statistics of the pairwise cosine similarities between the word2vec
# vectors of each test tweet's tokens (uses the global `w2v`).
column_values = calculate_values_w2v(test['text'])

In [87]:
# Attach the five per-row word2vec similarity statistics as new columns of test.
test['text_sum_w2v'] = list(column_values['sum'])
test['text_min_w2v'] = list(column_values['min'])
test['text_max_w2v'] = list(column_values['max'])
test['text_mean_w2v'] = list(column_values['mean'])
test['text_median_w2v'] = list(column_values['median'])

## len_text

In [88]:
# Insert the length of the cleaned text (as int64) immediately to the right of
# 'text', then preview the result.
pos_col_text = 1 + test.columns.get_loc('text')
test.insert(pos_col_text, 'len_text', test['text'].str.len().astype('int64'))
test.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,keyword_sum,keyword_min,keyword_max,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,,,,,just happened a terrible car crash,34,1.0,1.0,1.0,...,2.350852,0.188729,0.539327,0.391809,0.40542,2.615612,0.123642,0.416246,0.261561,0.253108
1,2,,,,,heard about earthquake is different cities sta...,62,1.0,1.0,1.0,...,2.907499,0.134557,0.424909,0.323055,0.335907,4.601476,0.020386,0.308944,0.127819,0.10381
2,3,,,,,there is a forest fire at spot pond geese are ...,94,1.0,1.0,1.0,...,3.829908,0.087343,0.366435,0.212773,0.202162,18.687548,-0.060702,0.536185,0.122141,0.104698
3,9,,,,,apocalypse lighting spokane wildfires,38,1.0,1.0,1.0,...,1.977999,0.387866,0.583183,0.4945,0.503475,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,,,,,typhoon soudelor kills 28 in china and taiwan,45,1.0,1.0,1.0,...,2.676265,0.145303,0.467023,0.334533,0.357812,0.780736,-0.023094,0.283668,0.078074,0.060852
5,12,,,,,we are shaking it is an earthquake,36,1.0,1.0,1.0,...,2.433109,0.184247,0.661802,0.347587,0.311464,3.261046,-0.091945,0.536185,0.155288,0.093425
6,21,,,,,they would probably still show more life than ...,73,1.0,1.0,1.0,...,3.780986,0.177587,0.639809,0.290845,0.219339,15.150403,-0.038854,1.0,0.194236,0.143367
7,22,,,,,hey how are you,15,1.0,1.0,1.0,...,1.90829,0.351061,0.72657,0.477073,0.415329,1.54229,0.097143,0.555028,0.257048,0.215696
8,27,,,,,what a nice hat,15,1.0,1.0,1.0,...,1.88457,0.223933,0.653736,0.471142,0.50345,0.549909,0.086593,0.2738,0.183303,0.189516
9,29,,,,,fuck off,8,1.0,1.0,1.0,...,1.406949,0.631894,0.775055,0.703474,0.703474,0.164787,0.164787,0.164787,0.164787,0.164787


## join

In [89]:
# Load 'test/test_original.csv' (presumably the uncleaned tweets - TODO confirm)
# so the original text length can be compared with the cleaned one.
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [90]:
# Work on an independent copy: a plain `test_encoded = test` only creates a
# second name for the same object, so the in-place insert/fillna/drop applied
# to test_encoded in the following cells would silently mutate `test` as well.
test_encoded = test.copy()

# Insert the length of the original text immediately to the right of 'text'
# in test_original, then preview the result.
pos_col_text = test_original.columns.get_loc('text') + 1
test_original.insert(loc=pos_col_text, column='len_text_original', value=test_original['text'].str.len())
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [91]:
# Place the original text length right after 'len_text'. The Series is aligned
# on the row index, not on 'id'.
# NOTE(review): assumes test and test_original share the same row order/index - confirm.
pos_col_text = 1 + test_encoded.columns.get_loc('len_text')
test_encoded.insert(pos_col_text, 'len_text_original', test_original['len_text_original'])

In [92]:
# Display the frame with len_text_original added.
test_encoded

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,len_text_original,keyword_sum,keyword_min,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,,,,,just happened a terrible car crash,34,34,1.0,1.0,...,2.350852,0.188729,0.539327,0.391809,0.405420,2.615612,0.123642,0.416246,0.261561,0.253108
1,2,,,,,heard about earthquake is different cities sta...,62,64,1.0,1.0,...,2.907499,0.134557,0.424909,0.323055,0.335907,4.601476,0.020386,0.308944,0.127819,0.103810
2,3,,,,,there is a forest fire at spot pond geese are ...,94,96,1.0,1.0,...,3.829908,0.087343,0.366435,0.212773,0.202162,18.687548,-0.060702,0.536185,0.122141,0.104698
3,9,,,,,apocalypse lighting spokane wildfires,38,40,1.0,1.0,...,1.977999,0.387866,0.583183,0.494500,0.503475,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,,,,,typhoon soudelor kills 28 in china and taiwan,45,45,1.0,1.0,...,2.676265,0.145303,0.467023,0.334533,0.357812,0.780736,-0.023094,0.283668,0.078074,0.060852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,,,earthquake safety los angeles safety fastene...,53,55,1.0,1.0,...,2.967565,0.266603,0.585509,0.423938,0.397647,2.451140,-0.074862,1.000000,0.163409,0.117514
3259,10865,,,,,storm in ri worse than last hurricane my city...,137,139,1.0,1.0,...,4.740478,0.081635,0.298454,0.206108,0.205435,31.312941,-0.051032,1.000000,0.149109,0.126693
3260,10868,,,,,green line derailment in chicago,33,55,1.0,1.0,...,2.146034,0.182383,0.524052,0.429207,0.490899,0.801203,0.009149,0.149543,0.080120,0.074443
3261,10874,,,,,meg issues hazardous weather outlook hwo,41,65,1.0,1.0,...,2.431532,0.342742,0.468289,0.405255,0.409449,1.356770,-0.046756,0.261727,0.090451,0.096737


In [93]:
# Number of characters removed by cleaning (original length minus cleaned length),
# inserted right after 'len_text_original'.
test_encoded.insert(
    loc=pos_col_text + 1,
    column='diff_len_text',
    value=test_encoded['len_text_original'].sub(test_encoded['len_text']),
)

In [94]:
# Replace every remaining NaN with 0, in place. At this point the raw
# keyword/location/text columns are still present, so their NaNs are filled
# too; those columns are dropped in the next cell.
test_encoded.fillna(0, inplace=True)

In [95]:
# Remove the raw string columns in place; only their derived numeric features are kept.
test_encoded.drop(columns=['keyword', 'location', 'text'], inplace=True)

## Resultado

In [96]:
# Final test feature frame.
test_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,0.0,0.0,34,34,0,1.0,1.0,1.0,1.0,...,2.350852,0.188729,0.539327,0.391809,0.405420,2.615612,0.123642,0.416246,0.261561,0.253108
1,2,0.0,0.0,62,64,2,1.0,1.0,1.0,1.0,...,2.907499,0.134557,0.424909,0.323055,0.335907,4.601476,0.020386,0.308944,0.127819,0.103810
2,3,0.0,0.0,94,96,2,1.0,1.0,1.0,1.0,...,3.829908,0.087343,0.366435,0.212773,0.202162,18.687548,-0.060702,0.536185,0.122141,0.104698
3,9,0.0,0.0,38,40,2,1.0,1.0,1.0,1.0,...,1.977999,0.387866,0.583183,0.494500,0.503475,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,0.0,0.0,45,45,0,1.0,1.0,1.0,1.0,...,2.676265,0.145303,0.467023,0.334533,0.357812,0.780736,-0.023094,0.283668,0.078074,0.060852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0.0,0.0,53,55,2,1.0,1.0,1.0,1.0,...,2.967565,0.266603,0.585509,0.423938,0.397647,2.451140,-0.074862,1.000000,0.163409,0.117514
3259,10865,0.0,0.0,137,139,2,1.0,1.0,1.0,1.0,...,4.740478,0.081635,0.298454,0.206108,0.205435,31.312941,-0.051032,1.000000,0.149109,0.126693
3260,10868,0.0,0.0,33,55,22,1.0,1.0,1.0,1.0,...,2.146034,0.182383,0.524052,0.429207,0.490899,0.801203,0.009149,0.149543,0.080120,0.074443
3261,10874,0.0,0.0,41,65,24,1.0,1.0,1.0,1.0,...,2.431532,0.342742,0.468289,0.405255,0.409449,1.356770,-0.046756,0.261727,0.090451,0.096737


## Guardado del dataframe

In [97]:
# Persist the encoded test features; index=False keeps the row index out of the CSV.
test_encoded.to_csv('test/test_encoded.csv', index=False)