# Imports

In [103]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

## keyword

In [None]:
def binary_encoder(col_name, col):
    
    ce_bin = ce.BinaryEncoder(cols = [col_name], drop_invariant=True)
    
    return ce_bin.fit_transform(col)

## location

In [None]:
nlp = gensim_api.load("word2vec-google-news-300")
def get_w2v_model(col):
    return gensim.models.word2vec.Word2Vec(col, size=300, window=8, min_count=1, sg=1, iter=30)


## text

In [104]:
def tf_idf_encoder(col_text):
    
    tf_idf_vec = TfidfVectorizer(use_idf=True, smooth_idf=False, ngram_range=(1,1), tokenizer=nltk.word_tokenize)
    tf_idf_data = tf_idf_vec.fit_transform(col_text)
    tf_idf_array = tf_idf_data.toarray()
    vocabulary = tf_idf_vec.vocabulary_
    
    return tf_idf_array, vocabulary

In [105]:
def calculate_values(col, tf_idf_array, vocabulary):

    len_text = len(col)
    sum_values = []; min_values = []; max_values = []; mean_values = []; median_values = []
    
    for i in range(len_text):
        values = []
        for word in col.iloc[i].split():
            if word in vocabulary:
                pos_word = vocabulary[word]
                values.append(tf_idf_array[i][pos_word])
            
        if values:
            sum_values.append(sum(values))
            min_values.append(min(values))
            max_values.append(max(values))
            mean_values.append(mean(values))
            median_values.append(median(values))
        else:
            sum_values.append(0)
            min_values.append(0)
            max_values.append(0)
            mean_values.append(0)
            median_values.append(0)
            
    column_values = {'sum': sum_values, 'min': min_values, 'max': max_values,\
                     'mean': mean_values, 'median': median_values}
    return column_values

# Train

In [106]:
train = pd.read_csv('train/train_limpio.csv')

## keyword

In [107]:
tf_idf_array, vocabulary = tf_idf_encoder(train['keyword'].astype('str'))

In [108]:
column_values = calculate_values(train['keyword'].astype('str'), tf_idf_array, vocabulary)

In [109]:
train['keyword_sum'] = [value for value in column_values['sum']]
train['keyword_min'] = [value for value in column_values['min']]
train['keyword_max'] = [value for value in column_values['max']]
train['keyword_mean'] = [value for value in column_values['mean']]
train['keyword_median'] = [value for value in column_values['median']]

In [110]:
pos_col_keyword = train.columns.get_loc('keyword')+1
train.insert(loc=pos_col_keyword, column='len_keyword', value=train['keyword'].iloc[:].str.len())

In [111]:
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median
0,1,,,,deed reason earthquak may alah forgiv us al,1,1.0,1.0,1.0,1.0,1.0
1,4,,,,forest fire near la rong sask canada,1,1.0,1.0,1.0,1.0,1.0
2,5,,,,al resid ask shelter place notifi offic evacu ...,1,1.0,1.0,1.0,1.0,1.0
3,6,,,,number peopl receiv wildfir evacu order califo...,1,1.0,1.0,1.0,1.0,1.0
4,7,,,,got sent photo rubi alaska smoke wildfir pour ...,1,1.0,1.0,1.0,1.0,1.0
5,8,,,,rockyfir updat california hwi number close dir...,1,1.0,1.0,1.0,1.0,1.0
6,10,,,,flood disast heavi rain caus flash flood stree...,1,1.0,1.0,1.0,1.0,1.0
7,13,,,,top hil see fire wood,1,1.0,1.0,1.0,1.0,1.0
8,14,,,,emerg evacu happen build across street,1,1.0,1.0,1.0,1.0,1.0
9,15,,,,afraid tornado come area,1,1.0,1.0,1.0,1.0,1.0


## location

In [112]:
tf_idf_array, vocabulary = tf_idf_encoder(train['location'].astype('str'))

In [113]:
column_values = calculate_values(train['location'].astype('str'), tf_idf_array, vocabulary)

In [114]:
train['location_sum'] = [value for value in column_values['sum']]
train['location_min'] = [value for value in column_values['min']]
train['location_max'] = [value for value in column_values['max']]
train['location_mean'] = [value for value in column_values['mean']]
train['location_median'] = [value for value in column_values['median']]

In [115]:
pos_col_location = train.columns.get_loc('location')+1
train.insert(loc=pos_col_location, column='len_location', value=train['location'].iloc[:].str.len())

In [116]:
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,location_min,location_max,location_mean,location_median
0,1,,,,,deed reason earthquak may alah forgiv us al,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,4,,,,,forest fire near la rong sask canada,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,5,,,,,al resid ask shelter place notifi offic evacu ...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,6,,,,,number peopl receiv wildfir evacu order califo...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,7,,,,,got sent photo rubi alaska smoke wildfir pour ...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,8,,,,,rockyfir updat california hwi number close dir...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,10,,,,,flood disast heavi rain caus flash flood stree...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,13,,,,,top hil see fire wood,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,14,,,,,emerg evacu happen build across street,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,15,,,,,afraid tornado come area,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## text

#### tf-idf

In [117]:
tf_idf_array, vocabulary = tf_idf_encoder(train['text'])

In [118]:
column_values = calculate_values(train['text'], tf_idf_array, vocabulary)

In [119]:
train['text_sum'] = [value for value in column_values['sum']]
train['text_min'] = [value for value in column_values['min']]
train['text_max'] = [value for value in column_values['max']]
train['text_mean'] = [value for value in column_values['mean']]
train['text_median'] = [value for value in column_values['median']]

In [120]:
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,target,keyword_sum,keyword_min,keyword_max,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,1,,,,,deed reason earthquak may alah forgiv us al,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476
1,4,,,,,forest fire near la rong sask canada,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.545323,0.21165,0.508967,0.363618,0.340166
2,5,,,,,al resid ask shelter place notifi offic evacu ...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.92501,0.153758,0.585124,0.327084,0.256037
3,6,,,,,number peopl receiv wildfir evacu order califo...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962
4,7,,,,,got sent photo rubi alaska smoke wildfir pour ...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533
5,8,,,,,rockyfir updat california hwi number close dir...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.47217,0.094812,0.392155,0.26709,0.268381
6,10,,,,,flood disast heavi rain caus flash flood stree...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.652263,0.211903,0.421501,0.304355,0.292368
7,13,,,,,top hil see fire wood,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.1694,0.288043,0.580492,0.43388,0.410864
8,14,,,,,emerg evacu happen build across street,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.425554,0.344773,0.491582,0.404259,0.389401
9,15,,,,,afraid tornado come area,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.966376,0.391599,0.639305,0.491594,0.467736


## len_text

In [121]:
pos_col_text = train.columns.get_loc('text')+1
train.insert(loc=pos_col_text, column='len_text', value=train['text'].iloc[:].str.len())
train.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,target,keyword_sum,keyword_min,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,1,,,,,deed reason earthquak may alah forgiv us al,43,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476
1,4,,,,,forest fire near la rong sask canada,36,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.545323,0.21165,0.508967,0.363618,0.340166
2,5,,,,,al resid ask shelter place notifi offic evacu ...,72,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.92501,0.153758,0.585124,0.327084,0.256037
3,6,,,,,number peopl receiv wildfir evacu order califo...,50,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962
4,7,,,,,got sent photo rubi alaska smoke wildfir pour ...,52,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533
5,8,,,,,rockyfir updat california hwi number close dir...,84,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.47217,0.094812,0.392155,0.26709,0.268381
6,10,,,,,flood disast heavi rain caus flash flood stree...,74,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.652263,0.211903,0.421501,0.304355,0.292368
7,13,,,,,top hil see fire wood,21,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.1694,0.288043,0.580492,0.43388,0.410864
8,14,,,,,emerg evacu happen build across street,38,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.425554,0.344773,0.491582,0.404259,0.389401
9,15,,,,,afraid tornado come area,24,1,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.966376,0.391599,0.639305,0.491594,0.467736


## len_text_original

In [122]:
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [123]:
pos_col_text = train_original.columns.get_loc('text')+1
train_original.insert(loc=pos_col_text, column='len_text_original', value=train_original['text'].iloc[:].str.len())
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## join

In [124]:
train.drop(['keyword', 'location', 'text'], axis=1, inplace=True)

In [125]:
train.fillna(0, inplace=True)

In [126]:
train

Unnamed: 0,id,len_keyword,len_location,len_text,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,1,0.0,0.0,43,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476
1,4,0.0,0.0,36,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.545323,0.211650,0.508967,0.363618,0.340166
2,5,0.0,0.0,72,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.925010,0.153758,0.585124,0.327084,0.256037
3,6,0.0,0.0,50,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962
4,7,0.0,0.0,52,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,45,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.803465,0.284299,0.418945,0.350433,0.364511
7609,10870,0.0,0.0,82,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.228988,0.190353,0.425825,0.293544,0.279435
7610,10871,0.0,0.0,55,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.698983,0.366487,0.629740,0.522109,0.629740
7611,10872,0.0,0.0,96,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.510709,0.157250,0.529638,0.265336,0.228318


In [127]:
cols = train.columns.tolist()
cols.remove('target')
cols.append('target')

In [128]:
train_encoded = train[cols]

pos_col_text = train_encoded.columns.get_loc('len_text')+1
train_encoded.insert(loc=pos_col_text, column='len_text_original', value=train_original['len_text_original'])

In [129]:
train_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,...,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median,target
0,1,0.0,0.0,43,69,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476,1
1,4,0.0,0.0,36,38,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.545323,0.211650,0.508967,0.363618,0.340166,1
2,5,0.0,0.0,72,133,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.925010,0.153758,0.585124,0.327084,0.256037,1
3,6,0.0,0.0,50,65,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962,1
4,7,0.0,0.0,52,88,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,45,83,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.803465,0.284299,0.418945,0.350433,0.364511,1
7609,10870,0.0,0.0,82,125,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.228988,0.190353,0.425825,0.293544,0.279435,1
7610,10871,0.0,0.0,55,65,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,4.698983,0.366487,0.629740,0.522109,0.629740,1
7611,10872,0.0,0.0,96,137,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,4.510709,0.157250,0.529638,0.265336,0.228318,1


In [130]:
train_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=train_encoded['len_text_original']-train_encoded['len_text'])

## Resultado

In [131]:
train_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median,target
0,1,0.0,0.0,43,69,26,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476,1
1,4,0.0,0.0,36,38,2,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.545323,0.211650,0.508967,0.363618,0.340166,1
2,5,0.0,0.0,72,133,61,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.925010,0.153758,0.585124,0.327084,0.256037,1
3,6,0.0,0.0,50,65,15,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962,1
4,7,0.0,0.0,52,88,36,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,10869,0.0,0.0,45,83,38,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,2.803465,0.284299,0.418945,0.350433,0.364511,1
7609,10870,0.0,0.0,82,125,43,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,3.228988,0.190353,0.425825,0.293544,0.279435,1
7610,10871,0.0,0.0,55,65,10,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,4.698983,0.366487,0.629740,0.522109,0.629740,1
7611,10872,0.0,0.0,96,137,41,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,4.510709,0.157250,0.529638,0.265336,0.228318,1


## Guardado del dataframe

In [132]:
train_encoded.to_csv('train/train_encoded.csv', index=False)

# Test

In [133]:
test = pd.read_csv('test/test_limpio.csv')

## keyword

In [134]:
tf_idf_array, vocabulary = tf_idf_encoder(test['keyword'].astype('str'))

In [135]:
column_values = calculate_values(test['keyword'].astype('str'), tf_idf_array, vocabulary)

In [136]:
test['keyword_sum'] = [value for value in column_values['sum']]
test['keyword_min'] = [value for value in column_values['min']]
test['keyword_max'] = [value for value in column_values['max']]
test['keyword_mean'] = [value for value in column_values['mean']]
test['keyword_median'] = [value for value in column_values['median']]

In [137]:
pos_col_keyword = test.columns.get_loc('keyword')+1
test.insert(loc=pos_col_keyword, column='len_keyword', value=test['keyword'].iloc[:].str.len())

## location

In [138]:
tf_idf_array, vocabulary = tf_idf_encoder(test['location'].astype('str'))

In [139]:
column_values = calculate_values(test['location'].astype('str'), tf_idf_array, vocabulary)

In [140]:
test['location_sum'] = [value for value in column_values['sum']]
test['location_min'] = [value for value in column_values['min']]
test['location_max'] = [value for value in column_values['max']]
test['location_mean'] = [value for value in column_values['mean']]
test['location_median'] = [value for value in column_values['median']]

In [141]:
pos_col_location = test.columns.get_loc('location')+1
test.insert(loc=pos_col_location, column='len_location', value=test['location'].iloc[:].str.len())

## text

In [142]:
test.text = test.text.astype('str')

In [143]:
test.text

0                                 happen teribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gees flee across street ...
3                          apocalyps light spokan wildfir
4                typhoon soudelor kil number china taiwan
                              ...                        
3258         earthquak safeti lo angel safeti fasten xrwn
3259    storm ri wors last hurican cityand number othe...
3260                            green line derail chicago
3261                  meg issu hazard weather outlook hwo
3262       cityofcalgari activ municip emerg plan ycstorm
Name: text, Length: 3263, dtype: object

In [144]:
tf_idf_array, vocabulary = tf_idf_encoder(test['text'])

In [145]:
column_values = calculate_values(test['text'], tf_idf_array, vocabulary)

In [146]:
test['text_sum'] = [value for value in column_values['sum']]
test['text_min'] = [value for value in column_values['min']]
test['text_max'] = [value for value in column_values['max']]
test['text_mean'] = [value for value in column_values['mean']]
test['text_median'] = [value for value in column_values['median']]

In [147]:
test

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,0,,,,,happen teribl car crash,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,,,,heard earthquak differ citi stay safe everyon,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.638767,0.342961,0.426320,0.376967,0.369029
2,3,,,,,forest fire spot pond gees flee across street ...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.066710,0.187362,0.422860,0.306671,0.307269
3,9,,,,,apocalyps light spokan wildfir,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,,,,typhoon soudelor kil number china taiwan,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.346990,0.151011,0.524219,0.391165,0.416631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,,,earthquak safeti lo angel safeti fasten xrwn,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.968900,0.271113,0.595414,0.424129,0.404374
3259,10865,,,,,storm ri wors last hurican cityand number othe...,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.407768,0.140259,0.321931,0.220388,0.214111
3260,10868,,,,,green line derail chicago,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.990434,0.426605,0.551133,0.497608,0.506348
3261,10874,,,,,meg issu hazard weather outlook hwo,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.418856,0.324695,0.482028,0.403143,0.405775


## len_text

In [148]:
pos_col_text = test.columns.get_loc('text')+1
test.insert(loc=pos_col_text, column='len_text', value=test['text'].iloc[:].str.len().astype('int64'))
test.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,keyword_sum,keyword_min,keyword_max,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,0,,,,,happen teribl car crash,23,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,,,,heard earthquak differ citi stay safe everyon,45,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.638767,0.342961,0.42632,0.376967,0.369029
2,3,,,,,forest fire spot pond gees flee across street ...,53,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.06671,0.187362,0.42286,0.306671,0.307269
3,9,,,,,apocalyps light spokan wildfir,30,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,,,,typhoon soudelor kil number china taiwan,40,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.34699,0.151011,0.524219,0.391165,0.416631
5,12,,,,,shake earthquak,15,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.387471,0.556874,0.830597,0.693736,0.693736
6,21,,,,,would probabl stil show life arsen yesterday e...,50,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.254028,0.201604,0.700242,0.361559,0.282634
7,22,,,,,hey,3,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,27,,,,,nice hat,8,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.413128,0.678867,0.734261,0.706564,0.706564
9,29,,,,,fuck,4,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## join

In [149]:
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [150]:
test_encoded = test
pos_col_text = test_original.columns.get_loc('text')+1
test_original.insert(loc=pos_col_text, column='len_text_original', value=test_original['text'].iloc[:].str.len())
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [151]:
pos_col_text = test_encoded.columns.get_loc('len_text')+1
test_encoded.insert(loc=pos_col_text, column='len_text_original', value=test_original['len_text_original'])

In [152]:
test_encoded

Unnamed: 0,id,keyword,len_keyword,location,len_location,text,len_text,len_text_original,keyword_sum,keyword_min,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,0,,,,,happen teribl car crash,23,34,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,,,,heard earthquak differ citi stay safe everyon,45,64,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.638767,0.342961,0.426320,0.376967,0.369029
2,3,,,,,forest fire spot pond gees flee across street ...,53,96,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.066710,0.187362,0.422860,0.306671,0.307269
3,9,,,,,apocalyps light spokan wildfir,30,40,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,,,,typhoon soudelor kil number china taiwan,40,45,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.346990,0.151011,0.524219,0.391165,0.416631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,,,,earthquak safeti lo angel safeti fasten xrwn,44,55,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.968900,0.271113,0.595414,0.424129,0.404374
3259,10865,,,,,storm ri wors last hurican cityand number othe...,114,139,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.407768,0.140259,0.321931,0.220388,0.214111
3260,10868,,,,,green line derail chicago,25,55,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.990434,0.426605,0.551133,0.497608,0.506348
3261,10874,,,,,meg issu hazard weather outlook hwo,35,65,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.418856,0.324695,0.482028,0.403143,0.405775


In [153]:
test_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=test_encoded['len_text_original']-test_encoded['len_text'])

In [154]:
test_encoded.fillna(0, inplace=True)

In [155]:
test_encoded.drop(['keyword', 'location', 'text'], axis=1, inplace=True)

## Resultado

In [156]:
test_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,location_sum,location_min,location_max,location_mean,location_median,text_sum,text_min,text_max,text_mean,text_median
0,0,0.0,0.0,23,34,11,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,0.0,0.0,45,64,19,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.638767,0.342961,0.426320,0.376967,0.369029
2,3,0.0,0.0,53,96,43,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.066710,0.187362,0.422860,0.306671,0.307269
3,9,0.0,0.0,30,40,10,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,0.0,0.0,40,45,5,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.346990,0.151011,0.524219,0.391165,0.416631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0.0,0.0,44,55,11,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.968900,0.271113,0.595414,0.424129,0.404374
3259,10865,0.0,0.0,114,139,25,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.407768,0.140259,0.321931,0.220388,0.214111
3260,10868,0.0,0.0,25,55,30,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.990434,0.426605,0.551133,0.497608,0.506348
3261,10874,0.0,0.0,35,65,30,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.418856,0.324695,0.482028,0.403143,0.405775


## Guardado del dataframe

In [157]:
test_encoded.to_csv('test/test_encoded.csv', index=False)