# Imports

In [2]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

## keyword

In [3]:
def binary_encoder(col_name, col):
    
    ce_bin = ce.BinaryEncoder(cols = [col_name], drop_invariant=True)
    
    return ce_bin.fit_transform(col)

## location

In [18]:
nlp = gensim_api.load("word2vec-google-news-300")
def get_w2v_model(col):
    return gensim.models.word2vec.Word2Vec(col, size=300, window=8, min_count=1, sg=1, iter=30)




## text

In [4]:
def tf_idf_encoder(col_text):
    
    tf_idf_vec = TfidfVectorizer(use_idf=True, smooth_idf=False, ngram_range=(1,1), tokenizer=nltk.word_tokenize)
    tf_idf_data = tf_idf_vec.fit_transform(col_text)
    tf_idf_array = tf_idf_data.toarray()
    vocabulary = tf_idf_vec.vocabulary_
    
    return tf_idf_array, vocabulary

In [5]:
def calculate_values(df, tf_idf_array, vocabulary):

    len_text = len(df['text'])
    sum_values = []; min_values = []; max_values = []; mean_values = []; median_values = []
    
    for i in range(len_text):
        values = []
        for word in df['text'].iloc[i].split():
            if word in vocabulary:
                pos_word = vocabulary[word]
                values.append(tf_idf_array[i][pos_word])
            
        if values:
            sum_values.append(sum(values))
            min_values.append(min(values))
            max_values.append(max(values))
            mean_values.append(mean(values))
            median_values.append(median(values))
            
    column_values = {'sum': sum_values, 'min': min_values, 'max': max_values,\
                     'mean': mean_values, 'median': median_values}
    return column_values

# Train

In [19]:
train = pd.read_csv('train/train_limpio.csv')

## keyword

In [20]:
keyword_encoded = binary_encoder('keyword', train['keyword'])

In [21]:
keyword_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8
0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,1
7609,0,0,0,0,0,0,0,1
7610,0,0,0,0,0,0,0,1
7611,0,0,0,0,0,0,0,1


## location

In [42]:
nlp

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f7e373dc760>

In [6]:
location_encoded = get_w2v_model(train['location'].astype('str'))

In [14]:
location_encoded

<gensim.models.word2vec.Word2Vec at 0x7f7e45a3a5e0>

In [62]:
location_encoded = binary_encoder('location', train['location'])

In [63]:
location_encoded

Unnamed: 0,location_1,location_2,location_3,location_4,location_5,location_6,location_7,location_8,location_9,location_10,location_11,location_12
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,0,0,0,0,1
7609,0,0,0,0,0,0,0,0,0,0,0,1
7610,0,0,0,0,0,0,0,0,0,0,0,1
7611,0,0,0,0,0,0,0,0,0,0,0,1


## text

#### tf-idf

In [22]:
tf_idf_array, vocabulary = tf_idf_encoder(train['text'])

In [23]:
column_values = calculate_values(train, tf_idf_array, vocabulary)

In [24]:
train['text_sum'] = [value for value in column_values['sum']]
train['text_min'] = [value for value in column_values['min']]
train['text_max'] = [value for value in column_values['max']]
train['text_mean'] = [value for value in column_values['mean']]
train['text_median'] = [value for value in column_values['median']]

In [25]:
train

Unnamed: 0,id,keyword,location,text,target,text_sum,text_min,text_max,text_mean,text_median
0,1,,,deed reason earthquak may alah forgiv us al,1,2.745943,0.224894,0.474996,0.343243,0.323476
1,4,,,forest fire near la rong sask canada,1,2.545323,0.211650,0.508967,0.363618,0.340166
2,5,,,al resid ask shelter place notifi offic evacu ...,1,3.925010,0.153758,0.585124,0.327084,0.256037
3,6,,,number peopl receiv wildfir evacu order califo...,1,2.515355,0.150085,0.574226,0.359336,0.346962
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,1,2.937988,0.247769,0.444024,0.326443,0.287533
...,...,...,...,...,...,...,...,...,...,...
7608,10869,,,two giant crane hold bridg colaps nearbi home,1,2.803465,0.284299,0.418945,0.350433,0.364511
7609,10870,,,ariahrari thetawniest control wild fire califo...,1,3.228988,0.190353,0.425825,0.293544,0.279435
7610,10871,,,number number number number utc number km volc...,1,4.698983,0.366487,0.629740,0.522109,0.629740
7611,10872,,,polic investig e bike colid car littl portug e...,1,4.510709,0.157250,0.529638,0.265336,0.228318


## len_text

In [26]:
pos_col_text = train.columns.get_loc('text')+1
train.insert(loc=pos_col_text, column='len_text', value=train['text'].iloc[:].str.len())
train.head(10)

Unnamed: 0,id,keyword,location,text,len_text,target,text_sum,text_min,text_max,text_mean,text_median
0,1,,,deed reason earthquak may alah forgiv us al,43,1,2.745943,0.224894,0.474996,0.343243,0.323476
1,4,,,forest fire near la rong sask canada,36,1,2.545323,0.21165,0.508967,0.363618,0.340166
2,5,,,al resid ask shelter place notifi offic evacu ...,72,1,3.92501,0.153758,0.585124,0.327084,0.256037
3,6,,,number peopl receiv wildfir evacu order califo...,50,1,2.515355,0.150085,0.574226,0.359336,0.346962
4,7,,,got sent photo rubi alaska smoke wildfir pour ...,52,1,2.937988,0.247769,0.444024,0.326443,0.287533
5,8,,,rockyfir updat california hwi number close dir...,84,1,3.47217,0.094812,0.392155,0.26709,0.268381
6,10,,,flood disast heavi rain caus flash flood stree...,74,1,3.652263,0.211903,0.421501,0.304355,0.292368
7,13,,,top hil see fire wood,21,1,2.1694,0.288043,0.580492,0.43388,0.410864
8,14,,,emerg evacu happen build across street,38,1,2.425554,0.344773,0.491582,0.404259,0.389401
9,15,,,afraid tornado come area,24,1,1.966376,0.391599,0.639305,0.491594,0.467736


## len_text_original

In [27]:
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [28]:
pos_col_text = train_original.columns.get_loc('text')+1
train_original.insert(loc=pos_col_text, column='len_text_original', value=train_original['text'].iloc[:].str.len())
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## join

In [35]:
aux = train[['id', 'len_text', 'text_sum', 'text_min', 'text_max', 'text_mean', 'text_median', 'target']]
train_encoded = keyword_encoded.join(other=aux, how='inner')

pos_col_text = train_encoded.columns.get_loc('len_text')+1
train_encoded.insert(loc=pos_col_text, column='len_text_original', value=train_original['len_text_original'])

In [36]:
train_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,id,len_text,len_text_original,text_sum,text_min,text_max,text_mean,text_median,target
0,0,0,0,0,0,0,0,1,1,43,69,2.745943,0.224894,0.474996,0.343243,0.323476,1
1,0,0,0,0,0,0,0,1,4,36,38,2.545323,0.211650,0.508967,0.363618,0.340166,1
2,0,0,0,0,0,0,0,1,5,72,133,3.925010,0.153758,0.585124,0.327084,0.256037,1
3,0,0,0,0,0,0,0,1,6,50,65,2.515355,0.150085,0.574226,0.359336,0.346962,1
4,0,0,0,0,0,0,0,1,7,52,88,2.937988,0.247769,0.444024,0.326443,0.287533,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,1,10869,45,83,2.803465,0.284299,0.418945,0.350433,0.364511,1
7609,0,0,0,0,0,0,0,1,10870,82,125,3.228988,0.190353,0.425825,0.293544,0.279435,1
7610,0,0,0,0,0,0,0,1,10871,55,65,4.698983,0.366487,0.629740,0.522109,0.629740,1
7611,0,0,0,0,0,0,0,1,10872,96,137,4.510709,0.157250,0.529638,0.265336,0.228318,1


In [37]:
train_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=train_encoded['len_text_original']-train_encoded['len_text'])

## Resultado

In [39]:
train_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,id,len_text,len_text_original,diff_len_text,text_sum,text_min,text_max,text_mean,text_median,target
0,0,0,0,0,0,0,0,1,1,43,69,26,2.745943,0.224894,0.474996,0.343243,0.323476,1
1,0,0,0,0,0,0,0,1,4,36,38,2,2.545323,0.211650,0.508967,0.363618,0.340166,1
2,0,0,0,0,0,0,0,1,5,72,133,61,3.925010,0.153758,0.585124,0.327084,0.256037,1
3,0,0,0,0,0,0,0,1,6,50,65,15,2.515355,0.150085,0.574226,0.359336,0.346962,1
4,0,0,0,0,0,0,0,1,7,52,88,36,2.937988,0.247769,0.444024,0.326443,0.287533,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,0,0,0,0,0,1,10869,45,83,38,2.803465,0.284299,0.418945,0.350433,0.364511,1
7609,0,0,0,0,0,0,0,1,10870,82,125,43,3.228988,0.190353,0.425825,0.293544,0.279435,1
7610,0,0,0,0,0,0,0,1,10871,55,65,10,4.698983,0.366487,0.629740,0.522109,0.629740,1
7611,0,0,0,0,0,0,0,1,10872,96,137,41,4.510709,0.157250,0.529638,0.265336,0.228318,1


## Guardado del dataframe

In [40]:
train_encoded.to_csv('train/train_encoded_sin_location.csv', index=False)

# Test

In [43]:
test = pd.read_csv('test/test_limpio.csv')

## keyword

In [44]:
keyword_encoded = binary_encoder('keyword', test['keyword'])

In [45]:
keyword_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8
0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,1
3259,0,0,0,0,0,0,0,1
3260,0,0,0,0,0,0,0,1
3261,0,0,0,0,0,0,0,1


## location

In [72]:
location_encoded = binary_encoder('location', test['location'])

In [73]:
location_encoded

Unnamed: 0,location_1,location_2,location_3,location_4,location_5,location_6,location_7,location_8,location_9,location_10,location_11
0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,0,0,0,1
3259,0,0,0,0,0,0,0,0,0,0,1
3260,0,0,0,0,0,0,0,0,0,0,1
3261,0,0,0,0,0,0,0,0,0,0,1


## text

In [46]:
test.text = test.text.astype('str')

In [61]:
test.text

0                                 happen teribl car crash
1           heard earthquak differ citi stay safe everyon
2       forest fire spot pond gees flee across street ...
3                          apocalyps light spokan wildfir
4                typhoon soudelor kil number china taiwan
                              ...                        
3258         earthquak safeti lo angel safeti fasten xrwn
3259    storm ri wors last hurican cityand number othe...
3260                            green line derail chicago
3261                  meg issu hazard weather outlook hwo
3262       cityofcalgari activ municip emerg plan ycstorm
Name: text, Length: 3263, dtype: object

In [47]:
tf_idf_array, vocabulary = tf_idf_encoder(test['text'])

In [48]:
column_values = calculate_values(test, tf_idf_array, vocabulary)

In [49]:
test['text_sum'] = [value for value in column_values['sum']]
test['text_min'] = [value for value in column_values['min']]
test['text_max'] = [value for value in column_values['max']]
test['text_mean'] = [value for value in column_values['mean']]
test['text_median'] = [value for value in column_values['median']]

In [50]:
test

Unnamed: 0,id,keyword,location,text,text_sum,text_min,text_max,text_mean,text_median
0,0,,,happen teribl car crash,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,,heard earthquak differ citi stay safe everyon,2.638767,0.342961,0.426320,0.376967,0.369029
2,3,,,forest fire spot pond gees flee across street ...,3.066710,0.187362,0.422860,0.306671,0.307269
3,9,,,apocalyps light spokan wildfir,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,,typhoon soudelor kil number china taiwan,2.346990,0.151011,0.524219,0.391165,0.416631
...,...,...,...,...,...,...,...,...,...
3258,10861,,,earthquak safeti lo angel safeti fasten xrwn,2.968900,0.271113,0.595414,0.424129,0.404374
3259,10865,,,storm ri wors last hurican cityand number othe...,4.407768,0.140259,0.321931,0.220388,0.214111
3260,10868,,,green line derail chicago,1.990434,0.426605,0.551133,0.497608,0.506348
3261,10874,,,meg issu hazard weather outlook hwo,2.418856,0.324695,0.482028,0.403143,0.405775


## len_text

In [51]:
pos_col_text = test.columns.get_loc('text')+1
test.insert(loc=pos_col_text, column='len_text', value=test['text'].iloc[:].str.len().astype('int64'))
test.head(10)

Unnamed: 0,id,keyword,location,text,len_text,text_sum,text_min,text_max,text_mean,text_median
0,0,,,happen teribl car crash,23,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,,heard earthquak differ citi stay safe everyon,45,2.638767,0.342961,0.42632,0.376967,0.369029
2,3,,,forest fire spot pond gees flee across street ...,53,3.06671,0.187362,0.42286,0.306671,0.307269
3,9,,,apocalyps light spokan wildfir,30,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,,typhoon soudelor kil number china taiwan,40,2.34699,0.151011,0.524219,0.391165,0.416631
5,12,,,shake earthquak,15,1.387471,0.556874,0.830597,0.693736,0.693736
6,21,,,would probabl stil show life arsen yesterday e...,50,3.254028,0.201604,0.700242,0.361559,0.282634
7,22,,,hey,3,1.0,1.0,1.0,1.0,1.0
8,27,,,nice hat,8,1.413128,0.678867,0.734261,0.706564,0.706564
9,29,,,fuck,4,1.0,1.0,1.0,1.0,1.0


## join

In [41]:
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [42]:
pos_col_text = test_original.columns.get_loc('text')+1
test_original.insert(loc=pos_col_text, column='len_text_original', value=test_original['text'].iloc[:].str.len())
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [52]:
aux = train[['id', 'len_text', 'text_sum', 'text_min', 'text_max', 'text_mean', 'text_median']]
test_encoded = keyword_encoded.join(other=aux, how='inner')

pos_col_text = test_encoded.columns.get_loc('len_text')+1
test_encoded.insert(loc=pos_col_text, column='len_text_original', value=test_original['len_text_original'])

In [53]:
test_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,id,len_text,len_text_original,text_sum,text_min,text_max,text_mean,text_median
0,0,0,0,0,0,0,0,1,1,43,34,2.745943,0.224894,0.474996,0.343243,0.323476
1,0,0,0,0,0,0,0,1,4,36,64,2.545323,0.211650,0.508967,0.363618,0.340166
2,0,0,0,0,0,0,0,1,5,72,96,3.925010,0.153758,0.585124,0.327084,0.256037
3,0,0,0,0,0,0,0,1,6,50,40,2.515355,0.150085,0.574226,0.359336,0.346962
4,0,0,0,0,0,0,0,1,7,52,45,2.937988,0.247769,0.444024,0.326443,0.287533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,1,4680,35,55,2.634108,0.319564,0.421424,0.376301,0.382912
3259,0,0,0,0,0,0,0,1,4681,60,139,3.052516,0.186246,0.380151,0.305252,0.353636
3260,0,0,0,0,0,0,0,1,4682,53,55,2.916716,0.173461,0.417132,0.324080,0.312828
3261,0,0,0,0,0,0,0,1,4684,74,65,3.714004,0.216074,0.491475,0.309500,0.263199


In [54]:
test_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=test_encoded['len_text_original']-test_encoded['len_text'])

## Resultado

In [55]:
test_encoded

Unnamed: 0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,id,len_text,len_text_original,diff_len_text,text_sum,text_min,text_max,text_mean,text_median
0,0,0,0,0,0,0,0,1,1,43,34,-9,2.745943,0.224894,0.474996,0.343243,0.323476
1,0,0,0,0,0,0,0,1,4,36,64,28,2.545323,0.211650,0.508967,0.363618,0.340166
2,0,0,0,0,0,0,0,1,5,72,96,24,3.925010,0.153758,0.585124,0.327084,0.256037
3,0,0,0,0,0,0,0,1,6,50,40,-10,2.515355,0.150085,0.574226,0.359336,0.346962
4,0,0,0,0,0,0,0,1,7,52,45,-7,2.937988,0.247769,0.444024,0.326443,0.287533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,0,0,0,0,0,0,0,1,4680,35,55,20,2.634108,0.319564,0.421424,0.376301,0.382912
3259,0,0,0,0,0,0,0,1,4681,60,139,79,3.052516,0.186246,0.380151,0.305252,0.353636
3260,0,0,0,0,0,0,0,1,4682,53,55,2,2.916716,0.173461,0.417132,0.324080,0.312828
3261,0,0,0,0,0,0,0,1,4684,74,65,-9,3.714004,0.216074,0.491475,0.309500,0.263199


## Guardado del dataframe

In [56]:
test_encoded.to_csv('test/test_encoded_sin_location.csv', index=False)