# Imports

In [1]:
import pandas as pd
import numpy as np
from statistics import mean, median
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import category_encoders as ce #pip install category_encoders
import nltk #pip install nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import gensim
import gensim.downloader as gensim_api

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Funciones

## keyword

In [2]:
def binary_encoder(col_name, col):
    """Binary-encode one column with ``category_encoders.BinaryEncoder``.

    Parameters
    ----------
    col_name : str
        Name of the column to encode; forwarded to the encoder as ``cols=[col_name]``.
    col : object
        Data handed to ``fit_transform`` (presumably a DataFrame that contains
        ``col_name`` -- confirm against callers).

    Returns
    -------
    object
        Whatever ``BinaryEncoder.fit_transform`` returns for ``col``; the encoder
        is created with ``drop_invariant=True``.
    """
    encoder = ce.BinaryEncoder(drop_invariant=True, cols=[col_name])
    encoded = encoder.fit_transform(col)
    return encoded

## location

## text

### tf-idf

In [3]:
def tf_idf_encoder(col_text):
    """Fit a unigram TF-IDF vectorizer on ``col_text`` and return it densified.

    A new ``TfidfVectorizer`` is created and fitted on every call, using
    ``nltk.word_tokenize`` as tokenizer, IDF weighting enabled and IDF
    smoothing disabled.

    Parameters
    ----------
    col_text : iterable of str
        Documents to vectorize.

    Returns
    -------
    tuple
        ``(tf_idf_array, vocabulary)`` where ``tf_idf_array`` is the result of
        ``fit_transform(col_text).toarray()`` and ``vocabulary`` is the fitted
        vectorizer's ``vocabulary_`` attribute.
    """
    vectorizer_options = {
        'use_idf': True,
        'smooth_idf': False,
        'ngram_range': (1, 1),
        'tokenizer': nltk.word_tokenize,
    }
    vectorizer = TfidfVectorizer(**vectorizer_options)
    # Densified on purpose: callers index the result as array[row][column].
    dense_matrix = vectorizer.fit_transform(col_text).toarray()
    return dense_matrix, vectorizer.vocabulary_

In [4]:
def calculate_values(col, tf_idf_array, vocabulary):
    """Summarize, per row, the TF-IDF weights of the words found in the vocabulary.

    Each entry of ``col`` is split on whitespace; every word present in
    ``vocabulary`` contributes ``tf_idf_array[row][vocabulary[word]]`` (a word
    that occurs several times contributes once per occurrence).

    Parameters
    ----------
    col : pandas.Series of str
        Texts, accessed positionally through ``.iloc``.
    tf_idf_array : 2-D indexable
        TF-IDF weights indexed as ``[row][column]``.
    vocabulary : dict
        Maps a word to its column position in ``tf_idf_array``.

    Returns
    -------
    dict
        Keys ``'sum'``, ``'min'``, ``'max'``, ``'mean'`` and ``'median'``, each
        holding one value per row; rows with no known word get ``0`` everywhere.
    """
    column_values = {'sum': [], 'min': [], 'max': [], 'mean': [], 'median': []}

    for row in range(len(col)):
        weights = [
            tf_idf_array[row][vocabulary[token]]
            for token in col.iloc[row].split()
            if token in vocabulary
        ]

        if not weights:
            # No known word in this row: every statistic defaults to 0.
            for bucket in column_values.values():
                bucket.append(0)
            continue

        column_values['sum'].append(sum(weights))
        column_values['min'].append(min(weights))
        column_values['max'].append(max(weights))
        column_values['mean'].append(mean(weights))
        column_values['median'].append(median(weights))

    return column_values

### word2vec

In [5]:
# Pretrained "word2vec-google-news-300" vectors obtained through gensim's downloader API;
# `w2v` is read as a module-level lookup by calculate_values_w2v.
# NOTE(review): presumably downloads the model on first use and is large/slow -- confirm
# before relying on Restart & Run All.
w2v = gensim_api.load("word2vec-google-news-300")

In [6]:
def get_w2v_model(col):
    """Train a 300-dimensional Word2Vec model on ``col``.

    Parameters
    ----------
    col : iterable
        Training corpus forwarded as the first argument of
        ``gensim.models.word2vec.Word2Vec`` (presumably an iterable of token
        lists -- confirm against callers).

    Returns
    -------
    gensim.models.word2vec.Word2Vec
        Model built with window 8, ``min_count=1``, 300 dimensions and 30
        training passes.
    """
    # gensim 4.0 renamed `size` -> `vector_size` and `iter` -> `epochs`; choose the
    # spelling matching the installed major version so both 3.x and 4.x work.
    major_version = int(str(getattr(gensim, '__version__', '3')).split('.')[0])
    if major_version >= 4:
        renamed_params = {'vector_size': 300, 'epochs': 30}
    else:
        renamed_params = {'size': 300, 'iter': 30}
    return gensim.models.word2vec.Word2Vec(col, window=8, min_count=1, **renamed_params)

In [7]:
def cos(x, y):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    x, y : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(x, y) / (||x|| * ||y||)``. When either vector has zero norm the
        similarity is undefined (0/0); ``0.0`` is returned instead of NaN so
        that downstream min/max/mean aggregations are not poisoned.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        return 0.0
    return np.dot(x, y) / norm_product

In [8]:
def calculate_values_w2v(col):
    """Summarize, per row, the pairwise cosine similarities of its word vectors.

    Each entry of ``col`` is split on whitespace, the words present in the
    module-level ``w2v`` lookup are mapped to their vectors, and ``cos`` is
    evaluated for every unordered pair of those vectors.

    Parameters
    ----------
    col : pandas.Series of str
        Texts, accessed positionally through ``.iloc``.

    Returns
    -------
    dict
        Keys ``'sum'``, ``'min'``, ``'max'``, ``'mean'`` and ``'median'``, each
        holding one value per row; rows with fewer than two known words get
        ``0`` everywhere.
    """
    stat_names = ('sum', 'min', 'max', 'mean', 'median')
    column_values = {stat: [] for stat in stat_names}

    for row in range(len(col)):
        vectors = [w2v[token] for token in col.iloc[row].split() if token in w2v]

        # Every unordered pair (j, k) with j < k, in the same order as a nested index loop.
        similarities = []
        for left_pos, left_vec in enumerate(vectors):
            for right_vec in vectors[left_pos + 1:]:
                similarities.append(cos(left_vec, right_vec))

        if similarities:
            row_stats = (sum(similarities), min(similarities), max(similarities),
                         mean(similarities), median(similarities))
        else:
            row_stats = (0, 0, 0, 0, 0)

        for stat, stat_value in zip(stat_names, row_stats):
            column_values[stat].append(stat_value)

    return column_values

In [84]:
def calculate_len(col):
    """Return the length of every entry in ``col``, counting missing entries as 0.

    Parameters
    ----------
    col : iterable
        Values to measure, typically a ``pandas.Series`` of strings that may
        contain missing values.

    Returns
    -------
    list of int
        ``len(value)`` for each entry, or ``0`` when the entry is a missing
        scalar (``None``, any NaN, ``pd.NA``, ...).
    """
    # An identity test against np.nan misses NaN objects that are not that exact
    # singleton (float('nan'), np.float64 values from a float column) as well as
    # None, so rely on pd.isna. It is only applied to scalars because on a
    # list-like entry it would return an array instead of a bool.
    return [
        0 if (pd.api.types.is_scalar(value) and pd.isna(value)) else len(value)
        for value in col
    ]

# Train

In [107]:
train = pd.read_csv('train/train_limpio.csv')

## keyword

In [108]:
# Fit TF-IDF on the keyword column, cast to str first.
# NOTE(review): astype('str') appears to turn missing keywords into the literal token
# 'nan', which would explain why the rows without a keyword in the output below score
# 1.0 -- confirm this is intended.
tf_idf_array, vocabulary = tf_idf_encoder(train['keyword'].astype('str'))

In [109]:
column_values = calculate_values(train['keyword'].astype('str'), tf_idf_array, vocabulary)

In [110]:
# Attach the per-row keyword TF-IDF statistics to `train`, one new column per statistic.
keyword_stat_columns = {
    'keyword_sum': 'sum',
    'keyword_min': 'min',
    'keyword_max': 'max',
    'keyword_mean': 'mean',
    'keyword_median': 'median',
}
for new_column, stat_name in keyword_stat_columns.items():
    train[new_column] = list(column_values[stat_name])

In [111]:
# Length of each keyword as computed by calculate_len, inserted as 'len_keyword'
# immediately after the 'keyword' column.
len_keyword = calculate_len(train['keyword'])

pos_col_keyword = train.columns.get_loc('keyword')+1
train.insert(loc=pos_col_keyword, column='len_keyword', value=len_keyword)

In [112]:
train.head(10)

Unnamed: 0,keyword,len_keyword,location,text_con_stemming,text_sin_stemming,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median
0,,0,,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,1.0,1.0
1,,0,,forest fire near la rong sask canada,forest fire near la ronge sask canada,1,1.0,1.0,1.0,1.0,1.0
2,,0,,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1,1.0,1.0,1.0,1.0,1.0
3,,0,,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1,1.0,1.0,1.0,1.0,1.0
4,,0,,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,1.0,1.0
5,,0,,rockyfir updat california hwi number close dir...,rockyfire update california hwy number clo...,1,1.0,1.0,1.0,1.0,1.0
6,,0,,flood disast heavi rain caus flash flood stree...,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,1.0,1.0
7,,0,,top hil see fire wood,i am on top of the hil and i can see a fire in...,1,1.0,1.0,1.0,1.0,1.0
8,,0,,emerg evacu happen build across street,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,1.0,1.0
9,,0,,afraid tornado come area,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,1.0,1.0


## location

In [113]:
# Fit TF-IDF on the location column, cast to str first.
# NOTE(review): as with the keyword column, astype('str') appears to turn missing
# locations into the literal token 'nan' (rows without a location score 1.0 in the
# output below) -- confirm this is intended.
tf_idf_array, vocabulary = tf_idf_encoder(train['location'].astype('str'))

In [114]:
column_values = calculate_values(train['location'].astype('str'), tf_idf_array, vocabulary)

In [115]:
# Attach the per-row location TF-IDF statistics to `train`, one new column per statistic.
location_stat_columns = {
    'location_sum': 'sum',
    'location_min': 'min',
    'location_max': 'max',
    'location_mean': 'mean',
    'location_median': 'median',
}
for new_column, stat_name in location_stat_columns.items():
    train[new_column] = list(column_values[stat_name])

In [116]:
len_location = calculate_len(train['location'])

pos_col_location = train.columns.get_loc('location')+1
train.insert(loc=pos_col_location, column='len_location', value=len_location)

In [117]:
train.head(10)

Unnamed: 0,keyword,len_keyword,location,len_location,text_con_stemming,text_sin_stemming,target,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,location_min,location_max,location_mean,location_median
0,,0,,0,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,,0,,0,forest fire near la rong sask canada,forest fire near la ronge sask canada,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,,0,,0,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,,0,,0,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,,0,,0,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,,0,,0,rockyfir updat california hwi number close dir...,rockyfire update california hwy number clo...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,,0,,0,flood disast heavi rain caus flash flood stree...,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,,0,,0,top hil see fire wood,i am on top of the hil and i can see a fire in...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,,0,,0,emerg evacu happen build across street,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,,0,,0,afraid tornado come area,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## text

#### tf-idf

In [118]:
tf_idf_array, vocabulary = tf_idf_encoder(train['text_con_stemming'])

In [119]:
column_values = calculate_values(train['text_con_stemming'], tf_idf_array, vocabulary)

In [120]:
# Attach the per-row text TF-IDF statistics to `train`, one new column per statistic.
text_tfidf_stat_columns = {
    'text_sum_tf-idf': 'sum',
    'text_min_tf-idf': 'min',
    'text_max_tf-idf': 'max',
    'text_mean_tf-idf': 'mean',
    'text_median_tf-idf': 'median',
}
for new_column, stat_name in text_tfidf_stat_columns.items():
    train[new_column] = list(column_values[stat_name])

In [121]:
train.head(10)

Unnamed: 0,keyword,len_keyword,location,len_location,text_con_stemming,text_sin_stemming,target,keyword_sum,keyword_min,keyword_max,...,location_sum,location_min,location_max,location_mean,location_median,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf
0,,0,,0,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.745943,0.224894,0.474996,0.343243,0.323476
1,,0,,0,forest fire near la rong sask canada,forest fire near la ronge sask canada,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.545323,0.21165,0.508967,0.363618,0.340166
2,,0,,0,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.92501,0.153758,0.585124,0.327084,0.256037
3,,0,,0,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.515355,0.150085,0.574226,0.359336,0.346962
4,,0,,0,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.937988,0.247769,0.444024,0.326443,0.287533
5,,0,,0,rockyfir updat california hwi number close dir...,rockyfire update california hwy number clo...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.47217,0.094812,0.392155,0.26709,0.268381
6,,0,,0,flood disast heavi rain caus flash flood stree...,flood disaster heavy rain causes flash floodin...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.652263,0.211903,0.421501,0.304355,0.292368
7,,0,,0,top hil see fire wood,i am on top of the hil and i can see a fire in...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.1694,0.288043,0.580492,0.43388,0.410864
8,,0,,0,emerg evacu happen build across street,there is an emergency evacuation happening now...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.425554,0.344773,0.491582,0.404259,0.389401
9,,0,,0,afraid tornado come area,i am afraid that the tornado is coming to our ...,1,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.966376,0.391599,0.639305,0.491594,0.467736


#### word2vec

In [122]:
column_values = calculate_values_w2v(train['text_sin_stemming'])

In [123]:
# Attach the per-row word2vec similarity statistics to `train`, one new column per statistic.
text_w2v_stat_columns = {
    'text_sum_w2v': 'sum',
    'text_min_w2v': 'min',
    'text_max_w2v': 'max',
    'text_mean_w2v': 'mean',
    'text_median_w2v': 'median',
}
for new_column, stat_name in text_w2v_stat_columns.items():
    train[new_column] = list(column_values[stat_name])

In [124]:
#train.drop(['keyword', 'location', 'text_sin_stemming', 'text_con_stemming'], axis=1, inplace=True)
train

Unnamed: 0,keyword,len_keyword,location,len_location,text_con_stemming,text_sin_stemming,target,keyword_sum,keyword_min,keyword_max,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,,0,,0,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1,1.0,1.0,1.0,...,2.745943,0.224894,0.474996,0.343243,0.323476,8.481658,0.000850,0.615265,0.154212,0.121727
1,,0,,0,forest fire near la rong sask canada,forest fire near la ronge sask canada,1,1.0,1.0,1.0,...,2.545323,0.211650,0.508967,0.363618,0.340166,0.790154,-0.103024,0.308946,0.079015,0.055061
2,,0,,0,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1,1.0,1.0,1.0,...,3.925010,0.153758,0.585124,0.327084,0.256037,29.265612,-0.044882,1.000000,0.139360,0.095317
3,,0,,0,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1,1.0,1.0,1.0,...,2.515355,0.150085,0.574226,0.359336,0.346962,2.733583,-0.039545,0.317247,0.097628,0.082558
4,,0,,0,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1,1.0,1.0,1.0,...,2.937988,0.247769,0.444024,0.326443,0.287533,13.153737,-0.041156,1.000000,0.125274,0.086994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,,0,,0,two giant crane hold bridg colaps nearbi home ...,two giant cranes holding a bridge colapse into...,1,1.0,1.0,1.0,...,2.884556,0.093327,0.417117,0.320506,0.352132,3.762588,-0.026625,0.312744,0.104516,0.090719
7609,,0,,0,ariahrari thetawniest control wild fire califo...,ariahrary thetawniest the out of control wild ...,1,1.0,1.0,1.0,...,3.228988,0.190353,0.425825,0.293544,0.279435,25.374036,-0.006511,1.000000,0.211450,0.158652
7610,,0,,0,number number number number utc number km volc...,m number number number number utc numbe...,1,1.0,1.0,1.0,...,4.771369,0.092557,0.627037,0.477137,0.527716,13.296097,-0.082743,1.000000,0.201456,0.046951
7611,,0,,0,polic investig e bike colid car littl portug e...,police investigating after an e bike colided w...,1,1.0,1.0,1.0,...,4.510709,0.157250,0.529638,0.265336,0.228318,22.747623,-0.091922,1.000000,0.119724,0.085732


In [125]:
# Reorder the columns so that 'target' becomes the last one.
cols = train.columns.tolist()
cols.append(cols.pop(cols.index('target')))
train = train[cols]

### Guardado del dataframe w2v

In [126]:
#train.to_csv('train/text_encoded_w2v.csv', index=False)

## len_text

In [127]:
# Length of the stemmed text as computed by calculate_len, inserted as 'len_text'
# immediately after the 'text_con_stemming' column.
len_text = calculate_len(train['text_con_stemming'])

pos_col_text = train.columns.get_loc('text_con_stemming')+1
train.insert(loc=pos_col_text, column='len_text', value=len_text)
train.head(10)

Unnamed: 0,keyword,len_keyword,location,len_location,text_con_stemming,len_text,text_sin_stemming,keyword_sum,keyword_min,keyword_max,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,,0,,0,deed reason earthquak may alah forgiv us al,43,our deeds are the reason of this earthquake ma...,1.0,1.0,1.0,...,0.224894,0.474996,0.343243,0.323476,8.481658,0.00085,0.615265,0.154212,0.121727,1
1,,0,,0,forest fire near la rong sask canada,36,forest fire near la ronge sask canada,1.0,1.0,1.0,...,0.21165,0.508967,0.363618,0.340166,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,,0,,0,al resid ask shelter place notifi offic evacu ...,72,al residents asked to shelter in place are bei...,1.0,1.0,1.0,...,0.153758,0.585124,0.327084,0.256037,29.265612,-0.044882,1.0,0.13936,0.095317,1
3,,0,,0,number peopl receiv wildfir evacu order califo...,50,number people receive wildfires evacuation o...,1.0,1.0,1.0,...,0.150085,0.574226,0.359336,0.346962,2.733583,-0.039545,0.317247,0.097628,0.082558,1
4,,0,,0,got sent photo rubi alaska smoke wildfir pour ...,52,just got sent this photo from ruby alaska as s...,1.0,1.0,1.0,...,0.247769,0.444024,0.326443,0.287533,13.153737,-0.041156,1.0,0.125274,0.086994,1
5,,0,,0,rockyfir updat california hwi number close dir...,84,rockyfire update california hwy number clo...,1.0,1.0,1.0,...,0.094812,0.392155,0.26709,0.268381,7.810325,-0.093557,0.478663,0.100132,0.080587,1
6,,0,,0,flood disast heavi rain caus flash flood stree...,74,flood disaster heavy rain causes flash floodin...,1.0,1.0,1.0,...,0.211903,0.421501,0.304355,0.292368,9.341228,-0.047981,0.788929,0.141534,0.109943,1
7,,0,,0,top hil see fire wood,21,i am on top of the hil and i can see a fire in...,1.0,1.0,1.0,...,0.288043,0.580492,0.43388,0.410864,11.859967,-0.143466,1.0,0.152051,0.112546,1
8,,0,,0,emerg evacu happen build across street,38,there is an emergency evacuation happening now...,1.0,1.0,1.0,...,0.344773,0.491582,0.404259,0.389401,15.609978,-0.033696,1.0,0.200128,0.158737,1
9,,0,,0,afraid tornado come area,24,i am afraid that the tornado is coming to our ...,1.0,1.0,1.0,...,0.391599,0.639305,0.491594,0.467736,7.238668,0.004347,0.526257,0.160859,0.136866,1


## len_text_original

In [128]:
train_original = pd.read_csv('train/train_original.csv')
train_original.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [129]:
len_text = calculate_len(train_original['text'])

pos_col_text = train_original.columns.get_loc('text')+1
train_original.insert(loc=pos_col_text, column='len_text_original', value=len_text)
train_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,69,1
1,4,,,Forest fire near La Ronge Sask. Canada,38,1
2,5,,,All residents asked to 'shelter in place' are ...,133,1
3,6,,,"13,000 people receive #wildfires evacuation or...",65,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,88,1
5,8,,,#RockyFire Update => California Hwy. 20 closed...,110,1
6,10,,,#flood #disaster Heavy rain causes flash flood...,95,1
7,13,,,I'm on top of the hill and I can see a fire in...,59,1
8,14,,,There's an emergency evacuation happening now ...,79,1
9,15,,,I'm afraid that the tornado is coming to our a...,52,1


## Juntamos todo

In [130]:
# Keep only the engineered features: drop the raw text columns and put 'target' last.
cols = train.columns.tolist()
for unwanted_column in ('keyword', 'location', 'text_con_stemming', 'text_sin_stemming', 'target'):
    cols.remove(unwanted_column)
cols.append('target')
train = train[cols]

In [131]:
train

Unnamed: 0,len_keyword,len_location,len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,location_min,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,0,0,43,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.224894,0.474996,0.343243,0.323476,8.481658,0.000850,0.615265,0.154212,0.121727,1
1,0,0,36,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.211650,0.508967,0.363618,0.340166,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,0,0,72,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.153758,0.585124,0.327084,0.256037,29.265612,-0.044882,1.000000,0.139360,0.095317,1
3,0,0,50,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.150085,0.574226,0.359336,0.346962,2.733583,-0.039545,0.317247,0.097628,0.082558,1
4,0,0,52,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.247769,0.444024,0.326443,0.287533,13.153737,-0.041156,1.000000,0.125274,0.086994,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.093327,0.417117,0.320506,0.352132,3.762588,-0.026625,0.312744,0.104516,0.090719,1
7609,0,0,82,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.190353,0.425825,0.293544,0.279435,25.374036,-0.006511,1.000000,0.211450,0.158652,1
7610,0,0,60,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.092557,0.627037,0.477137,0.527716,13.296097,-0.082743,1.000000,0.201456,0.046951,1
7611,0,0,96,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.157250,0.529638,0.265336,0.228318,22.747623,-0.091922,1.000000,0.119724,0.085732,1


In [132]:
# Build the final training frame and place the length of the original text right
# after 'len_text'.
# NOTE(review): the inserted value is a Series taken from `train_original`; presumably
# both frames share the same row index so the values line up -- confirm.
# `pos_col_text` is reused by the following cell to position 'diff_len_text'.
train_encoded = train[cols]

pos_col_text = train_encoded.columns.get_loc('len_text')+1
train_encoded.insert(loc=pos_col_text, column='len_text_original', value=train_original['len_text_original'])

In [133]:
train_encoded

Unnamed: 0,len_keyword,len_location,len_text,len_text_original,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,location_sum,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,0,0,43,69,1.0,1.0,1.0,1.0,1.0,1.0,...,0.224894,0.474996,0.343243,0.323476,8.481658,0.000850,0.615265,0.154212,0.121727,1
1,0,0,36,38,1.0,1.0,1.0,1.0,1.0,1.0,...,0.211650,0.508967,0.363618,0.340166,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,0,0,72,133,1.0,1.0,1.0,1.0,1.0,1.0,...,0.153758,0.585124,0.327084,0.256037,29.265612,-0.044882,1.000000,0.139360,0.095317,1
3,0,0,50,65,1.0,1.0,1.0,1.0,1.0,1.0,...,0.150085,0.574226,0.359336,0.346962,2.733583,-0.039545,0.317247,0.097628,0.082558,1
4,0,0,52,88,1.0,1.0,1.0,1.0,1.0,1.0,...,0.247769,0.444024,0.326443,0.287533,13.153737,-0.041156,1.000000,0.125274,0.086994,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,1.0,1.0,1.0,1.0,1.0,1.0,...,0.093327,0.417117,0.320506,0.352132,3.762588,-0.026625,0.312744,0.104516,0.090719,1
7609,0,0,82,125,1.0,1.0,1.0,1.0,1.0,1.0,...,0.190353,0.425825,0.293544,0.279435,25.374036,-0.006511,1.000000,0.211450,0.158652,1
7610,0,0,60,65,1.0,1.0,1.0,1.0,1.0,1.0,...,0.092557,0.627037,0.477137,0.527716,13.296097,-0.082743,1.000000,0.201456,0.046951,1
7611,0,0,96,137,1.0,1.0,1.0,1.0,1.0,1.0,...,0.157250,0.529638,0.265336,0.228318,22.747623,-0.091922,1.000000,0.119724,0.085732,1


In [134]:
# 'diff_len_text' = length of the original text minus length of the stemmed text.
# Relies on `pos_col_text` from the previous cell (the position right after 'len_text'),
# so the new column lands right after 'len_text_original'.
train_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=train_encoded['len_text_original']-train_encoded['len_text'])

## Resultado

In [135]:
train_encoded

Unnamed: 0,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,keyword_median,...,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v,target
0,0,0,43,69,26,1.0,1.0,1.0,1.0,1.0,...,0.224894,0.474996,0.343243,0.323476,8.481658,0.000850,0.615265,0.154212,0.121727,1
1,0,0,36,38,2,1.0,1.0,1.0,1.0,1.0,...,0.211650,0.508967,0.363618,0.340166,0.790154,-0.103024,0.308946,0.079015,0.055061,1
2,0,0,72,133,61,1.0,1.0,1.0,1.0,1.0,...,0.153758,0.585124,0.327084,0.256037,29.265612,-0.044882,1.000000,0.139360,0.095317,1
3,0,0,50,65,15,1.0,1.0,1.0,1.0,1.0,...,0.150085,0.574226,0.359336,0.346962,2.733583,-0.039545,0.317247,0.097628,0.082558,1
4,0,0,52,88,36,1.0,1.0,1.0,1.0,1.0,...,0.247769,0.444024,0.326443,0.287533,13.153737,-0.041156,1.000000,0.125274,0.086994,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0,0,50,83,33,1.0,1.0,1.0,1.0,1.0,...,0.093327,0.417117,0.320506,0.352132,3.762588,-0.026625,0.312744,0.104516,0.090719,1
7609,0,0,82,125,43,1.0,1.0,1.0,1.0,1.0,...,0.190353,0.425825,0.293544,0.279435,25.374036,-0.006511,1.000000,0.211450,0.158652,1
7610,0,0,60,65,5,1.0,1.0,1.0,1.0,1.0,...,0.092557,0.627037,0.477137,0.527716,13.296097,-0.082743,1.000000,0.201456,0.046951,1
7611,0,0,96,137,41,1.0,1.0,1.0,1.0,1.0,...,0.157250,0.529638,0.265336,0.228318,22.747623,-0.091922,1.000000,0.119724,0.085732,1


## Guardado del dataframe

In [136]:
train_encoded.to_csv('train/train_encoded.csv', index=False)

# Test

In [152]:
test = pd.read_csv('test/test_limpio.csv')

## keyword

In [153]:
# NOTE(review): tf_idf_encoder creates and fits a new vectorizer on every call, so the
# keyword TF-IDF here is fitted on the test data itself rather than reusing the fit
# obtained on train -- confirm that train and test features are meant to be computed
# independently.
tf_idf_array, vocabulary = tf_idf_encoder(test['keyword'].astype('str'))

In [154]:
column_values = calculate_values(test['keyword'].astype('str'), tf_idf_array, vocabulary)

In [155]:
# Attach the per-row keyword TF-IDF statistics to `test`, one new column per statistic.
keyword_stat_columns = {
    'keyword_sum': 'sum',
    'keyword_min': 'min',
    'keyword_max': 'max',
    'keyword_mean': 'mean',
    'keyword_median': 'median',
}
for new_column, stat_name in keyword_stat_columns.items():
    test[new_column] = list(column_values[stat_name])

In [156]:
len_keyword = calculate_len(test['keyword'])

pos_col_keyword = test.columns.get_loc('keyword')+1
test.insert(loc=pos_col_keyword, column='len_keyword', value=len_keyword)

## location

In [157]:
tf_idf_array, vocabulary = tf_idf_encoder(test['location'].astype('str'))

In [158]:
column_values = calculate_values(test['location'].astype('str'), tf_idf_array, vocabulary)

In [159]:
# Attach the per-row location TF-IDF statistics to `test`, one new column per statistic.
location_stat_columns = {
    'location_sum': 'sum',
    'location_min': 'min',
    'location_max': 'max',
    'location_mean': 'mean',
    'location_median': 'median',
}
for new_column, stat_name in location_stat_columns.items():
    test[new_column] = list(column_values[stat_name])

In [160]:
len_location = calculate_len(test['location'])

pos_col_location = test.columns.get_loc('location')+1
test.insert(loc=pos_col_location, column='len_location', value=len_location)

## text

#### tf-idf

In [164]:
# Force both text columns to str before vectorizing them.
# NOTE(review): the train section passes these columns without this cast; presumably
# the test file contains missing texts, which would become the literal string 'nan'
# here -- confirm.
test['text_con_stemming'] = test['text_con_stemming'].astype('str')
test['text_sin_stemming'] = test['text_sin_stemming'].astype('str')

In [165]:
# NOTE(review): tf_idf_encoder creates and fits a new vectorizer on every call, so the
# text TF-IDF here is fitted on the test data itself rather than reusing the fit
# obtained on train -- confirm that this is intended.
tf_idf_array, vocabulary = tf_idf_encoder(test['text_con_stemming'])

In [166]:
column_values = calculate_values(test['text_con_stemming'], tf_idf_array, vocabulary)

In [167]:
# Attach the per-row text TF-IDF statistics to `test`, one new column per statistic.
text_tfidf_stat_columns = {
    'text_sum_tf-idf': 'sum',
    'text_min_tf-idf': 'min',
    'text_max_tf-idf': 'max',
    'text_mean_tf-idf': 'mean',
    'text_median_tf-idf': 'median',
}
for new_column, stat_name in text_tfidf_stat_columns.items():
    test[new_column] = list(column_values[stat_name])

In [168]:
test

Unnamed: 0,id,keyword,len_keyword,location,len_location,text_con_stemming,text_sin_stemming,keyword_sum,keyword_min,keyword_max,...,location_sum,location_min,location_max,location_mean,location_median,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf
0,0,,0,,0,happen teribl car crash,just happened a terible car crash,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.979137,0.424578,0.615103,0.494784,0.469728
1,2,,0,,0,heard earthquak differ citi stay safe everyon,heard about earthquake is different cities sta...,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.638767,0.342961,0.426320,0.376967,0.369029
2,3,,0,,0,forest fire spot pond gees flee across street ...,there is a forest fire at spot pond geese are ...,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,3.066710,0.187362,0.422860,0.306671,0.307269
3,9,,0,,0,apocalyps light spokan wildfir,apocalypse lighting spokane wildfires,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.958828,0.407703,0.662541,0.489707,0.444292
4,11,,0,,0,typhoon soudelor kil number china taiwan,typhoon soudelor kils number in china and ta...,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.346990,0.151011,0.524219,0.391165,0.416631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,0,,0,earthquak safeti lo angel safeti fasten xrwn,earthquake safety los angeles safety fastene...,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.968900,0.271113,0.595414,0.424129,0.404374
3259,10865,,0,,0,storm ri wors last hurican cityand number othe...,storm in ri worse than last huricane my citya...,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,4.407768,0.140259,0.321931,0.220388,0.214111
3260,10868,,0,,0,green line derail chicago link,green line derailment in chicago link,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.101613,0.127398,0.546642,0.420323,0.475199
3261,10874,,0,,0,meg issu hazard weather outlook hwo link,meg issues hazardous weather outlook hwo link,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.501670,0.093384,0.479922,0.357381,0.351255


#### word2vec

In [169]:
column_values = calculate_values_w2v(test['text_sin_stemming'])

In [170]:
# Attach the per-row word2vec similarity statistics to `test`, one new column per statistic.
text_w2v_stat_columns = {
    'text_sum_w2v': 'sum',
    'text_min_w2v': 'min',
    'text_max_w2v': 'max',
    'text_mean_w2v': 'mean',
    'text_median_w2v': 'median',
}
for new_column, stat_name in text_w2v_stat_columns.items():
    test[new_column] = list(column_values[stat_name])

## len_text

In [171]:
len_text = calculate_len(test['text_con_stemming'])

pos_col_text = test.columns.get_loc('text_con_stemming')+1
test.insert(loc=pos_col_text, column='len_text', value=len_text)
test.head(10)

Unnamed: 0,id,keyword,len_keyword,location,len_location,text_con_stemming,len_text,text_sin_stemming,keyword_sum,keyword_min,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,,0,,0,happen teribl car crash,23,just happened a terible car crash,1.0,1.0,...,1.979137,0.424578,0.615103,0.494784,0.469728,2.338787,0.096782,0.416246,0.233879,0.217761
1,2,,0,,0,heard earthquak differ citi stay safe everyon,45,heard about earthquake is different cities sta...,1.0,1.0,...,2.638767,0.342961,0.42632,0.376967,0.369029,4.601476,0.020386,0.308944,0.127819,0.10381
2,3,,0,,0,forest fire spot pond gees flee across street ...,53,there is a forest fire at spot pond geese are ...,1.0,1.0,...,3.06671,0.187362,0.42286,0.306671,0.307269,16.283096,-0.060702,0.536185,0.106425,0.095932
3,9,,0,,0,apocalyps light spokan wildfir,30,apocalypse lighting spokane wildfires,1.0,1.0,...,1.958828,0.407703,0.662541,0.489707,0.444292,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,,0,,0,typhoon soudelor kil number china taiwan,40,typhoon soudelor kils number in china and ta...,1.0,1.0,...,2.34699,0.151011,0.524219,0.391165,0.416631,1.067205,-0.01885,0.283668,0.071147,0.041079
5,12,,0,,0,shake earthquak,15,we are shaking it is an earthquake,1.0,1.0,...,1.387471,0.556874,0.830597,0.693736,0.693736,3.261046,-0.091945,0.536185,0.155288,0.093425
6,21,,0,,0,would probabl stil show life arsen yesterday e...,50,they would probably stil show more life than a...,1.0,1.0,...,3.254028,0.201604,0.700242,0.361559,0.282634,14.45337,-0.038854,1.0,0.1853,0.137811
7,22,,0,,0,hey,3,hey how are you,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.54229,0.097143,0.555028,0.257048,0.215696
8,27,,0,,0,nice hat,8,what a nice hat,1.0,1.0,...,1.413128,0.678867,0.734261,0.706564,0.706564,0.549909,0.086593,0.2738,0.183303,0.189516
9,29,,0,,0,fuck,4,fuck off,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.164787,0.164787,0.164787,0.164787,0.164787


## len_text_original

In [172]:
test_original = pd.read_csv('test/test_original.csv')
test_original.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


## Juntamos todo

In [173]:
# Work on an independent copy: a plain assignment would only alias `test`, so the
# columns inserted into / dropped from `test_encoded` in later cells would also
# change `test`.
test_encoded = test.copy()

# Length of the original text, inserted as 'len_text_original' right after the
# 'text' column of `test_original`.
len_text = calculate_len(test_original['text'])

pos_col_text = test_original.columns.get_loc('text')+1
test_original.insert(loc=pos_col_text, column='len_text_original', value=len_text)
test_original.head(10)

Unnamed: 0,id,keyword,location,text,len_text_original
0,0,,,Just happened a terrible car crash,34
1,2,,,"Heard about #earthquake is different cities, s...",64
2,3,,,"there is a forest fire at spot pond, geese are...",96
3,9,,,Apocalypse lighting. #Spokane #wildfires,40
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45
5,12,,,We're shaking...It's an earthquake,34
6,21,,,They'd probably still show more life than Arse...,72
7,22,,,Hey! How are you?,17
8,27,,,What a nice hat?,16
9,29,,,Fuck off!,9


In [174]:
pos_col_text = test_encoded.columns.get_loc('len_text')+1
test_encoded.insert(loc=pos_col_text, column='len_text_original', value=test_original['len_text_original'])

In [175]:
test_encoded

Unnamed: 0,id,keyword,len_keyword,location,len_location,text_con_stemming,len_text,len_text_original,text_sin_stemming,keyword_sum,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,,0,,0,happen teribl car crash,23,34,just happened a terible car crash,1.0,...,1.979137,0.424578,0.615103,0.494784,0.469728,2.338787,0.096782,0.416246,0.233879,0.217761
1,2,,0,,0,heard earthquak differ citi stay safe everyon,45,64,heard about earthquake is different cities sta...,1.0,...,2.638767,0.342961,0.426320,0.376967,0.369029,4.601476,0.020386,0.308944,0.127819,0.103810
2,3,,0,,0,forest fire spot pond gees flee across street ...,53,96,there is a forest fire at spot pond geese are ...,1.0,...,3.066710,0.187362,0.422860,0.306671,0.307269,16.283096,-0.060702,0.536185,0.106425,0.095932
3,9,,0,,0,apocalyps light spokan wildfir,30,40,apocalypse lighting spokane wildfires,1.0,...,1.958828,0.407703,0.662541,0.489707,0.444292,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,,0,,0,typhoon soudelor kil number china taiwan,40,45,typhoon soudelor kils number in china and ta...,1.0,...,2.346990,0.151011,0.524219,0.391165,0.416631,1.067205,-0.018850,0.283668,0.071147,0.041079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,,0,,0,earthquak safeti lo angel safeti fasten xrwn,44,55,earthquake safety los angeles safety fastene...,1.0,...,2.968900,0.271113,0.595414,0.424129,0.404374,2.451140,-0.074862,1.000000,0.163409,0.117514
3259,10865,,0,,0,storm ri wors last hurican cityand number othe...,114,139,storm in ri worse than last huricane my citya...,1.0,...,4.407768,0.140259,0.321931,0.220388,0.214111,35.725471,-0.054262,1.000000,0.129440,0.112207
3260,10868,,0,,0,green line derail chicago link,30,55,green line derailment in chicago link,1.0,...,2.101613,0.127398,0.546642,0.420323,0.475199,1.538411,0.009149,0.244232,0.102561,0.098677
3261,10874,,0,,0,meg issu hazard weather outlook hwo link,40,65,meg issues hazardous weather outlook hwo link,1.0,...,2.501670,0.093384,0.479922,0.357381,0.351255,1.739130,-0.046756,0.261727,0.082816,0.077447


In [176]:
test_encoded.insert(loc=pos_col_text+1, column='diff_len_text',\
                     value=test_encoded['len_text_original']-test_encoded['len_text'])

In [177]:
#test_encoded.fillna(0, inplace=True)

In [179]:
test_encoded.drop(['keyword', 'location', 'text_con_stemming', 'text_sin_stemming'], axis=1, inplace=True)

## Resultado

In [180]:
test_encoded

Unnamed: 0,id,len_keyword,len_location,len_text,len_text_original,diff_len_text,keyword_sum,keyword_min,keyword_max,keyword_mean,...,text_sum_tf-idf,text_min_tf-idf,text_max_tf-idf,text_mean_tf-idf,text_median_tf-idf,text_sum_w2v,text_min_w2v,text_max_w2v,text_mean_w2v,text_median_w2v
0,0,0,0,23,34,11,1.0,1.0,1.0,1.0,...,1.979137,0.424578,0.615103,0.494784,0.469728,2.338787,0.096782,0.416246,0.233879,0.217761
1,2,0,0,45,64,19,1.0,1.0,1.0,1.0,...,2.638767,0.342961,0.426320,0.376967,0.369029,4.601476,0.020386,0.308944,0.127819,0.103810
2,3,0,0,53,96,43,1.0,1.0,1.0,1.0,...,3.066710,0.187362,0.422860,0.306671,0.307269,16.283096,-0.060702,0.536185,0.106425,0.095932
3,9,0,0,30,40,10,1.0,1.0,1.0,1.0,...,1.958828,0.407703,0.662541,0.489707,0.444292,0.390538,0.065934,0.175728,0.130179,0.148877
4,11,0,0,40,45,5,1.0,1.0,1.0,1.0,...,2.346990,0.151011,0.524219,0.391165,0.416631,1.067205,-0.018850,0.283668,0.071147,0.041079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,10861,0,0,44,55,11,1.0,1.0,1.0,1.0,...,2.968900,0.271113,0.595414,0.424129,0.404374,2.451140,-0.074862,1.000000,0.163409,0.117514
3259,10865,0,0,114,139,25,1.0,1.0,1.0,1.0,...,4.407768,0.140259,0.321931,0.220388,0.214111,35.725471,-0.054262,1.000000,0.129440,0.112207
3260,10868,0,0,30,55,25,1.0,1.0,1.0,1.0,...,2.101613,0.127398,0.546642,0.420323,0.475199,1.538411,0.009149,0.244232,0.102561,0.098677
3261,10874,0,0,40,65,25,1.0,1.0,1.0,1.0,...,2.501670,0.093384,0.479922,0.357381,0.351255,1.739130,-0.046756,0.261727,0.082816,0.077447


## Guardado del dataframe

In [181]:
test_encoded.to_csv('test/test_encoded.csv', index=False)