In [30]:
import numpy as np
import pandas as pd
import string
#import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
nltk.download('stopwords')
stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')

from textblob import TextBlob
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
pd.set_option('mode.chained_assignment', None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Datasets

In [31]:
tweets = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
tweets['keyword'] = tweets.keyword.str.replace('%20',' ')
test['keyword'] = test.keyword.str.replace('%20',' ')

In [32]:
tweets.drop_duplicates(subset = 'text', keep = False, inplace = True)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7434 non-null   int64 
 1   keyword   7378 non-null   object
 2   location  4982 non-null   object
 3   text      7434 non-null   object
 4   target    7434 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 348.5+ KB


In [33]:
test['text'].duplicated().value_counts()

False    3243
True       20
Name: text, dtype: int64

### Extra Features

In [34]:
sia = SentimentIntensityAnalyzer()
def return_sia_compound_values(text):
    return sia.polarity_scores(text)['compound']

def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)

def contains_punctuation(text):
    punctuation = set(string.punctuation)
    for character in text:
        if character in punctuation:
            return True
    return False

def amount_of_punctuation(text):
    punctuation = set(string.punctuation)
    amount = 0
    for character in text:
        if character in punctuation: amount += 1
    return amount

def get_adjectives(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("JJ")])

def get_nouns(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("NN")])

def get_verbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("VB")])

def get_adverbs(text):
    blob = TextBlob(text)
    return len([word for (word,tag) in blob.tags if tag.startswith("RB")])

In [35]:
def get_metrics(df):
    tweets_metrics = df[['id','text']]
    tweets_metrics['text_without_stopwords'] = tweets_metrics['text'].str.split()
    tweets_metrics['text_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(remove_stopword)
    tweets_metrics['length'] = tweets_metrics['text'].apply(lambda x: len(x))
    tweets_metrics['avg_word_length'] = tweets_metrics['text'].str.split().apply(lambda x: [len(y) for y in x]).transform(lambda x: np.mean(x))
    tweets_metrics['amount_of_words'] = tweets_metrics['text'].str.split().transform(lambda x: len(x))
    unique_words_by_tweet = tweets_metrics['text'].transform(lambda x: x.split()).transform(lambda x: pd.Series(x).unique()).transform(lambda x: len(x))
    tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet
    tweets_metrics['sentiment'] = tweets_metrics['text'].apply(lambda x: return_sia_compound_values(x))
    tweets_metrics['stopwords_count'] = tweets_metrics['text'].apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
    tweets_metrics['punctuation_count'] = tweets_metrics['text'].apply(lambda x: amount_of_punctuation(x))
    mentions = tweets_metrics['text'].str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
    tweets_metrics['mentions_count'] = mentions['text'].apply(lambda x: len(x))
    hashtags = tweets_metrics['text'].str.findall(r'#[^?\s].*?(?=\s|$)')
    tweets_metrics['hashtags_count'] = hashtags.apply(lambda x: len(x))
    tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(lambda x: ([len(word) for word in str(x).lower().split() if not word.startswith('http')])).apply(lambda x: max(x) if len(x) > 0 else 0)
    tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']
    tweets_metrics['adjectives_count'] = tweets_metrics['text'].apply(get_adjectives)
    tweets_metrics['nouns_count'] = tweets_metrics['text'].apply(get_nouns)
    tweets_metrics['verbs_count'] = tweets_metrics['text'].apply(get_verbs)
    tweets_metrics['adverbs_count'] = tweets_metrics['text'].apply(get_adverbs)
    return tweets_metrics

In [36]:
train_metrics = get_metrics(tweets)

In [37]:
test_metrics = get_metrics(test)

In [38]:
# Run for files
#train_metrics.to_csv('train_features.csv', index=False)
#test_metrics.to_csv('test_features.csv', index=False)

### Text preprocessing

In [39]:
# Mocks real tokenizer used in glove
import re

def tokenize_input(input_text):
    to_tokens = input_text[:]
    token_specification = [
        ('url', r'https?:\/\/\S+\b|www\.(\w+\.)+\S*'),
        (' / ', r'/'),
        ('user', r'@\w+'),            
        ('smile', r'[8:=;][)d]+|[)d]+[\'`\-]?[8:=;]'),    
        ('lolface', r'[8:=;][\'`\-]?p'),      
        ('sadface', r'[8:=;][\'`\-]?\(|\)+[8:=;][\'`\-]?'),          
        ('neutralface', r'[8:=;][\'`\-]?[\/|l*]'),       
        ('heart', r'<3'),   
        ('number', r'[-+]?[.\d]*[\d]+[:,.\d]*')
    ]
    for replacement, regex in token_specification:
        to_tokens = re.sub(regex, replacement, to_tokens)
    return to_tokens

In [40]:
# Check
tokenize_input('https://regexr.com hola / :) <3 :p :(  8888 @justin')

'url hola  /  smile heart lolface sadface  number user'

In [41]:
import string

tweets['text'] = tweets['text'].apply(tokenize_input)
test['text'] = test['text'].apply(tokenize_input)
tweets['text'] = tweets['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
test['text'] = test['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
tweets['text'] = tweets['text'].apply(lambda x: x.lower())
test['text'] = test['text'].apply(lambda x: x.lower())

In [42]:
# Run for files
#tweets.to_csv('processed_train.csv', index=False)
#test.to_csv('processed_test.csv', index=False)

### Vector encoding

#### Keywords

In [43]:
tweets['keyword'] = tweets['keyword'].fillna('null')
test['keyword'] = test['keyword'].fillna('null')

In [44]:
keyword_tokens = tweets.keyword.unique().tolist()
keyword_test = test.keyword.unique().tolist()

In [45]:
for k in keyword_test:
    if k not in keyword_tokens:
        print(k) # Mismas palabras en ambos sets

In [46]:
embeddings_index = dict()
f = open('Embeddings/glove.6B.100d.txt', encoding='utf8') # Vectores entrenados de 100 dimensiones
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [47]:
vectors = {}
for keyword in keyword_tokens:
    words = keyword.split(' ')
    n = len(words)
    if n == 1:
        vectors[keyword] = embeddings_index[keyword]
    else:
        acum = np.zeros(100)
        for w in words:
            acum = np.sum([acum,embeddings_index[w]] , axis=0)
        vectors[keyword] = acum

In [69]:
keyword_vectors = pd.DataFrame.from_dict(vectors).T.reset_index()
keyword_vectors.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,,0.079432,-0.14054,-0.10462,-0.36259,-0.22721,-0.13612,0.74755,0.32809,0.54364,...,-1.2123,0.51573,0.16573,0.67943,0.35327,0.17672,0.25803,0.068445,-1.2016,-0.20168
1,ablaze,0.13701,-0.31349,-0.047427,-0.24582,0.76459,1.2133,0.25674,0.44616,1.2104,...,1.4278,-0.40205,-0.26682,-0.029039,-1.1023,0.20442,-0.064528,0.30504,0.4283,0.54531
2,accident,-0.006329,-0.37913,0.40992,-0.003844,-0.81139,-0.6784,0.25995,1.0903,0.60039,...,0.49451,-0.3087,-0.1855,0.71409,0.19886,1.1276,-0.10096,-0.1,0.21349,-1.2453
3,aftershock,0.13692,1.0257,0.53961,0.27531,-0.91579,0.24287,0.77162,0.025242,0.47416,...,1.236,0.12651,-0.93994,0.18741,0.71254,0.79876,-0.040149,-0.59122,-0.28051,-0.23293
4,airplane accident,-0.175379,-0.10526,0.97786,-0.001856,-0.93397,-1.15404,0.7957,1.143708,0.94012,...,0.78136,-0.287935,0.23436,0.676289,0.262775,1.53821,-0.30071,-0.089367,0.77883,-2.21178


In [49]:
aux = []
aux.append('keyword')
for i in range (0, 100):
    name = 'k' + str(i)
    aux.append(name)

In [129]:
keyword_vectors.columns = aux
keyword_vectors.head()

Unnamed: 0,location,l0,l1,l2,l3,l4,l5,l6,l7,l8,...,l90,l91,l92,l93,l94,l95,l96,l97,l98,l99
0,,0.079432,-0.14054,-0.10462,-0.36259,-0.22721,-0.13612,0.74755,0.32809,0.54364,...,-1.2123,0.51573,0.16573,0.67943,0.35327,0.17672,0.25803,0.068445,-1.2016,-0.20168
1,ablaze,0.13701,-0.31349,-0.047427,-0.24582,0.76459,1.2133,0.25674,0.44616,1.2104,...,1.4278,-0.40205,-0.26682,-0.029039,-1.1023,0.20442,-0.064528,0.30504,0.4283,0.54531
2,accident,-0.006329,-0.37913,0.40992,-0.003844,-0.81139,-0.6784,0.25995,1.0903,0.60039,...,0.49451,-0.3087,-0.1855,0.71409,0.19886,1.1276,-0.10096,-0.1,0.21349,-1.2453
3,aftershock,0.13692,1.0257,0.53961,0.27531,-0.91579,0.24287,0.77162,0.025242,0.47416,...,1.236,0.12651,-0.93994,0.18741,0.71254,0.79876,-0.040149,-0.59122,-0.28051,-0.23293
4,airplane accident,-0.175379,-0.10526,0.97786,-0.001856,-0.93397,-1.15404,0.7957,1.143708,0.94012,...,0.78136,-0.287935,0.23436,0.676289,0.262775,1.53821,-0.30071,-0.089367,0.77883,-2.21178


In [None]:
keywords_to_merge = tweets.merge(keyword_vectors, how='left').drop(columns=['keyword', 'location', 'text', 'target'])
keywords_to_merge

In [None]:
keywords_merge_test = test.merge(keyword_vectors, how='left').drop(columns=['keyword', 'location', 'text'])
keywords_merge_test

In [54]:
# Run for files
#keywords_to_merge.to_csv('keyword_features.csv', index=False)
keywords_merge_test.to_csv('keyword_test_features.csv', index=False)

### Keywords one hot encoding

In [88]:
tweets

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7604,10863,,,#WorldNews Fallen powerlines on G:link tram: U...,1
7605,10864,,,on the flip side I'm at Walmart and there is a...,1
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1
7608,10869,,,Two giant cranes holding a bridge collapse int...,1


In [89]:
disaster_list = list(tweets['keyword'].unique())

In [90]:
crash = ['collide', 'collided', 'collision', 'crash', 'crashed', 'wreck', 'wreckage', 'wrecked']

emergency = ['emergency', 'emergency plan']

electricity = ['electrocute', 'electrocuted',]

helpers = ['ambulance', 'police', 'siren', 'sirens', 'emergency services', 'first responders',\
           'stretcher', 'eyewitness', 'rescuers']

panic = ['screamed', 'screaming', 'screams', 'panic', 'mayhem', 'riot', 'rioting', 'fear', 'panicking', 'trauma',\
         'trouble', 'hail', 'pandemonium']

hostages = ['hostage', 'hostages', 'trapped']

quarentine = ['quarantine', 'quarantined']

colapse = ['bridge collapse', 'collapse', 'collapsed', 'demolish', 'demolished', 'demolition', 'structural failure']

accident = ['accident', 'airplane accident', 'derail', 'derailed', 'derailment', 'oil spill']

fire = ['ablaze', 'hellfire', 'smoke', 'wild fires', 'wildfire', 'buildings burning',\
        'buildings on fire', 'burned', 'burning', 'burning buildings', 'bush fires', 'fire',\
        'fire truck', 'flames', 'forest fire', 'forest fires', 'blaze', 'blazing', 'arson', 'arsonist']

nuclear = ['nuclear disaster', 'nuclear reactor', 'radiation emergency', 'meltdown']

explotion = ['explode', 'exploded', 'explosion', 'blown up', 'blew up', 'loud bang']

survivor = ['survive', 'survived', 'rescue', 'rescued', 'survivors', 'evacuate', 'evacuated', 'evacuation', 'refugees']

wounded = ['wounded', 'wounds', 'bleeding', 'bloody', 'injured', 'injuries', 'injury', 'traumatised', 'blood']

bomb = ['suicide bomb', 'suicide bomber', 'suicide bombing', 'bomb', 'bombed', 'bombing', 'detonate', 'detonation']

storm = ['storm', 'thunderstorm', 'thunder', 'rainstorm', 'violent storm', 'windstorm', 'lightning', 'hailstorm']

water = ['flood', 'flooding', 'floods', 'inundated', 'inundation', 'sinking', 'drown', 'drowned', 'drowning', 'sunk']

natural_disaster = ['heat wave','sandstorm', 'seismic' ,'avalanche', 'tsunami', 'twister',\
                    'typhoon',  'tornado', 'hurricane', 'natural disaster', 'cyclone', 'volcano',\
                    'drought', 'dust storm', 'earthquake',  'lava', 'aftershock', 'snowstorm', 'blizzard',\
                    'whirlwind', 'upheaval',  'landslide', 'cliff fall', 'mudslide', 'sinkhole', 'displaced',\
                    'epicentre']

attack = ['attack', 'attacked']

casualties = ['mass murder', 'mass murderer', 'massacre', 'fatal', 'fatalities', 'fatality', 'casualties',\
              'casualty', 'body bag', 'body bagging', 'body bags', 'dead', 'death', 'deaths',  'tragedy']

terrorism = ['terrorism', 'terrorist', 'threat', 'hijack', 'hijacker', 'hijacking', 'bioterror', 'bioterrorism']

destruction = ['destroyed', 'destruction', 'devastated',\
               'devastation', 'disaster', 'annihilated', 'annihilation', 'apocalypse',\
               'armageddon', 'catastrophe', 'catastrophic', 'obliterate', 'obliterated',\
               'obliteration', 'damage', 'destroy', 'desolate', 'desolation', 'blight',\
               'harm', 'hazard', 'hazardous', 'danger', 'ruin', 'engulfed', 'rubble', 'debris',\
               'razed', 'flattened', 'crush', 'crushed']

warlike = ['war zone', 'weapon', 'weapons', 'military', 'army', 'battle', 'outbreak', 'chemical emergency', 'curfew']

starvation = ['famine', 'deluge', 'deluged']


null = ['null']

In [91]:
disasters = {'crash' : crash, 'emergency' : emergency, 'electricity': electricity,\
             'helpers': helpers, 'panic' : panic,  'hostages' : hostages, 'quarentine' : quarentine,\
             'colapse' : colapse, 'accident' : accident, 'fire' : fire,\
             'nuclear' : nuclear, 'explotion' : explotion, 'survivor' : survivor,\
             'wounded' : wounded, 'bomb' : bomb, 'storm' : storm,\
             'water' : water, 'natural_disaster' : natural_disaster, 'attack' : attack,\
             'casualties' : casualties, 'terrorism' : terrorism, 'destruction' : destruction,\
             'warlike' : warlike, 'starvation' : starvation, 'null' : null}

In [92]:
n_categories = len(disasters)
n_categories

25

In [94]:
numeric_maps = {}
count = 0
for k in disasters.keys():
    numeric_maps[k] = count
    count += 1

In [95]:
mapping = {}
for k, v in disasters.items():
    for w in v:
        mapping[w] = k

In [96]:
mapping_df = pd.DataFrame.from_dict(mapping, orient='index').reset_index()
mapping_df.columns = ['keyword', 'group']
numeric_maps_df = pd.DataFrame.from_dict(numeric_maps, orient='index').reset_index()
numeric_maps_df.columns = ['group', 'id']
to_encode = mapping_df.merge(numeric_maps_df)

In [97]:
to_encode

Unnamed: 0,keyword,group,id
0,collide,crash,0
1,collided,crash,0
2,collision,crash,0
3,crash,crash,0
4,crashed,crash,0
...,...,...,...
217,curfew,warlike,22
218,famine,starvation,23
219,deluge,starvation,23
220,deluged,starvation,23


In [98]:
from sklearn.preprocessing import LabelBinarizer

lb = preprocessing.LabelBinarizer()
lb.fit(range(0, n_categories))

LabelBinarizer()

In [99]:
lb.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24])

In [100]:
to_encode['encode'] = list(lb.transform(to_encode['id']))

In [101]:
expanded = to_encode.encode.apply(pd.Series)

In [102]:
result = pd.concat([to_encode, expanded], axis=1, sort=False)

In [103]:
result.drop(columns=['encode', 'id'], inplace=True)

In [104]:
result.head()

Unnamed: 0,keyword,group,0,1,2,3,4,5,6,7,...,15,16,17,18,19,20,21,22,23,24
0,collide,crash,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,collided,crash,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,collision,crash,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,crash,crash,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,crashed,crash,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
keyword_exp = tweets.loc[:, ['id', 'keyword']]
keyword_exp_test = test.loc[:, ['id', 'keyword']]

In [108]:
merged_train = keyword_exp.merge(result, left_on='keyword', right_on='keyword', how='left')
merged_test = keyword_exp_test.merge(result, left_on='keyword', right_on='keyword', how='left')

In [109]:
merged_train.drop(columns=['keyword', 'group'], inplace=True)
merged_test.drop(columns=['keyword', 'group'], inplace=True)

In [110]:
# Run for file
#merged_train.to_csv('keyword_mapping.csv', index=False)
#merged_test.to_csv('keyword_mapping_test.csv', index=False)

### Location Load

In [55]:
locations = pd.read_csv("../TP1/locations.csv", usecols=['location', 'address', 'point'])
locations.head()

Unnamed: 0,location,address,point
0,,,
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite...","(55.8609825, -4.2488787, 0.0)"
2,"melbourne, australia","City of Melbourne, Victoria, Australia","(-37.8142176, 144.9631608, 0.0)"
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V...","(49.04172215, -122.27255349013137, 0.0)"
4,alberta,"Alberta, Canada","(55.001251, -115.002136, 0.0)"


In [56]:
locations.fillna('null', inplace=True)
empty_loc = locations.loc[0, 'location']
locations.replace(empty_loc, 'null', inplace=True)
locations.head()

Unnamed: 0,location,address,point
0,,,
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite...","(55.8609825, -4.2488787, 0.0)"
2,"melbourne, australia","City of Melbourne, Victoria, Australia","(-37.8142176, 144.9631608, 0.0)"
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V...","(49.04172215, -122.27255349013137, 0.0)"
4,alberta,"Alberta, Canada","(55.001251, -115.002136, 0.0)"


### Coordinates X,Y

In [57]:
tweets['location'] = tweets['location'].fillna('null')
test['location'] = test['location'].fillna('null')

In [58]:
def point_to_list(point):
    if point == 'null':
        return [300.0, 300.0] # Arbitrary large number 
    
    coordinates = []
    aux = point[:]
    row = aux.strip( '()' ).split(',')
    coordinates.append(float(row[0]))
    coordinates.append(float(row[1]))
    return coordinates

In [59]:
locations['point'] = locations.point.apply(point_to_list)

In [60]:
aux = locations.point.apply(pd.Series)
aux.columns = ['x', 'y']

In [61]:
locations['x'] = aux['x']
locations['y'] = aux['y']

In [63]:
locations

Unnamed: 0,location,address,point,x,y
0,,,"[300.0, 300.0]",300.000000,300.000000
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite...","[55.8609825, -4.2488787]",55.860982,-4.248879
2,"melbourne, australia","City of Melbourne, Victoria, Australia","[-37.8142176, 144.9631608]",-37.814218,144.963161
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V...","[49.04172215, -122.27255349013137]",49.041722,-122.272553
4,alberta,"Alberta, Canada","[55.001251, -115.002136]",55.001251,-115.002136
...,...,...,...,...,...
2266,zac newsome loves me,,"[300.0, 300.0]",300.000000,300.000000
2267,"zeerust, south africa","Zeerust, Ngaka Modiri Molema District Municipa...","[-25.537731, 26.074382]",-25.537731,26.074382
2268,zero branco,"Zero Branco, Treviso, Veneto, 31059, Italia","[45.601701, 12.165212]",45.601701,12.165212
2269,ziam af,,"[300.0, 300.0]",300.000000,300.000000


In [64]:
tweets['location'] = tweets['location'].apply(str.lower)
test['location'] = test['location'].apply(str.lower)

In [65]:
coordinates_train = tweets.merge(locations.loc[:, ['location', 'x', 'y']], left_on='location', right_on='location', how='left').loc[:, ['id', 'x', 'y']]
coordinates_test = test.merge(locations.loc[:, ['location', 'x', 'y']], left_on='location', right_on='location', how='left').loc[:, ['id', 'x', 'y']]

In [101]:
# Run for file
coordinates_train.to_csv('coordinates_train.csv', index=False)
coordinates_test.to_csv('coordinates_test.csv', index=False)

### Coordinates, vectors from words.

In [102]:
locations.head()

Unnamed: 0,location,address,point,x,y
0,,,"[300.0, 300.0]",300.0,300.0
1,glasgow,"Glasgow, Glasgow City, Scotland, G2 9SA, Unite...","[55.8609825, -4.2488787]",55.860982,-4.248879
2,"melbourne, australia","City of Melbourne, Victoria, Australia","[-37.8142176, 144.9631608]",-37.814218,144.963161
3,news,"34375, Abbotsford Centre, Abbotsford, Fraser V...","[49.04172215, -122.27255349013137]",49.041722,-122.272553
4,alberta,"Alberta, Canada","[55.001251, -115.002136]",55.001251,-115.002136


In [117]:
tweets['location'] = tweets['location'].fillna('null')
test['location'] = test['location'].fillna('null')
tweets['location'] = tweets['location'].str.replace(',','')
test['location'] = test['location'].str.replace(',','')

In [76]:
location_tokens = tweets.location.unique().tolist()
location_test = test.location.unique().tolist()

In [None]:
for k in location_test:
    if k not in location_tokens:
        print(k) # Mismas palabras en ambos sets, jaja no.

In [78]:
embeddings_index = dict()
f = open('Embeddings/glove.6B.100d.txt', encoding='utf8') # Vectores entrenados de 100 dimensiones
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [82]:
vectors = {}
for location in location_tokens:
    words = location.split(' ')
    n = len(words)
    try:
        if n == 1:
            vectors[location] = embeddings_index[location]
        else:
            acum = np.zeros(100)
            for w in words:
                acum = np.sum([acum,embeddings_index[w]] , axis=0)
    except KeyError:
        continue
        vectors[location] = acum

In [95]:
location_vectors = pd.DataFrame.from_dict(vectors).T.reset_index()
location_vectors.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,,0.079432,-0.14054,-0.10462,-0.36259,-0.22721,-0.13612,0.74755,0.32809,0.54364,...,-1.2123,0.51573,0.16573,0.67943,0.35327,0.17672,0.25803,0.068445,-1.2016,-0.20168
1,birmingham,0.47334,0.2881,-0.40206,-0.76942,0.50565,0.56655,0.15756,0.28222,-0.63076,...,-0.45335,-0.58894,0.10209,1.4372,-0.39642,0.050214,0.57303,0.74257,0.65308,-0.56826
2,africa,-0.28641,0.8405,1.1178,0.18766,0.073111,-0.24262,0.12002,0.9052,-0.77801,...,0.1855,-0.019433,0.64313,-0.22149,-0.37251,0.58641,-0.80282,-0.22708,0.29665,0.20128
3,pretoria,-0.10046,-0.44136,0.66512,0.27777,-0.59921,0.29228,0.1871,1.2412,-0.70815,...,0.51562,-0.4611,-0.13979,0.35418,-0.058125,0.6647,-0.38143,0.084048,0.42608,0.4004
4,india,-0.95967,0.30795,0.90052,1.0364,0.003491,-0.80758,-1.139,0.81109,-0.67857,...,0.25884,-0.19463,-0.27582,-0.70492,-0.69454,0.70624,0.2283,0.081052,0.1351,0.14388


In [99]:
aux = []
aux.append('location')
for i in range (0, 100):
    name = 'l' + str(i)
    aux.append(name)

In [None]:
location_vectors.columns = aux
location_vectors.head()

In [None]:
locations_to_merge = tweets.merge(location_vectors, on='location', how = 'outer')#.drop(columns=['keyword', 'location', 'text', 'target'])
locations_to_merge

In [None]:
locations_test_merge = test.merge(location_vectors, how='left').drop(columns=['keyword', 'location', 'text'])

In [108]:
locations_to_merge.to_csv('locations_vectors_train.csv', index = False)
locations_test_merge.to_csv('locations_vectors_test.csv', index = False)

### Merge all things

In [501]:
train_w_features = pd.read_csv('../TP2/train_features.csv')
test_w_features = pd.read_csv('../TP2/test_features.csv')
keyword_features = pd.read_csv('../TP2/keyword_features.csv')
keyword_test_features = pd.read_csv('../TP2/keyword_test_features.csv')
train_processed_text = pd.read_csv('../TP2/processed_train.csv')
test_processed_text = pd.read_csv('../TP2/processed_test.csv')
location_train_xy = pd.read_csv('../TP2/coordinates_train.csv')
location_test_xy = pd.read_csv('../TP2/coordinates_test.csv')
location_train_vectors = pd.read_csv('../TP2/locations_vectors_train.csv')
location_test_vectors = pd.read_csv('../TP2/locations_vectors_test.csv')

train_w_features.insert(3,'target',train_processed_text['target'])
train_features_and_kw = train_w_features.merge(keyword_features, on='id')
train_features_and_kw.insert(3,'processed_text', train_processed_text['text'])

test_features_and_kw = test_w_features.merge(keyword_test_features, on='id')
test_features_and_kw.insert(3,'processed_text', test_processed_text['text'])

train_features_and_kw['text_without_stopwords'] = train_features_and_kw['text_without_stopwords'].fillna('')
test_features_and_kw['text_without_stopwords'] = test_features_and_kw['text_without_stopwords'].fillna('')

locations_train = location_train_xy.merge(location_train_vectors, on = 'id')
locations_test = location_test_xy.merge(location_test_vectors, on = 'id')

train_complete = train_features_and_kw.merge(locations_train, on = 'id')
test_complete = test_features_and_kw.merge(locations_test, on = 'id')

In [505]:
train_complete['x'] = train_complete['x'].fillna(300.0)
test_complete['x'] = test_complete['x'].fillna(300.0)
train_complete['y'] = train_complete['y'].fillna(300.0)
test_complete['y'] = test_complete['y'].fillna(300.0)

In [526]:
i = 0
loc_values = locations_train.loc[0,'l0':].to_list()
for column in train_complete.loc[:,'l0':].columns:
    train_complete[column] = train_complete[column].fillna(loc_values[i])
    i+=1

In [530]:
j = 0
loc_values2 = locations_test.loc[0,'l0':].to_list()
for column in test_complete.loc[:,'l0':].columns:
    test_complete[column] = test_complete[column].fillna(loc_values2[j])
    j+=1

In [None]:
train_complete.to_csv('train_complete.csv', index = False)
test_complete.to_csv('test_complete.csv', index = False)