In [7]:
import numpy as np
import pandas as pd
import string
#import xgboost as xgb
import io
import nltk
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the NLTK resources needed for sentiment scoring and stopword removal.
nltk.download('vader_lexicon')
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain list of English stopwords; `stopwords.words(...)` is unavailable afterwards.
stopwords = stopwords.words('english')
# Shared English stemmer used by stemm().
stemmer = SnowballStemmer('english')

from textblob import TextBlob
# Tokenizer and POS-tagger models; presumably what TextBlob's `.tags` relies on — TODO confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# Turns off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mausa\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# One shared VADER analyzer, reused by every call below.
sia = SentimentIntensityAnalyzer()


def return_sia_compound_values(text):
    """Return the ``'compound'`` entry of ``sia.polarity_scores(text)``."""
    scores = sia.polarity_scores(text)
    return scores['compound']

def remove_stopword(text, stop_words=None):
    """Join the alphabetic, non-stopword tokens of ``text`` with single spaces.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document (for example the result of ``str.split()``).
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, the notebook-level
        ``stopwords`` list is used.

    Returns
    -------
    str
        The kept tokens, original casing preserved, separated by one space.
        Tokens containing any non-alphabetic character are dropped.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests inside the loop.
    stop_words = set(stop_words)
    # Compare case-insensitively: the stopword entries are lower-case, so a
    # case-sensitive test lets capitalised stopwords such as "The" through.
    return " ".join(
        token for token in text
        if token.isalpha() and token.lower() not in stop_words
    )

def stemm(text):
    """Stem each whitespace-separated word of ``text`` with the notebook-level
    ``stemmer`` and return the stems joined by single spaces."""
    return " ".join(stemmer.stem(token) for token in text.split())

def contains_punctuation(text):
    """Return True when at least one character of ``text`` is in ``string.punctuation``."""
    marks = frozenset(string.punctuation)
    return any(ch in marks for ch in text)

def amount_of_punctuation(text):
    """Return how many characters of ``text`` are in ``string.punctuation``."""
    marks = frozenset(string.punctuation)
    return sum(1 for ch in text if ch in marks)

def _count_tags_with_prefix(text, prefix):
    """Count the tokens of ``text`` whose TextBlob POS tag starts with ``prefix``."""
    return sum(1 for _, tag in TextBlob(text).tags if tag.startswith(prefix))


def get_adjectives(text):
    """Return the number of tokens in ``text`` tagged ``JJ*``."""
    return _count_tags_with_prefix(text, "JJ")


def get_nouns(text):
    """Return the number of tokens in ``text`` tagged ``NN*``."""
    return _count_tags_with_prefix(text, "NN")


def get_verbs(text):
    """Return the number of tokens in ``text`` tagged ``VB*``."""
    return _count_tags_with_prefix(text, "VB")


def get_adverbs(text):
    """Return the number of tokens in ``text`` tagged ``RB*``."""
    return _count_tags_with_prefix(text, "RB")

In [3]:
# Load the labelled training tweets and the test tweets.
tweets = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Keywords contain URL-encoded spaces ("%20"). Decode them in BOTH frames so
# a keyword is spelled the same way in train and test.
# regex=False: '%20' is a literal, and the default of `regex` differs
# between pandas versions.
tweets['keyword'] = tweets['keyword'].str.replace('%20', ' ', regex=False)
test['keyword'] = test['keyword'].str.replace('%20', ' ', regex=False)

In [4]:
# Discard every tweet whose text occurs more than once (keep=False removes
# all copies, not only the repeats), then summarise what is left.
tweets = tweets.drop_duplicates(subset='text', keep=False)
tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7434 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7434 non-null   int64 
 1   keyword   7378 non-null   object
 2   location  4982 non-null   object
 3   text      7434 non-null   object
 4   target    7434 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 348.5+ KB


In [5]:
# How many test rows repeat the text of an earlier row (True) vs. not (False).
is_repeated_text = test['text'].duplicated()
is_repeated_text.value_counts()

False    3243
True       20
Name: text, dtype: int64

### Adición de features de texto.

In [None]:
# Start from an explicit copy: the feature columns below are added to this
# frame, and writing into a selection of `tweets` is only silent because the
# chained-assignment warning is switched off globally.
tweets_metrics = tweets[['id', 'text', 'target']].copy()

tweet_text = tweets_metrics['text']
# Whitespace tokens, split once and shared by all word-level features.
tweet_words = tweet_text.str.split()

tweets_metrics['text_without_stopwords'] = tweet_words.apply(remove_stopword)

# Size features.
tweets_metrics['length'] = tweet_text.apply(len)
tweets_metrics['avg_word_length'] = tweet_words.apply(lambda words: np.mean([len(word) for word in words]))
tweets_metrics['amount_of_words'] = tweet_words.apply(len)
# A set is enough to count distinct tokens; no pandas Series per tweet needed.
unique_words_by_tweet = tweet_words.apply(lambda words: len(set(words)))
tweets_metrics['amount_of_unique_words'] = unique_words_by_tweet

# Sentiment, stopword and punctuation features.
tweets_metrics['sentiment'] = tweet_text.apply(return_sia_compound_values)
tweets_metrics['stopwords_count'] = tweet_text.apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
tweets_metrics['punctuation_count'] = tweet_text.apply(amount_of_punctuation)

# Mentions and hashtags.
# NOTE(review): the '.' after '@' matches any character, so "@ 5pm" and the
# tail of an e-mail address ("a@b.com") count as mentions — confirm whether a
# stricter pattern such as r'@\w+' was intended before changing the feature.
mentions = tweet_text.str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
tweets_metrics['mentions_count'] = mentions['text'].apply(len)
hashtags = tweet_text.str.findall(r'#[^?\s].*?(?=\s|$)')
tweets_metrics['hashtags_count'] = hashtags.apply(len)

# Longest remaining word (URL-like tokens excluded); 0 when nothing is left.
tweets_metrics['longest_word_length_without_stopwords'] = tweets_metrics['text_without_stopwords'].apply(
    lambda kept: max(
        (len(word) for word in str(kept).lower().split() if not word.startswith('http')),
        default=0,
    )
)
tweets_metrics['stopword_word_ratio'] = tweets_metrics['stopwords_count'] / tweets_metrics['amount_of_words']

# Part-of-speech counts.
tweets_metrics['adjectives_count'] = tweet_text.apply(get_adjectives)
tweets_metrics['nouns_count'] = tweet_text.apply(get_nouns)
tweets_metrics['verbs_count'] = tweet_text.apply(get_verbs)
tweets_metrics['adverbs_count'] = tweet_text.apply(get_adverbs)

In [8]:
# Persist the training text features (row index not written).
tweets_metrics.to_csv('train_features.csv', index=False)

In [9]:
# Start from an explicit copy: the feature columns below are added to this
# frame, and writing into a selection of `test` is only silent because the
# chained-assignment warning is switched off globally.
test_metrics = test[['id', 'text']].copy()

test_text = test_metrics['text']
# Whitespace tokens, split once and shared by all word-level features.
test_words = test_text.str.split()

test_metrics['text_without_stopwords'] = test_words.apply(remove_stopword)

# Size features.
test_metrics['length'] = test_text.apply(len)
test_metrics['avg_word_length'] = test_words.apply(lambda words: np.mean([len(word) for word in words]))
test_metrics['amount_of_words'] = test_words.apply(len)
# A set is enough to count distinct tokens; no pandas Series per tweet needed.
unique_words_by_tweet = test_words.apply(lambda words: len(set(words)))
test_metrics['amount_of_unique_words'] = unique_words_by_tweet

# Sentiment, stopword and punctuation features.
test_metrics['sentiment'] = test_text.apply(return_sia_compound_values)
test_metrics['stopwords_count'] = test_text.apply(lambda x: len([word for word in str(x).lower().split() if word in stopwords]))
test_metrics['punctuation_count'] = test_text.apply(amount_of_punctuation)

# Mentions and hashtags.
# NOTE(review): the '.' after '@' matches any character, so "@ 5pm" and the
# tail of an e-mail address ("a@b.com") count as mentions — confirm whether a
# stricter pattern such as r'@\w+' was intended before changing the feature.
mentions = test_text.str.findall(r'@.\S*?(?=\s|[:]|$)').to_frame()
test_metrics['mentions_count'] = mentions['text'].apply(len)
hashtags = test_text.str.findall(r'#[^?\s].*?(?=\s|$)')
test_metrics['hashtags_count'] = hashtags.apply(len)

# Longest remaining word (URL-like tokens excluded); 0 when nothing is left.
test_metrics['longest_word_length_without_stopwords'] = test_metrics['text_without_stopwords'].apply(
    lambda kept: max(
        (len(word) for word in str(kept).lower().split() if not word.startswith('http')),
        default=0,
    )
)
test_metrics['stopword_word_ratio'] = test_metrics['stopwords_count'] / test_metrics['amount_of_words']

# Part-of-speech counts.
test_metrics['adjectives_count'] = test_text.apply(get_adjectives)
test_metrics['nouns_count'] = test_text.apply(get_nouns)
test_metrics['verbs_count'] = test_text.apply(get_verbs)
test_metrics['adverbs_count'] = test_text.apply(get_adverbs)

In [10]:
# Persist the test text features (row index not written).
test_metrics.to_csv('test_features.csv', index=False)

### Adición de features de keyword. [Word2Vec]

In [316]:
# Mark missing keywords with the lower-case placeholder 'null'. The rest of
# the notebook expects exactly this spelling (the `null = ['null']` keyword
# group and the later `fillna('null')`); 'NULL' would leave the missing
# keywords unmatched when the notebook is run top to bottom.
tweets['keyword'] = tweets['keyword'].fillna('null')
test['keyword'] = test['keyword'].fillna('null')

In [317]:
# Keyword of every training tweet next to the tweet id.
keywords = tweets.loc[:, ['keyword', 'id']]
keywords.head()

Unnamed: 0,keyword,id
0,,1
1,,4
2,,5
3,,6
4,,7


In [318]:
# Distinct keywords, in order of first appearance.
keyword_tokens = list(keywords['keyword'].unique())

In [319]:
# Peek at the first distinct keyword.
keyword_tokens[0]

'null'

In [320]:
# Number of distinct keywords.
len(keyword_tokens)

222

In [321]:
from gensim.models import Word2Vec

# Word2Vec settings used below:
#   sg=1        -> skip-gram (any other value selects CBOW)
#   size=100    -> dimensionality of each keyword vector
#   window=3    -> context window size
#   min_count=1 -> no token is discarded for being rare
# The training corpus is a single "sentence": the list of distinct keywords,
# where a multi-word keyword such as 'airplane accident' is one token.
# NOTE(review): `size` is the gensim 3.x spelling (renamed `vector_size` in gensim 4) — confirm the pinned version.
# NOTE(review): with one sentence in which every token occurs once, the vectors presumably stay close to their random initialisation — verify they carry signal before relying on them.

keyword_vectors = Word2Vec([keyword_tokens], min_count=1, size= 100, workers=3, window =3, sg=1)

In [322]:
# Sanity check: look one keyword up through the model's KeyedVectors (`.wv`).
# Indexing the model object directly (`keyword_vectors['ablaze']`) is
# deprecated in gensim 3.x and removed in gensim 4.
keyword_vectors.wv['ablaze']

  


array([ 2.5983190e-03, -6.6586147e-04, -2.3209248e-03,  1.7518990e-03,
       -2.2995241e-03,  8.5558771e-04,  5.0506066e-04, -1.3033998e-03,
        4.9609276e-03, -3.3725437e-03, -2.0172056e-03,  2.4199090e-03,
        3.8925724e-03,  2.4426566e-03, -4.6416828e-03,  2.9733980e-03,
        4.8849769e-03,  3.0582242e-03, -1.2348567e-03,  1.3275865e-03,
       -4.0353765e-03, -2.7007733e-03, -4.7994917e-03, -1.7706358e-03,
        7.0335745e-04, -2.8691879e-03, -2.1185733e-03,  2.5431931e-03,
        7.8616978e-04,  3.8344462e-03,  4.2495583e-03,  2.7264808e-03,
       -8.9071103e-04, -7.8725221e-04, -1.2452446e-03, -1.7115731e-03,
        1.8372950e-03, -2.0356460e-03,  3.4612773e-03,  1.4445643e-03,
        1.1148887e-03, -4.5582252e-03,  9.5972250e-04, -7.9843868e-04,
       -5.0058635e-04, -4.2120512e-03,  4.0654014e-03,  3.1079892e-03,
       -1.8416835e-03, -2.4908367e-03,  4.8450697e-03, -6.7695527e-04,
       -4.5575514e-03, -3.4133701e-03, -4.7050603e-03, -1.7676153e-03,
      

In [323]:
# Map every keyword to its vector. Vectors are read through `.wv` (the
# model's KeyedVectors); indexing the model object itself is deprecated in
# gensim 3.x and removed in gensim 4.
to_vector_matrix = {k: keyword_vectors.wv[k] for k in keyword_tokens}

  after removing the cwd from sys.path.


In [324]:
# One row per keyword: the dict keys end up as the index after the
# transpose, and reset_index() exposes them as a regular 'index' column.
vectors_by_keyword = pd.DataFrame.from_dict(to_vector_matrix).T
keyword_w2v = vectors_by_keyword.reset_index()
keyword_w2v

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,,-0.002577,-0.001227,-0.001444,0.000423,-0.003910,0.003864,-0.000201,-0.002421,0.001415,...,0.004344,0.004172,0.002988,-0.001633,-0.000744,0.001731,-0.000859,0.000612,0.000783,0.000369
1,ablaze,0.002598,-0.000666,-0.002321,0.001752,-0.002300,0.000856,0.000505,-0.001303,0.004961,...,-0.001034,0.002820,0.000984,0.002583,0.002405,-0.000621,0.003258,-0.002784,0.001693,0.004333
2,accident,-0.003349,0.003251,0.004571,-0.000547,0.002326,0.002836,0.000025,-0.000738,0.003825,...,0.001687,0.003747,0.004781,0.001277,-0.004880,0.004659,0.000265,-0.004970,-0.000646,-0.001960
3,aftershock,-0.004262,0.002142,0.001838,0.002965,-0.002146,-0.001892,-0.000939,0.001145,-0.003004,...,-0.000615,0.001087,-0.003613,-0.001999,-0.004111,0.003691,-0.000786,0.002645,0.001593,0.003497
4,airplane accident,-0.001462,-0.002390,-0.003875,0.001232,0.002163,-0.002089,0.000851,0.000408,0.000895,...,-0.002294,-0.004053,0.004961,-0.002474,0.000193,0.001964,-0.003745,-0.001532,0.002234,0.004583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
217,wounded,-0.001203,-0.000257,-0.001077,0.000290,0.000039,0.000181,0.001959,0.000600,-0.001591,...,-0.004437,0.001071,0.002956,-0.002226,-0.003644,-0.000114,-0.002932,-0.000425,0.002893,0.003686
218,wounds,-0.002723,-0.003479,0.004881,-0.003744,0.004931,0.003912,-0.002686,0.004263,-0.003562,...,0.004472,0.003746,-0.002081,0.002626,0.003337,0.000401,-0.001013,-0.004865,0.004028,0.001501
219,wreck,-0.004954,0.002817,-0.003835,-0.004148,0.000410,-0.000847,-0.002482,0.004491,-0.004849,...,0.000610,0.004922,-0.002649,-0.004030,-0.001320,-0.003117,-0.003942,-0.001964,0.002089,0.003643
220,wreckage,-0.002716,0.002877,0.000625,0.003528,-0.000802,0.001631,0.004361,0.004944,-0.003616,...,-0.004664,0.001344,-0.003860,0.001019,0.001464,-0.001884,0.003862,-0.004642,0.000096,-0.000189


In [325]:
# Column labels for the embedding table: 'keyword' followed by v0 ... v99.
aux = ['keyword'] + ['v' + str(i) for i in range(0, 100)]
len(aux)

101

In [326]:
# Replace the positional column labels with the names built in `aux`.
keyword_w2v.columns = aux

In [327]:
# Check the renamed columns.
keyword_w2v.head()

Unnamed: 0,keyword,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,,-0.002577,-0.001227,-0.001444,0.000423,-0.00391,0.003864,-0.000201,-0.002421,0.001415,...,0.004344,0.004172,0.002988,-0.001633,-0.000744,0.001731,-0.000859,0.000612,0.000783,0.000369
1,ablaze,0.002598,-0.000666,-0.002321,0.001752,-0.0023,0.000856,0.000505,-0.001303,0.004961,...,-0.001034,0.00282,0.000984,0.002583,0.002405,-0.000621,0.003258,-0.002784,0.001693,0.004333
2,accident,-0.003349,0.003251,0.004571,-0.000547,0.002326,0.002836,2.5e-05,-0.000738,0.003825,...,0.001687,0.003747,0.004781,0.001277,-0.00488,0.004659,0.000265,-0.00497,-0.000646,-0.00196
3,aftershock,-0.004262,0.002142,0.001838,0.002965,-0.002146,-0.001892,-0.000939,0.001145,-0.003004,...,-0.000615,0.001087,-0.003613,-0.001999,-0.004111,0.003691,-0.000786,0.002645,0.001593,0.003497
4,airplane accident,-0.001462,-0.00239,-0.003875,0.001232,0.002163,-0.002089,0.000851,0.000408,0.000895,...,-0.002294,-0.004053,0.004961,-0.002474,0.000193,0.001964,-0.003745,-0.001532,0.002234,0.004583


In [329]:
# Persist the keyword -> vector table (row index not written).
keyword_w2v.to_csv('keyword_w2v_features.csv', index=False)

### Coordenadas

In [245]:
# Load only the 'location' and 'point' columns of the locations table.
locations = pd.read_csv("../TP1/locations.csv", usecols=['location', 'point'])

In [246]:
# Missing cells become the string placeholder 'null'.
locations = locations.fillna('null')

In [247]:
# Take whatever value sits in the first row's 'location' cell and replace
# every occurrence of it, in any column, with 'null'.
# NOTE(review): this assumes row 0 holds the "empty location" placeholder —
# confirm against locations.csv; with a reordered file a real place name
# would be replaced instead.
empty_loc = locations.loc[0, 'location']
locations.replace(empty_loc, 'null', inplace=True)

In [248]:
# Inspect the cleaned locations table.
locations.head()

Unnamed: 0,location,point
0,,
1,glasgow,"(55.8609825, -4.2488787, 0.0)"
2,"melbourne, australia","(-37.8142176, 144.9631608, 0.0)"
3,news,"(49.04172215, -122.27255349013137, 0.0)"
4,alberta,"(55.001251, -115.002136, 0.0)"


In [249]:
def point_to_list(point):
    """Parse a point string such as ``"(55.86, -4.25, 0.0)"``.

    Returns the first two comma-separated components as a list of floats.
    The placeholder string ``'null'`` yields ``[inf, inf]``.
    """
    if point == 'null':
        return [float('inf')] * 2

    parts = point.strip('()').split(',')
    return [float(parts[0]), float(parts[1])]

In [250]:
# Turn every point string into a two-element list of floats.
locations['point'] = locations['point'].apply(point_to_list)

In [251]:
# Spread the two-element lists into 'x' and 'y' columns. Building the frame
# from the list of lists avoids `apply(pd.Series)`, which constructs one
# Series object per row.
aux = pd.DataFrame(locations['point'].tolist(), index=locations.index, columns=['x', 'y'])

In [252]:
# Attach both coordinate columns to the locations table.
locations = locations.assign(x=aux['x'], y=aux['y'])

In [253]:
# Check the new 'x' and 'y' columns.
locations.head()

Unnamed: 0,location,point,x,y
0,,"[inf, inf]",inf,inf
1,glasgow,"[55.8609825, -4.2488787]",55.860982,-4.248879
2,"melbourne, australia","[-37.8142176, 144.9631608]",-37.814218,144.963161
3,news,"[49.04172215, -122.27255349013137]",49.041722,-122.272553
4,alberta,"[55.001251, -115.002136]",55.001251,-115.002136


In [254]:
# Lower-case the place names before they are merged with `locations`.
# `.str.lower()` leaves missing locations as NaN, whereas
# `apply(str.lower)` raises TypeError on them (NaN is a float, not a str).
tweets['location'] = tweets['location'].str.lower()
test['location'] = test['location'].str.lower()

In [255]:
# Look up the coordinates of every tweet's location; tweets without a match
# keep their row (left join) and get missing x / y.
location_xy = locations.loc[:, ['location', 'x', 'y']]
coordinates_train = tweets.merge(location_xy, on='location', how='left').loc[:, ['id', 'x', 'y']]
coordinates_test = test.merge(location_xy, on='location', how='left').loc[:, ['id', 'x', 'y']]

In [256]:
# Unmatched locations get the same sentinel as 'null' points: infinity.
coordinates_train = coordinates_train.fillna(float('inf'))
coordinates_test = coordinates_test.fillna(float('inf'))

In [269]:
# Join the text features with the coordinates on the tweet id.
metrics_with_xy = tweets_metrics.merge(coordinates_train, on='id')
test_metrics_with_xy = test_metrics.merge(coordinates_test, on='id')

In [271]:
# Persist the text features together with the coordinates (row index not written).
metrics_with_xy.to_csv('train_features_xy.csv', index=False)
test_metrics_with_xy.to_csv('test_features_xy.csv', index=False)

### Keywords as features

In [1477]:
# Decode URL-encoded spaces, then mark missing keywords with 'null'.
# The result is assigned back to the column: `tweets.keyword.fillna(...,
# inplace=True)` acts on an intermediate Series and does not reliably update
# `tweets` (it has no effect under pandas Copy-on-Write).
tweets['keyword'] = (
    tweets['keyword']
    .str.replace('%20', ' ', regex=False)
    .fillna('null')
)

In [290]:
# Distinct keywords present in the training set, and how many there are.
disaster_list = tweets['keyword'].unique().tolist()
len(disaster_list)

222

In [303]:
# Hand-made grouping of the dataset keywords into themes. Every list holds
# exact keyword strings; the variable names are referenced by later cells.
crash = [
    'collide', 'collided', 'collision', 'crash', 'crashed',
    'wreck', 'wreckage', 'wrecked',
]

emergency = ['emergency', 'emergency plan']

electricity = ['electrocute', 'electrocuted']

helpers = [
    'ambulance', 'police', 'siren', 'sirens', 'emergency services',
    'first responders', 'stretcher', 'eyewitness', 'rescuers',
]

panic = [
    'screamed', 'screaming', 'screams', 'panic', 'mayhem', 'riot',
    'rioting', 'fear', 'panicking', 'trauma', 'trouble', 'hail',
    'pandemonium',
]

hostages = ['hostage', 'hostages', 'trapped']

quarentine = ['quarantine', 'quarantined']

colapse = [
    'bridge collapse', 'collapse', 'collapsed', 'demolish', 'demolished',
    'demolition', 'structural failure',
]

accident = [
    'accident', 'airplane accident', 'derail', 'derailed', 'derailment',
    'oil spill',
]

fire = [
    'ablaze', 'hellfire', 'smoke', 'wild fires', 'wildfire',
    'buildings burning', 'buildings on fire', 'burned', 'burning',
    'burning buildings', 'bush fires', 'fire', 'fire truck', 'flames',
    'forest fire', 'forest fires', 'blaze', 'blazing', 'arson', 'arsonist',
]

nuclear = ['nuclear disaster', 'nuclear reactor', 'radiation emergency', 'meltdown']

explotion = ['explode', 'exploded', 'explosion', 'blown up', 'blew up', 'loud bang']

survivor = [
    'survive', 'survived', 'rescue', 'rescued', 'survivors',
    'evacuate', 'evacuated', 'evacuation', 'refugees',
]

wounded = [
    'wounded', 'wounds', 'bleeding', 'bloody', 'injured',
    'injuries', 'injury', 'traumatised', 'blood',
]

bomb = [
    'suicide bomb', 'suicide bomber', 'suicide bombing', 'bomb', 'bombed',
    'bombing', 'detonate', 'detonation',
]

storm = [
    'storm', 'thunderstorm', 'thunder', 'rainstorm', 'violent storm',
    'windstorm', 'lightning', 'hailstorm',
]

water = [
    'flood', 'flooding', 'floods', 'inundated', 'inundation',
    'sinking', 'drown', 'drowned', 'drowning', 'sunk',
]

natural_disaster = [
    'heat wave', 'sandstorm', 'seismic', 'avalanche', 'tsunami', 'twister',
    'typhoon', 'tornado', 'hurricane', 'natural disaster', 'cyclone',
    'volcano', 'drought', 'dust storm', 'earthquake', 'lava', 'aftershock',
    'snowstorm', 'blizzard', 'whirlwind', 'upheaval', 'landslide',
    'cliff fall', 'mudslide', 'sinkhole', 'displaced', 'epicentre',
]

attack = ['attack', 'attacked']

casualties = [
    'mass murder', 'mass murderer', 'massacre', 'fatal', 'fatalities',
    'fatality', 'casualties', 'casualty', 'body bag', 'body bagging',
    'body bags', 'dead', 'death', 'deaths', 'tragedy',
]

terrorism = [
    'terrorism', 'terrorist', 'threat', 'hijack', 'hijacker', 'hijacking',
    'bioterror', 'bioterrorism',
]

destruction = [
    'destroyed', 'destruction', 'devastated', 'devastation', 'disaster',
    'annihilated', 'annihilation', 'apocalypse', 'armageddon', 'catastrophe',
    'catastrophic', 'obliterate', 'obliterated', 'obliteration', 'damage',
    'destroy', 'desolate', 'desolation', 'blight', 'harm', 'hazard',
    'hazardous', 'danger', 'ruin', 'engulfed', 'rubble', 'debris', 'razed',
    'flattened', 'crush', 'crushed',
]

warlike = [
    'war zone', 'weapon', 'weapons', 'military', 'army', 'battle',
    'outbreak', 'chemical emergency', 'curfew',
]

starvation = ['famine', 'deluge', 'deluged']


# Placeholder used for tweets without a keyword.
null = ['null']

In [304]:
# All keyword groups, in the order they were defined.
disasters = [
    crash, emergency, electricity, helpers, panic, hostages, quarentine,
    colapse, accident, fire, nuclear, explotion, survivor, wounded, bomb,
    storm, water, natural_disaster, attack, casualties, terrorism,
    destruction, warlike, starvation, null,
]

In [315]:
# Total number of keywords across all groups; compare with len(disaster_list).
count = sum(len(group) for group in disasters)
count

222