# Imports

In [2]:
import numpy as np
import pandas as pd
import re
import nltk #pip install nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Funciones

In [3]:
# Shared Porter stemmer instance; clean_keyword and clean_text call
# porter.stem() on every token they keep.
porter=nltk.PorterStemmer()

## keyword

In [4]:
def clean_keyword(keyword):
    """Turn a raw keyword into a string of space-separated Porter stems.

    ``%20`` escapes become spaces, the result is tokenised with NLTK,
    English stopwords are dropped and every remaining token is stemmed
    with the module-level ``porter`` stemmer.

    Parameters
    ----------
    keyword : str or missing value
        Raw keyword. Any value that is not a ``str`` (NaN, None, ...) is
        treated as missing.

    Returns
    -------
    str
        The cleaned keyword, or the input itself, unchanged, when it is
        not a string.
    """
    # Test the type instead of `keyword is not np.nan`: the identity check
    # lets None, or a NaN that is not the np.nan singleton, fall through to
    # the string operations below and raise TypeError.
    if not isinstance(keyword, str):
        return keyword

    # Read the stopword list once per call rather than once per token.
    english_stopwords = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(keyword.replace("%20", " "))
    return " ".join(
        porter.stem(token) for token in tokens if token not in english_stopwords
    )

## location

In [5]:
def clean_location(location):
    """Normalise a free-text location.

    Steps: lowercase; drop every character other than ``a-z``, ``ñ``,
    comma, space and hyphen; turn commas and hyphens into spaces; expand a
    fixed list of whole-word abbreviations; collapse runs of spaces and
    trim the ends.

    Parameters
    ----------
    location : str or missing value
        Raw location. Any value that is not a ``str`` (NaN, None, ...) is
        treated as missing.

    Returns
    -------
    str
        The cleaned location, or the input itself, unchanged, when it is
        not a string.
    """
    # Test the type instead of `location is not np.nan`: the identity check
    # lets None, or a NaN that is not the np.nan singleton, reach .lower().
    if not isinstance(location, str):
        return location

    # (pattern, expansion) pairs, applied in this order on whole words only.
    # NOTE(review): r"\busa*\b" also matches the bare word "us" (the `a` is
    # optional and repeatable) -- confirm that this is intended.
    abbreviations = (
        (r"\busa*\b", "united states of america"),
        (r"\buk\b", "united kingdom"),
        (r"\bny\b", "new york"),
        (r"\bnyc\b", "new york city"),
        (r"\bca\b", "california"),
        (r"\bnc\b", "north carolina"),
        (r"\btx\b", "texas"),
        (r"\bga\b", "georgia"),
        (r"\bncr\b", "national capital region of india"),
        (r"\bfl\b", "florida"),
        (r"\bsfo\b", "san francisco"),
        (r"\bco\b", "colorado"),
        (r"\boh\b", "ohio"),
        (r"\bok\b", "oklahoma"),
    )

    location = location.lower()
    location = re.sub("[^a-zñ, -]", "", location)
    location = re.sub("[,-]", " ", location)

    for pattern, expansion in abbreviations:
        location = re.sub(pattern, expansion, location)

    # Trim last: separators at the edges have only just become spaces, so
    # trimming before this point would leave values such as "london ".
    return re.sub(" +", " ", location).strip(" ")

## text

In [6]:
def replace_text(text):
    """Apply a fixed, ordered list of literal substitutions to ``text``.

    The list covers HTML entity fragments, mis-encoded (mojibake) character
    sequences, newlines, tildes, backticks and two control characters.
    Order matters: the two-character sequences that start with 'Û' are
    handled before the bare 'Û'.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The text after every substitution has been applied in order.
    """
    substitutions = (
        ('&amp', 'and'),
        ('amp;', 'and'),
        ('~', ' '),
        ('Ûª', "'"),
        ('ÛÒ', ' '),
        ('ÛÓ', ' '),
        ('&gt;', ' '),
        ('&lt;', ' '),
        ('ÛÏ', ' '),
        ('Û', ' '),
        ('\n', ' '),
        ('å«', "'"),
        ('åÊ', ' '),
        ('åÇ', ' '),
        ('Ì©', 'e'),
        ('Ì¤', 'c'),
        ('Ì¼', 'u'),
        ('Ì_', 'o'),
        ('`', "'"),
        ('\x89', ''),
        ('\x9d', ''),
    )

    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [7]:
# Letters for which clean_text collapses any repeated run to one occurrence.
one_letter="adhijklqruvwxyz"
# Letters for which clean_text collapses runs of two or more to exactly two.
double_letters="bcefgmnopst"

In [1]:
def regular_expressions(text):
    """Lowercase ``text`` and reduce it to ``a-z``, digits and spaces.

    In order: URLs become the token ``link``; common English contractions
    are expanded; a few punctuation marks become spaces; isolated
    apostrophes and hashes are dropped; every remaining character outside
    ``[a-z 0-9]`` is removed. Runs of spaces are not collapsed here.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    text = text.lower()
    text = re.sub(r"https?\S+", "link", text)

    # Contractions are anchored on word boundaries rather than on literal
    # surrounding spaces, so they are also expanded at the start or end of
    # the text and before punctuation ("Can't stop" -> "cannot stop"
    # instead of "ca not stop").
    text = re.sub(r"\bcan'*t\b", "cannot", text)
    text = re.sub(r"n't\b", " not", text)
    text = re.sub(r"'s\b", " is", text)
    text = re.sub(r"'re\b", " are", text)
    text = re.sub(r"'ve\b", " have", text)
    text = re.sub(r"'d\b", " would", text)
    text = re.sub(r"'m\b", " am", text)

    # These marks separate words, so they become spaces instead of vanishing.
    text = re.sub(r"[./+\-÷*:]", " ", text)
    text = re.sub(r" ' | # ", " ", text)
    text = re.sub(r"[^a-z 0-9]", "", text)
    return text

In [9]:
def clean_text(text):
    """Fully normalise ``text`` into space-separated Porter stems.

    Runs ``regular_expressions``, squeezes repeated letters, replaces every
    digit run with the token ``number``, tokenises with NLTK, drops English
    stopwords and stems what remains with the module-level ``porter``.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    text = regular_expressions(text)

    # Letters listed in one_letter: any repeated run shrinks to one letter.
    text = re.sub("([" + one_letter + r"])\1+", r"\1", text)

    # Letters listed in double_letters: runs of two or more shrink to
    # exactly two occurrences.
    text = re.sub("([" + double_letters + r"])\1+", r"\1\1", text)

    # Put spaces around every digit run so it becomes its own token, then
    # mask it with a placeholder word.
    text = " ".join(re.split(r"(\d+)", text))
    text = re.sub(r"[0-9]\d*", "number", text)

    # Read the stopword list once per call rather than once per token.
    english_stopwords = set(stopwords.words('english'))
    stems = [
        porter.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in english_stopwords
    ]

    text = " ".join(stems)
    return re.sub(" +", " ", text)  # collapse multiple spaces

## Extras

In [10]:
# Characters search_text_samples treats as ordinary: lowercase ASCII letters,
# digits and the space. Anything else is reported as a special character.
english = 'abcdefghijklmnopqrstuvwxyz0123456789 '
# Size of that alphabet; not referenced by any other code visible here.
len_english = len(english)

In [11]:
def search_text_samples(col_text):
    """Index the rows of ``col_text`` by the special characters they contain.

    A character is "special" when its lowercase form is not in the
    module-level ``english`` alphabet.

    Parameters
    ----------
    col_text : iterable of str
        Texts to scan, typically a pandas Series. Entries that are not
        strings (e.g. NaN) are skipped.

    Returns
    -------
    dict
        Maps each special character to a list of positional row indices
        (0-based, in iteration order). A row appears once per occurrence
        of the character in it.
    """
    samples = {}
    for position, raw in enumerate(col_text):
        if not isinstance(raw, str):
            continue
        # Lowercase once per row and walk the lowered string itself: its
        # length can differ from the original's, so indexing it with the
        # original length would miss trailing characters.
        for char in raw.lower():
            if char not in english:
                samples.setdefault(char, []).append(position)
    return samples

In [12]:
def show_text_samples(col_text, text):
    """Print one example row for each special character.

    Parameters
    ----------
    col_text : pandas.Series
        Column to read the example rows from (accessed through ``.iloc``).
    text : dict
        Mapping of character to a list of positional row indices, as
        returned by ``search_text_samples``. Only the first position of
        each list is used.
    """
    for positions in text.values():
        first_position = positions[0]
        print(col_text.iloc[first_position])

# Train

In [63]:
# Load the raw training set (path is relative to the working directory)
# and display it.
train = pd.read_csv('train/train_original.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## keyword

In [64]:
# Every 5th distinct raw keyword, to see what needs cleaning (e.g. %20).
train['keyword'].unique()[::5]

array([nan, 'ambulance', 'army', 'avalanche', 'blazing', 'blood',
       'body%20bags', 'buildings%20burning', 'bush%20fires',
       'chemical%20emergency', 'collided', 'crushed', 'dead', 'deluged',
       'derailed', 'destroyed', 'devastation', 'drowned', 'electrocuted',
       'epicentre', 'exploded', 'fatalities', 'first%20responders',
       'floods', 'harm', 'hijack', 'hurricane', 'inundation',
       'mass%20murder', 'military', 'obliterate', 'pandemonium',
       'quarantined', 'rescue', 'rubble', 'screams', 'sirens',
       'structural%20failure', 'survive', 'threat', 'trapped', 'twister',
       'war%20zone', 'wildfire', 'wreckage'], dtype=object)

In [65]:
# Overwrite the column with the cleaned, stemmed keywords.
train['keyword'] = train['keyword'].apply(clean_keyword)

In [66]:
# Distinct keywords after cleaning.
train['keyword'].unique()

array([nan, 'ablaz', 'accid', 'aftershock', 'airplan accid', 'ambul',
       'annihil', 'apocalyps', 'armageddon', 'armi', 'arson', 'arsonist',
       'attack', 'avalanch', 'battl', 'bioterror', 'blaze', 'bleed',
       'blew', 'blight', 'blizzard', 'blood', 'bloodi', 'blown',
       'bodi bag', 'bomb', 'bridg collaps', 'build burn', 'build fire',
       'burn', 'burn build', 'bush fire', 'casualti', 'catastroph',
       'chemic emerg', 'cliff fall', 'collaps', 'collid', 'collis',
       'crash', 'crush', 'curfew', 'cyclon', 'damag', 'danger', 'dead',
       'death', 'debri', 'delug', 'demolish', 'demolit', 'derail',
       'desol', 'destroy', 'destruct', 'deton', 'devast', 'disast',
       'displac', 'drought', 'drown', 'dust storm', 'earthquak',
       'electrocut', 'emerg', 'emerg plan', 'emerg servic', 'engulf',
       'epicentr', 'evacu', 'explod', 'explos', 'eyewit', 'famin',
       'fatal', 'fear', 'fire', 'fire truck', 'first respond', 'flame',
       'flatten', 'flood', 'fores

In [67]:
# Number of distinct cleaned keywords (unique() keeps NaN as one value).
len(train['keyword'].unique())

167

## location

In [68]:
# Every 30th distinct raw location.
train['location'].unique()[::30]

array([nan, 'Santa Clara, CA', 'Oshawa, Canada', 'italy', 'Boston',
       'Elk Grove, CA, USA', 'Helsinki, Finland',
       'North-East Region, Singapore', 'West Bank, Gaza Strip',
       'The Hammock, FL, USA', 'Washington D.C.', 'Atlanta,Ga',
       'Swag Francisco', '?205?478?', 'Scotland', 'North Jersey',
       'Oklahoma', 'New Your', 'Menlo Park. SFO. The World.',
       'MY RTs ARE NOT ENDORSEMENTS', 'Pittsburgh PA',
       'taken by piper curda', '956', 'Whiterun, Skyrim',
       'Hoxton, London', 'Hartford  London Hong Kong', 'N?? Y???.',
       'Abuja, Nigeria', 'Pompano Beach, FL', 'Fort Smith, AR',
       'On the court ', '21.462446,-158.022017', 'San Antonio, TX',
       'Ankara - Malatya - ad Orontem', 'Hartford,  connecticut', 'USAoV',
       'UPTOWN ', 'the void, U.S.A', '518', 'Otsego, MI',
       'aggressive cannoli eater ', 'Anchorage, AK', 'Michigan, USA',
       'Johannesburg ', 'Patra-Greece.', "Sharkatraz/Bindle's Cleft, PA",
       'ITALY', 'UK  & Germany', 'Wh

In [69]:
# Hand-fix one stylised spelling first: clean_location removes every
# character outside a-z/ñ/comma/space/hyphen, which would mangle this value.
train['location'] = train['location'].replace('M!$$!$$!PP!', 'Mississippi')
# Overwrite the column with the normalised locations.
train['location'] = train['location'].apply(clean_location)

In [70]:
# Every 30th distinct location after cleaning.
train['location'].unique()[::30]

array([nan, 'santa clara california', 'baker city oregon', 'toronto',
       'the own zone layer', 'austin texas', 'perthshire', 'atlanta',
       'ph', 'brasil', 'washington dc area', 'lima peru', 'indiana',
       'me mammys belly', 'biloxi mississippi', 'georgia', 'miami',
       'gia kardashianempire', 'puerto rico', 'new orleans louisiana',
       'belleville illinois', 'whiterun skyrim',
       'las vegas nv united states of america', 'wellington new zealand',
       'wales', 'draw a circle thats the earth', 'spokane wa',
       'protectingtitans side', 'th st ss', 'san diego texas',
       'illinois united states of america', 'columbia sc',
       'orlando florida', 'asuncin py tbingen ger', 'ct new york city',
       'fruit bowl', 'atlantic ia', 'coimbatore', 'chile',
       'georgia united states of america', 'the meadow',
       'nearest trash can', 'lynwood california', 'ct new york',
       'tchira venezuela', 'mumbai maharashtra',
       'united states of america northern 

In [71]:
# Number of distinct cleaned locations (unique() keeps NaN as one value).
len(train['location'].unique())

2897

## text

In [72]:
# Index the special characters in the raw tweets, then print one example
# tweet per character.
# NOTE: despite its name, `text` holds the {character: row positions}
# mapping, not tweet text; it is reused after cleaning to show the same rows.
text = search_text_samples(train['text'])
show_text_samples(train['text'], text)

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
What's up man?
What a goooooooaaaaaal!!!!!!
London is cool ;)
London is cool ;)
Cooool :)
@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J
Barbados #Bridg

In [73]:
# First pass: literal substitutions (HTML entities, mis-encoded characters).
train['text'] = train['text'].apply(replace_text)

In [74]:
# Second pass: lowercase, replace URLs, expand contractions, drop punctuation.
# NOTE(review): clean_text (squeezing, number masking, stopwords, stemming)
# is defined above but is not applied in the cells visible here.
train['text'] = train['text'].apply(regular_expressions)

In [75]:
# Print the same sample rows again (positions were computed before cleaning)
# for a before/after comparison.
# NOTE(review): the saved output shows URLs removed entirely, while the
# current regular_expressions substitutes "link" -- the output looks stale.
show_text_samples(train['text'], text)

our deeds are the reason of this earthquake may allah forgive us all
forest fire near la ronge sask  canada
all residents asked to shelter in place are being notified by officers  no other evacuation or shelter in place orders are expected
13000 people receive wildfires evacuation orders in california 
rockyfire update  california hwy  20 closed in both directions due to lake county fire   cafire wildfires
rockyfire update  california hwy  20 closed in both directions due to lake county fire   cafire wildfires
rockyfire update  california hwy  20 closed in both directions due to lake county fire   cafire wildfires
what is up man
what a goooooooaaaaaal
london is cool 
london is cool 
cooool  
bbcmtd wholesale markets ablaze 
bbcmtd wholesale markets ablaze 
barbados bridgetown jamaica   two cars set ablaze  santa cruz   head of the st elizabeth police superintende     
barbados bridgetown jamaica   two cars set ablaze  santa cruz   head of the st elizabeth police superintende     
barba

## Resultado

In [76]:
# Display the cleaned training frame.
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,ariaahrary thetawniest the out of control wild...,1
7610,10871,,,m1 94 01 04 utc5km s of volcano hawaii,1
7611,10872,,,police investigating after an e bike collided ...,1


## Guardado del dataframe

In [77]:
# Save the cleaned training set; index=False leaves the row index out.
train.to_csv('train/train_limpio.csv', index=False)

# Test

In [13]:
# Load the raw test set (path is relative to the working directory).
test = pd.read_csv('test/test_original.csv')

In [14]:
# Display the raw test frame.
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


## Keyword

In [15]:
# Every 5th distinct raw keyword.
test['keyword'].unique()[::5]

array([nan, 'ambulance', 'army', 'avalanche', 'blazing', 'blood',
       'body%20bags', 'buildings%20burning', 'bush%20fires',
       'chemical%20emergency', 'collided', 'crushed', 'dead', 'deluged',
       'derailed', 'destroyed', 'devastation', 'drowned', 'electrocuted',
       'epicentre', 'exploded', 'fatalities', 'first%20responders',
       'floods', 'harm', 'hijack', 'hurricane', 'inundation',
       'mass%20murder', 'military', 'obliterate', 'pandemonium',
       'quarantined', 'rescue', 'rubble', 'screams', 'sirens',
       'structural%20failure', 'survive', 'threat', 'trapped', 'twister',
       'war%20zone', 'wildfire', 'wreckage'], dtype=object)

In [16]:
# Overwrite the column with the cleaned, stemmed keywords.
test['keyword'] = test['keyword'].apply(clean_keyword)

In [17]:
# Distinct keywords after cleaning.
test['keyword'].unique()

array([nan, 'ablaz', 'accid', 'aftershock', 'airplan accid', 'ambul',
       'annihil', 'apocalyps', 'armageddon', 'armi', 'arson', 'arsonist',
       'attack', 'avalanch', 'battl', 'bioterror', 'blaze', 'bleed',
       'blew', 'blight', 'blizzard', 'blood', 'bloodi', 'blown',
       'bodi bag', 'bomb', 'bridg collaps', 'build burn', 'build fire',
       'burn', 'burn build', 'bush fire', 'casualti', 'catastroph',
       'chemic emerg', 'cliff fall', 'collaps', 'collid', 'collis',
       'crash', 'crush', 'curfew', 'cyclon', 'damag', 'danger', 'dead',
       'death', 'debri', 'delug', 'demolish', 'demolit', 'derail',
       'desol', 'destroy', 'destruct', 'deton', 'devast', 'disast',
       'displac', 'drought', 'drown', 'dust storm', 'earthquak',
       'electrocut', 'emerg', 'emerg plan', 'emerg servic', 'engulf',
       'epicentr', 'evacu', 'explod', 'explos', 'eyewit', 'famin',
       'fatal', 'fear', 'fire', 'fire truck', 'first respond', 'flame',
       'flatten', 'flood', 'fores

In [18]:
# Number of distinct cleaned keywords (unique() keeps NaN as one value).
len(test['keyword'].unique())

167

## Location

In [19]:
# Number of distinct raw locations, as a baseline before cleaning.
len(test['location'].unique())

1603

In [20]:
# Every 50th distinct raw location.
test['location'].unique()[::50]

array([nan, 'USA ', 'Chicago, IL', 'East Texas', 'Oh.',
       '? the Foothills of SC ?', 'Somewhere on the Earth',
       'gateway regional hs', 'Leanbox?', 'The Open road!',
       'Toronto, Ontario, Canada', 'Camberville (Bostonish)',
       'Cardiff/London/NYC/Warwick', 'Please follow and RT! :)',
       'FEMA REGION 2', 'Isle of Patmos', 'West Palm Beach, FL',
       'Santa Cruz, CA', 'Desloge, Mo', 'Dark Night. ???? ??',
       'Limerick, Ireland.', 'Wickford', 'The Woodlands, TX',
       'trashcan somewhere in hell', '9.25.14?8.5.15?10.6.15 | gen?',
       ' Kaijo High School', 'Rice Lake, WI/Toronto, ON', 'NJ, USA',
       'St. Catharines', 'OKC', 'Okinawa', 'Sheffield.?',
       'Acey mountain islanddåÇTorontoåÈ'], dtype=object)

In [21]:
# Overwrite the column with the normalised locations.
# NOTE(review): unlike the train section, the manual 'M!$$!$$!PP!' ->
# 'Mississippi' fix is not applied here before cleaning.
test['location'] = test['location'].apply(clean_location)

In [22]:
# Every 50th distinct location after cleaning.
test['location'].unique()[::50]

array([nan, 'worldwide', 'reality based world',
       'denver colorado united states of america', 'twitchtvdgnesports',
       'dont stalk me thanks', 'cin city', 'mid north coast of nsw',
       'eugene or', 'piscataway nj', 'eastbourne east sussex',
       'st louis missouri', 'palacio madrid', 'jupiter',
       'paonia colorado', 'alexander iowa', 'lampe mo', 'stgo chile',
       'california central valley', 'beijing china', 'dili east timor',
       'theythem', 'colorado the mile high city', 'benua ltd',
       'london herts', 'virginia beach va united states of america',
       'portland oregon united states of america', 'louavul ky',
       'where eer the mood takes me'], dtype=object)

In [23]:
# Number of distinct locations after cleaning, to compare with the raw count.
len(test['location'].unique())

1437

## Text

In [24]:
# Index the special characters in the raw test tweets, then print one
# example tweet per character.
# NOTE: this rebinds `text`, replacing the mapping built for the train set.
text = search_text_samples(test['text'])
show_text_samples(test['text'], text)

Heard about #earthquake is different cities, stay safe everyone.
Heard about #earthquake is different cities, stay safe everyone.
Heard about #earthquake is different cities, stay safe everyone.
We're shaking...It's an earthquake
They'd probably still show more life than Arsenal did yesterday, eh? EH?
Hey! How are you?
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
@sunkxssedharry will you wear shorts for race ablaze ?
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI
#PreviouslyOnDoyinTv: Toke MakinwaÛªs ma

In [25]:
# First pass: literal substitutions (HTML entities, mis-encoded characters).
test['text'] = test['text'].apply(replace_text)

In [26]:
# Second pass: lowercase, replace URLs, expand contractions, drop punctuation.
# NOTE(review): as in the train section, clean_text is not applied here.
test['text'] = test['text'].apply(regular_expressions)

In [27]:
# Print the same sample rows again (positions were computed before cleaning)
# for a before/after comparison.
show_text_samples(test['text'], text)

heard about earthquake is different cities stay safe everyone 
heard about earthquake is different cities stay safe everyone 
heard about earthquake is different cities stay safe everyone 
we are shaking   it is an earthquake
they would probably still show more life than arsenal did yesterday eh eh
hey how are you
birmingham wholesale market is ablaze bbc news   fire breaks out at birmingham is wholesale market 
birmingham wholesale market is ablaze bbc news   fire breaks out at birmingham is wholesale market 
birmingham wholesale market is ablaze bbc news   fire breaks out at birmingham is wholesale market 
sunkxssedharry will you wear shorts for race ablaze 
previouslyondoyintv  toke makinwa is marriage crisis sets nigerian twitter ablaze    
previouslyondoyintv  toke makinwa is marriage crisis sets nigerian twitter ablaze    
previouslyondoyintv  toke makinwa is marriage crisis sets nigerian twitter ablaze    
psa  i am splitting my personalities    techies follow ablazeco  burners 

## Guardado del dataframe

In [28]:
# Save the cleaned test set; index=False leaves the row index out.
test.to_csv('test/test_limpio.csv', index=False)