# Imports

In [77]:
import numpy as np
import pandas as pd
import re
import nltk #pip install nltk
from nltk.corpus import stopwords
# Fetch the NLTK data packages named below. 'stopwords' backs the
# nltk.corpus.stopwords reader imported above; 'punkt' is presumably the
# tokenizer data needed by nltk.word_tokenize further down - TODO confirm.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ben/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Funciones

In [78]:
# Shared Porter stemmer instance; clean_keyword and clean_text call
# porter.stem on every token they keep.
porter=nltk.PorterStemmer()

## keyword

In [79]:
def clean_space(keyword):
    """Replace URL-encoded spaces ("%20") in a keyword with real spaces.

    Parameters
    ----------
    keyword : str or missing value
        Raw keyword. Any non-string value (NaN, None, ...) is treated as
        missing.

    Returns
    -------
    str or the input object
        The keyword with every "%20" turned into " ", or the input itself,
        unchanged, when it is not a string.
    """
    # An identity test against np.nan only recognises that one singleton; a
    # NaN produced any other way (or None) would reach the string operation
    # and raise TypeError. Checking for str covers every missing marker.
    if not isinstance(keyword, str):
        return keyword
    return keyword.replace("%20", " ")

In [80]:
def clean_keyword(keyword):
    """Normalise a keyword: decode "%20", drop English stop words, stem the rest.

    Parameters
    ----------
    keyword : str or missing value
        Raw keyword. Any non-string value (NaN, None, ...) is treated as
        missing.

    Returns
    -------
    str or the input object
        Space-joined Porter stems of the tokens that are not English stop
        words, or the input itself, unchanged, when it is not a string.
    """
    # Checking for str (instead of identity with np.nan) keeps every kind of
    # missing marker away from the tokenizer.
    if not isinstance(keyword, str):
        return keyword

    keyword = clean_space(keyword)

    # stopwords.words() builds a fresh list on every call; evaluate it once
    # per keyword and use a set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))

    stems = [
        porter.stem(word)
        for word in nltk.word_tokenize(keyword)
        if word not in english_stopwords
    ]
    return " ".join(stems)

## location

In [81]:
def clean_location(location):
    """Normalise a free-text location into lowercase, space-separated words.

    The value is lowercased, stripped of every character other than a-z, ñ,
    comma, space and hyphen, commas/hyphens are turned into spaces, a fixed
    set of abbreviations is expanded, runs of spaces are collapsed and the
    result is trimmed.

    Parameters
    ----------
    location : str or missing value
        Raw location. Any non-string value (NaN, None, ...) is treated as
        missing.

    Returns
    -------
    str or the input object
        The normalised location, or the input itself, unchanged, when it is
        not a string.
    """
    # Checking for str (instead of identity with np.nan) also covers None and
    # NaN floats that are not the np.nan singleton.
    if not isinstance(location, str):
        return location

    # Whole-word abbreviations and their expansions, applied in this order.
    # Note that `usa*` also matches a bare "us".
    abbreviations = (
        (r"\busa*\b", "united states of america"),
        (r"\buk\b", "united kingdom"),
        (r"\bny\b", "new york"),
        (r"\bnyc\b", "new york city"),
        (r"\bca\b", "california"),
        (r"\bnc\b", "north carolina"),
        (r"\btx\b", "texas"),
        (r"\bga\b", "georgia"),
        (r"\bncr\b", "national capital region of india"),
        (r"\bfl\b", "florida"),
        (r"\bsfo\b", "san francisco"),
        (r"\bco\b", "colorado"),
        (r"\boh\b", "ohio"),
        (r"\bok\b", "oklahoma"),
        (r"\bnj\b", "new jersey"),
        (r"\bil\b", "illinois"),
        (r"\bbc\b", "british columbia"),
        (r"\bnv\b", "nevada"),
        (r"\bwy\b", "wyoming"),
    )

    location = location.lower()
    location = re.sub("[^a-zñ, -]", "", location)
    location = re.sub("[,-]", " ", location)

    for pattern, expansion in abbreviations:
        location = re.sub(pattern, expansion, location)

    location = re.sub(" +", " ", location)

    # Trim last: separators at either end have only just become spaces, so
    # trimming earlier would leave values such as "austin texas ".
    return location.strip(" ")

## text

In [82]:
def replace_text(text):
    """Apply a fixed sequence of literal substitutions to ``text``.

    The table covers HTML-entity leftovers, a handful of mis-decoded
    character sequences, tildes, backticks, newlines and two control bytes.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text after every substitution has been applied.
    """
    # Applied top to bottom; the order is significant because later entries
    # (for example the lone 'Û') are prefixes of earlier, longer ones.
    substitutions = (
        ('&amp', 'and'),
        ('amp;', 'and'),
        ('~', ' '),
        ('Ûª', '\''),
        ('ÛÒ', ' '),
        ('ÛÓ', ' '),
        ('&gt;', ' '),
        ('&lt;', ' '),
        ('ÛÏ', ' '),
        ('Û', ' '),
        ('\n', ' '),
        ('å«', '\''),
        ('åÊ', ' '),
        ('åÇ', ' '),
        ('Ì©', 'e'),
        ('Ì¤', 'c'),
        ('Ì¼', 'u'),
        ('Ì_', 'o'),
        ('`', '\''),
        ('\x89', ''),
        ('\x9d', ''),
    )

    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [83]:
# Letters whose repeated runs regular_expressions collapses to ONE occurrence.
one_letter="adhijklqruvwxyz"
# Letters whose repeated runs regular_expressions collapses to TWO occurrences.
double_letters="bcefgmnopst"

In [84]:
def regular_expressions(text):
    """Normalise text into lowercase a-z words, spaces and "number" tokens.

    Steps, in order: lowercase; replace every http/https URL with "link";
    expand a few English contractions; turn . / + - ÷ * : into spaces; drop
    isolated ' and # tokens; delete every character other than a-z, 0-9 and
    space; collapse repeated letters according to the module-level
    ``one_letter`` and ``double_letters`` strings; replace every run of
    digits with the word "number" surrounded by spaces.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text. Runs of spaces are NOT collapsed here.
    """
    # All patterns are raw strings: sequences such as \S, \. or \d are not
    # valid Python string escapes and trigger warnings in non-raw literals.
    text = text.lower()
    text = re.sub(r"(http|https)\S+", "link", text)

    # NOTE(review): every contraction pattern requires a trailing space (and
    # "can't" a leading one too), so a contraction at the very end of the
    # text is not expanded - confirm this is intended.
    contractions = (
        (r" can('*)t ", " cannot "),
        (r"n't ", " not "),
        (r"'s ", " is "),
        (r"'re ", " are "),
        (r"'ve ", " have "),
        (r"'d ", " would "),
        (r"'m ", " am "),
    )
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    text = re.sub(r"\.|/|\+|-|÷|\*|:", " ", text)
    text = re.sub(r"( ' )|( # )", " ", text)
    text = re.sub(r"[^a-z 0-9]", "", text)

    # Runs of two or more of these letters shrink to a single letter.
    for letter in one_letter:
        text = re.sub(letter + "{2,}", letter, text)

    # Generalise to every word containing more than two equal letters in a
    # row: such runs shrink to exactly two letters.
    for letter in double_letters:
        double = letter + letter
        text = re.sub(double + "+", double, text)

    # Only ASCII digits survive the character filter above, so padding each
    # digit run with spaces and renaming it can be done in one substitution.
    text = re.sub(r"\d+", " number ", text)

    return text

In [85]:
def clean_text(text):
    """Normalise text, drop English stop words and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Text to clean; it is first passed through regular_expressions.

    Returns
    -------
    str
        Space-joined stems of the tokens that are not English stop words.
    """
    normalized = regular_expressions(text)

    # stopwords.words() builds a fresh list on every call; evaluate it once
    # per text and use a set so each membership test is O(1).
    english_stopwords = set(stopwords.words('english'))

    stems = [
        porter.stem(token)
        for token in nltk.word_tokenize(normalized)
        if token not in english_stopwords
    ]

    text = " ".join(stems)
    text = re.sub(" +", " ", text)  # collapse multiple spaces

    return text

## Extras

In [86]:
# Characters search_text_samples treats as ordinary: lowercase ASCII letters,
# digits and the space. Anything else is reported as a special character.
english = 'abcdefghijklmnopqrstuvwxyz0123456789 '
# Number of characters in `english`; not referenced by the cells shown here.
len_english = len(english)

In [87]:
def search_text_samples(col_text):
    """Index the rows of ``col_text`` by the special characters they contain.

    A character is "special" when its lowercase form is not in the
    module-level ``english`` string.

    Parameters
    ----------
    col_text : iterable of str
        Texts to scan, e.g. a pandas Series; positions are counted from 0 in
        iteration order.

    Returns
    -------
    dict
        Maps each special character to the list of row positions where it
        occurs, in scan order. A position is repeated once per occurrence of
        the character in that row.
    """
    occurrences = {}
    for position, value in enumerate(col_text):
        # Lowercase once per row rather than once per character, and walk the
        # lowercased string itself so its length is always the one used.
        for char in value.lower():
            if char not in english:
                occurrences.setdefault(char, []).append(position)
    return occurrences

In [88]:
def show_text_samples(col_text, text):
    """Print one example row for every entry of ``text``.

    Parameters
    ----------
    col_text : pandas.Series
        Column the positions refer to; rows are fetched with ``.iloc``.
    text : dict
        Maps a key to a non-empty list of row positions, as returned by
        search_text_samples. Only the first position of each list is used.
    """
    for positions in text.values():
        first_position = positions[0]
        print(col_text.iloc[first_position])

# Train

In [121]:
# Load the raw training set (path is relative to the working directory) and
# end the cell on the frame so the notebook renders it.
train = pd.read_csv('train/train_original.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## keyword

In [122]:
# Keyword without stemming: only "%20" is turned into a space by clean_space.
train['keyword_sin_stemming'] = train['keyword'].apply(clean_space)

In [123]:
# Keyword with stemming: clean_keyword also drops English stop words and
# Porter-stems the remaining tokens.
train['keyword_con_stemming'] = train['keyword'].apply(clean_keyword)

## location

In [124]:
# Fix the stylised spelling first: clean_location removes '!' and '$', which
# would reduce 'M!$$!$$!PP!' to "mpp".
train['location'] = train['location'].replace('M!$$!$$!PP!', 'Mississippi')
# Overwrite the column with its normalised form.
train['location'] = train['location'].apply(clean_location)

## text

In [125]:
# Map every character outside `english` to the rows of the raw text that
# contain it, then print one example row per character - presumably to guide
# the substitutions made in replace_text.
text = search_text_samples(train['text'])
show_text_samples(train['text'], text)

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
Forest fire near La Ronge Sask. Canada
All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected
13,000 people receive #wildfires evacuation orders in California 
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires
What's up man?
What a goooooooaaaaaal!!!!!!
London is cool ;)
London is cool ;)
Cooool :)
@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
Barbados #Bridgetown JAMAICA ÛÒ Two cars set ablaze: SANTA CRUZ ÛÓ Head of the St Elizabeth Police Superintende...  http://t.co/wDUEaj8Q4J
Barbados #Bridg

In [126]:
# Overwrite the raw text with the replace_text result; the two cleaning cells
# below read this updated column.
train['text'] = train['text'].apply(replace_text)

In [127]:
# Text with stemming: regex normalisation, stop-word removal and Porter stems.
train['text_con_stemming'] = train['text'].apply(clean_text)

In [128]:
# Text without stemming: regex normalisation only (stop words are kept).
train['text_sin_stemming'] = train['text'].apply(regular_expressions)

In [129]:
# Drop the raw 'text', 'id' and 'keyword' columns and move the 'target' label
# to the last position. Filtering by name, rather than calling list.remove,
# lets this cell be executed again after `train` has been overwritten:
# list.remove would raise ValueError for columns that are already gone.
cols = [
    col for col in train.columns
    if col not in ('target', 'text', 'id', 'keyword')
]
cols.append('target')
train = train[cols]

## Resultado

In [130]:
# Show the cleaned training frame.
train

Unnamed: 0,location,keyword_sin_stemming,keyword_con_stemming,text_con_stemming,text_sin_stemming,target
0,,,,deed reason earthquak may alah forgiv us al,our deeds are the reason of this earthquake ma...,1
1,,,,forest fire near la rong sask canada,forest fire near la ronge sask canada,1
2,,,,al resid ask shelter place notifi offic evacu ...,al residents asked to shelter in place are bei...,1
3,,,,number peopl receiv wildfir evacu order califo...,number people receive wildfires evacuation o...,1
4,,,,got sent photo rubi alaska smoke wildfir pour ...,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...,...
7608,,,,two giant crane hold bridg colaps nearbi home ...,two giant cranes holding a bridge colapse into...,1
7609,,,,ariahrari thetawniest control wild fire califo...,ariahrary thetawniest the out of control wild ...,1
7610,,,,number number number number utc number km volc...,m number number number number utc numbe...,1
7611,,,,polic investig e bike colid car littl portug e...,police investigating after an e bike colided w...,1


## Guardado del dataframe

In [131]:
# Write the cleaned training set to disk; index=False leaves the row index
# out of the CSV.
train.to_csv('train/train_limpio.csv', index=False)

# Test

In [132]:
# Load the raw test set (path is relative to the working directory) and end
# the cell on the frame so the notebook renders it.
test = pd.read_csv('test/test_original.csv')
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


## Keyword

In [133]:
# Keyword without stemming: only "%20" is turned into a space by clean_space.
test['keyword_sin_stemming'] = test['keyword'].apply(clean_space)

In [134]:
# Keyword with stemming: clean_keyword also drops English stop words and
# Porter-stems the remaining tokens.
test['keyword_con_stemming'] = test['keyword'].apply(clean_keyword)

## Location

In [135]:
# Overwrite the column with its normalised form. Unlike the train section, no
# manual 'M!$$!$$!PP!' replacement is applied first.
# NOTE(review): confirm the test set contains no such value.
test['location'] = test['location'].apply(clean_location)

## Text

In [136]:
# Same inspection as for train: map every character outside `english` to the
# rows containing it and print one example row per character. This rebinds
# the notebook-level name `text`.
text = search_text_samples(test['text'])
show_text_samples(test['text'], text)

Heard about #earthquake is different cities, stay safe everyone.
Heard about #earthquake is different cities, stay safe everyone.
Heard about #earthquake is different cities, stay safe everyone.
We're shaking...It's an earthquake
They'd probably still show more life than Arsenal did yesterday, eh? EH?
Hey! How are you?
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
Birmingham Wholesale Market is ablaze BBC News - Fire breaks out at Birmingham's Wholesale Market http://t.co/irWqCEZWEU
@sunkxssedharry will you wear shorts for race ablaze ?
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI
#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriage crisis sets Nigerian Twitter ablaze... http://t.co/CMghxBa2XI
#PreviouslyOnDoyinTv: Toke MakinwaÛªs ma

In [137]:
# Overwrite the raw text with the replace_text result; the two cleaning cells
# below read this updated column.
test['text'] = test['text'].apply(replace_text)

In [138]:
# Text with stemming: regex normalisation, stop-word removal and Porter stems.
test['text_con_stemming'] = test['text'].apply(clean_text)

In [139]:
# Text without stemming: regex normalisation only (stop words are kept).
test['text_sin_stemming'] = test['text'].apply(regular_expressions)

In [140]:
# Drop the raw 'text' and 'keyword' columns; 'id' is kept for the test set.
# Filtering by name, rather than calling list.remove, lets this cell be
# executed again after `test` has been overwritten: list.remove would raise
# ValueError for columns that are already gone.
cols = [col for col in test.columns if col not in ('text', 'keyword')]
test = test[cols]

## Resultado

In [141]:
# Show the cleaned test frame.
test

Unnamed: 0,id,location,keyword_sin_stemming,keyword_con_stemming,text_con_stemming,text_sin_stemming
0,0,,,,happen teribl car crash,just happened a terible car crash
1,2,,,,heard earthquak differ citi stay safe everyon,heard about earthquake is different cities sta...
2,3,,,,forest fire spot pond gees flee across street ...,there is a forest fire at spot pond geese are ...
3,9,,,,apocalyps light spokan wildfir,apocalypse lighting spokane wildfires
4,11,,,,typhoon soudelor kil number china taiwan,typhoon soudelor kils number in china and ta...
...,...,...,...,...,...,...
3258,10861,,,,earthquak safeti lo angel safeti fasten xrwn,earthquake safety los angeles safety fastene...
3259,10865,,,,storm ri wors last hurican cityand number othe...,storm in ri worse than last huricane my citya...
3260,10868,,,,green line derail chicago link,green line derailment in chicago link
3261,10874,,,,meg issu hazard weather outlook hwo link,meg issues hazardous weather outlook hwo link


## Guardado del dataframe

In [142]:
# Write the cleaned test set to disk; index=False leaves the row index out of
# the CSV.
test.to_csv('test/test_limpio.csv', index=False)