In [1]:
## GENERAL LIBRARY IMPORTS.
# Standard library.
import datetime as DT
import json
import os
import re
import sys
import warnings
from urllib.request import urlopen

# Third-party.
import descartes
import geocoder
#import geopandas as gpd
import matplotlib.pyplot as plt
import networkx as nx
import nltk
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from shapely.geometry import Point, Polygon

# SKLEARN.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# RANDOM FOREST.
from sklearn.ensemble import RandomForestRegressor

# XGBOOST.
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
#NLTK: https://www.nltk.org
from langdetect import detect
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

#WORDCLOUD
from PIL import Image
from wordcloud import WordCloud

# CONFIGURATION.
# Notebook-wide display settings. The order of the plotting calls is kept
# as-is: base matplotlib style, then the seaborn theme, then the figure size.
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Start from matplotlib's default style sheet.
plt.style.use('default')
# Show floats with a thousands separator and two decimals, padded to width 20.
pd.options.display.float_format = '{:20,.2f}'.format
# Silence every warning. NOTE(review): this also hides deprecation warnings
# emitted by pandas/sklearn, which makes version breakage harder to notice.
warnings.filterwarnings('ignore')
# Apply the seaborn "whitegrid" theme on top of the matplotlib defaults.
sns.set(style="whitegrid") 
# Default figure size (width, height) in inches.
plt.rcParams['figure.figsize'] = (15, 10)
# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

In [3]:
def clean_tweet(words):
    """Keep only the alphanumeric, non-numeric tokens of a tweet.

    Every token is stripped of surrounding whitespace first. A token survives
    when all of its characters are alphanumeric and it is not made up solely
    of digits, so tokens carrying punctuation, ``@`` or ``#`` are discarded.

    Args:
        words: Iterable of string tokens.

    Returns:
        list: The stripped tokens that passed the filter, in input order.
    """
    trimmed_tokens = (token.strip() for token in words)
    return [
        token
        for token in trimmed_tokens
        if token.isalnum() and not token.isdigit()
    ]

def filtrarPalabras(miArray):
    """Collect the hashtag tokens of a tweet into a single string.

    Args:
        miArray: Iterable of tokens; each one is converted with ``str``
            before being inspected.

    Returns:
        str: Every token whose text contains ``#``, each preceded by one
        space and concatenated in input order. Empty string when no token
        contains ``#``.
    """
    tokens_as_text = map(str, miArray)
    return ''.join(' ' + token for token in tokens_as_text if '#' in token)

### =====================================================================
### LECTURA DE CSV.
### =====================================================================

In [4]:
# Load the source CSV files (train, test and the sample submission).
# Dataset: https://www.kaggle.com/c/nlp-getting-started
original_train, original_test, original_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

### =====================================================================
### PROCESAMIENTO DE DATOS.
### =====================================================================

In [5]:
# Replace every missing value with the placeholder text 'vacio' (train & test).
original_train = original_train.fillna('vacio')
original_test = original_test.fillna('vacio')

In [6]:
# Lowercase the three string columns of both datasets (train & test).
for frame in (original_train, original_test):
    for column in ('text', 'location', 'keyword'):
        frame[column] = frame[column].str.lower()

In [7]:
# Tweet length in characters, word count and the list of words (train & test).
for frame in (original_train, original_test):
    # Whitespace tokenisation; splitting with no separator already ignores
    # leading and trailing whitespace, so no explicit strip is required.
    tokens = frame['text'].str.split()
    frame['length'] = frame['text'].str.len()
    frame['totalwords'] = tokens.str.len()
    frame['words'] = tokens

In [8]:
# Build the vocabulary of words that may indicate a disaster (train & test).
# Fixes over the previous version: the misspelled entries 'euption' and
# 'warrning' are now 'eruption' and 'warning', so they can match correctly
# spelled tweet tokens.
# NOTE(review): multi-word entries ('forest fire', 'forest fires', 'heat wave')
# cannot match when the vocabulary is intersected with single whitespace-split
# tokens, which is how the `matches` feature is computed in this notebook.
selected_words_singular = [
    'fire', 'flood', 'inundate', 'earthquake', 'quake', 'deluge', 'eruption',
    'twister', 'tornado', 'hurricane', 'landslide', 'typhoon', 'wildfire',
    'forest fire', 'drought', 'avalanche', 'urgent', 'important', 'danger',
    'warning', 'evacuation',
]
# NOTE(review): 'rashes' looks like a truncated word (possibly 'crashes') - confirm.
selected_words_plural = [
    'fires', 'floods', 'earthquakes', 'quakes', 'deluges', 'rashes',
    'tornadoes', 'hurricanes', 'landslides', 'typhoons', 'wildfires',
    'forest fires', 'droughts', 'avalanches',
]
selected_words_other = [
    'heat wave', 'died', 'flooding', 'flooded', 'damage', 'urgent',
    'important', 'danger', 'warning', 'help', 'evacuation',
]
# Every keyword value present in the training set (one entry per row).
col_one_list = original_train['keyword'].tolist()
selected_words = selected_words_singular + selected_words_plural + selected_words_other + col_one_list
# Set of unique words used for fast membership tests.
s = set(selected_words)
# 'vacio' is the placeholder written into empty keyword cells, not a real
# keyword; keep it out of the matching vocabulary.
s.discard('vacio')

In [9]:
# For each tweet: extract its hashtags and count how many distinct words
# belong to the disaster vocabulary `s` (train).
original_train = original_train.assign(
    hashtags=list(map(filtrarPalabras, original_train.words)),
    matches=[len(s.intersection(tokens)) for tokens in original_train.words],
)
# Same two features for the test set.
original_test = original_test.assign(
    hashtags=list(map(filtrarPalabras, original_test.words)),
    matches=[len(s.intersection(tokens)) for tokens in original_test.words],
)

In [10]:
# Count hashtags, '!'/'?' marks and a wider set of symbols (train & test).
for frame in (original_train, original_test):
    tweet_text = frame['text']
    # Number of '!' plus number of '?' characters in each tweet.
    exclamations_and_questions = tweet_text.str.count('[!]') + tweet_text.str.count('[?]')
    frame['hashtagsCantidad'] = frame['hashtags'].str.count('#')
    frame['preguntas'] = exclamations_and_questions
    # Same count, extended with the '=' and '>' characters.
    frame['simbolos'] = (
        exclamations_and_questions
        + tweet_text.str.count('=')
        + tweet_text.str.count('>')
    )

In [11]:
# Flag (1/0) whether each tweet carries a location and a keyword (train & test).
# A value equal to the 'vacio' placeholder counts as absent.
for frame in (original_train, original_test):
    frame['conUbicacion'] = (frame['location'] != 'vacio').astype('int64')
    frame['conKeyword'] = (frame['keyword'] != 'vacio').astype('int64')

In [12]:
# One-hot encode `keyword` (train & test).
# - dtype is pinned to uint8 so the output is 0/1 on every pandas version
#   (pandas >= 2.0 would otherwise produce boolean columns).
# - The test dummies are aligned to the train columns: a keyword seen only in
#   train becomes an all-zero column in test, and a keyword seen only in test
#   is dropped, so both frames always expose the same feature columns.
# - `keyword` is removed with `drop(columns=...)`; passing the axis
#   positionally (`drop('keyword', 1)`) raises TypeError on pandas >= 2.0.
dummies_train = pd.get_dummies(original_train['keyword'], drop_first=False, dtype=np.uint8)
dummies_test = (
    pd.get_dummies(original_test['keyword'], drop_first=False, dtype=np.uint8)
    .reindex(columns=dummies_train.columns, fill_value=0)
    .astype(np.uint8)  # reindex may upcast the columns it had to create
)
original_train = pd.concat([original_train.drop(columns='keyword'), dummies_train], axis=1)
original_test = pd.concat([original_test.drop(columns='keyword'), dummies_test], axis=1)

In [13]:
# Drop the columns that are not numeric (train & test).
# `drop(columns=...)` is used because passing the axis positionally
# (`drop('text', 1)`) raises TypeError on pandas >= 2.0.
non_numeric_columns = ['text', 'words', 'location', 'hashtags']
original_train = original_train.drop(columns=non_numeric_columns)
original_test = original_test.drop(columns=non_numeric_columns)

### =====================================================================
### RESGUARDAMOS LOS DATOS EN CSV.
### =====================================================================

In [14]:
# Preview the first rows of the processed TRAIN dataframe.
original_train.head()

Unnamed: 0,id,target,length,totalwords,matches,hashtagsCantidad,preguntas,simbolos,conUbicacion,conKeyword,...,weapons,whirlwind,wild%20fires,wildfire,windstorm,wounded,wounds,wreck,wreckage,wrecked
0,1,1,69,13,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,1,38,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,1,133,22,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,1,65,8,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,7,1,88,16,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Summary of the processed TRAIN dataframe: index range, column count, dtypes and memory usage.
original_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Columns: 232 entries, id to wrecked
dtypes: int64(10), uint8(222)
memory usage: 2.2 MB


In [16]:
# Preview the first rows of the processed TEST dataframe.
original_test.head()

Unnamed: 0,id,length,totalwords,matches,hashtagsCantidad,preguntas,simbolos,conUbicacion,conKeyword,ablaze,...,weapons,whirlwind,wild%20fires,wildfire,windstorm,wounded,wounds,wreck,wreckage,wrecked
0,0,34,6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,64,9,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,96,19,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,40,4,1,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,45,8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Preview the first rows of the SAMPLE_SUBMISSION dataframe.
original_sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [18]:
# Persist the processed datasets as CSV, without the index column.
# The output directory is created first so the cell also works on a fresh
# checkout where 'data/processed' does not exist yet.
os.makedirs('data/processed', exist_ok=True)
original_train.to_csv('data/processed/original_train.csv', index=False)
original_test.to_csv('data/processed/original_test.csv', index=False)

### =====================================================================
### FIN.
### =====================================================================