In [22]:
## IMPORTACIÓN GENERAL DE LIBRERIAS.
import re 
import sys
import json
import nltk
import geocoder
import requests
import warnings
import descartes

import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import datetime as DT
#import geopandas as gpd
import matplotlib.pyplot as plt

# SKLEARN.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# RANDOM FOREST.
from urllib.request import urlopen
#from shapely.geometry import Point, Polygon
from sklearn.ensemble import RandomForestRegressor

# XGBOOST.
from xgboost import XGBClassifier
import xgboost as xgb

In [23]:
#NLTK: https://www.nltk.org
from langdetect import detect
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

#WORDCLOUD
from PIL import Image
from wordcloud import WordCloud

# CONFIGURACIÓN.
%matplotlib inline
plt.style.use('default')
pd.options.display.float_format = '{:20,.2f}'.format
warnings.filterwarnings('ignore')
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)
np.set_printoptions(threshold=sys.maxsize)

In [24]:
#Importacion del archivo CSV de fuente
#https://www.kaggle.com/c/nlp-getting-started
original_train = pd.read_csv('data/train.csv')
original_test = pd.read_csv('data/test.csv')
original_sample_submission = pd.read_csv('data/sample_submission.csv')

In [25]:
#PASAMOS TODO A MINÚSCULAS (TRAIN).
original_train['text'] = original_train['text'].str.lower()
original_train['location'] = original_train['location'].str.lower()
original_train['keyword'] = original_train['keyword'].str.lower()
#PASAMOS TODO A MINÚSCULAS (TEST).
original_test['text'] = original_test['text'].str.lower()
original_test['location'] = original_test['location'].str.lower()
original_test['keyword'] = original_test['keyword'].str.lower()

In [26]:
#LONGITUD DEL TWEET Y CANTIDAD DE PALABRAS (TRAIN).
original_train['length'] = original_train['text'].str.len()
original_train['totalwords'] = original_train['text'].str.split().str.len()
original_train['words'] = original_train.text.str.strip().str.split()
#LONGITUD DEL TWEET Y CANTIDAD DE PALABRAS (TEST).
original_test['length'] = original_test['text'].str.len()
original_test['totalwords'] = original_test['text'].str.split().str.len()
original_test['words'] = original_test.text.str.strip().str.split()

In [27]:
#BUSCAMOS LAS COINCIDENCIAS CON PALABRAS CLAVES Y SEPARAMOS LOS HASHTAGS (TRAIN).
original_train = original_train.assign(hashtags=[filtrarPalabras(el) for el in original_train.words])
#BUSCAMOS LAS COINCIDENCIAS CON PALABRAS CLAVES Y SEPARAMOS LOS HASHTAGS (TEST).
original_test = original_test.assign(hashtags=[filtrarPalabras(el) for el in original_test.words])

In [28]:
#CONTAMOS LA CANTIDAD DE HASHTAGS (TRAIN).
original_train['hashtagsCantidad'] = original_train['hashtags'].str.count('#')
original_train['preguntas'] = original_train['text'].str.count('[!]') + original_train['text'].str.count('[?]')
original_train['simbolos'] = original_train['text'].str.count('[!]') + original_train['text'].str.count('[?]') + original_train['text'].str.count('=') + original_train['text'].str.count('>')
#CONTAMOS LA CANTIDAD DE HASHTAGS (TEST).
original_test['hashtagsCantidad'] = original_test['hashtags'].str.count('#')
original_test['preguntas'] = original_test['text'].str.count('[!]') + original_test['text'].str.count('[?]')
original_test['simbolos'] = original_test['text'].str.count('[!]') + original_test['text'].str.count('[?]') + original_test['text'].str.count('=') + original_test['text'].str.count('>')

In [29]:
original_train.head()

Unnamed: 0,id,keyword,location,text,target,length,totalwords,words,hashtags,hashtagsCantidad,preguntas,simbolos
0,1,,,our deeds are the reason of this #earthquake m...,1,69,13,"[our, deeds, are, the, reason, of, this, #eart...",#earthquake,1,0,0
1,4,,,forest fire near la ronge sask. canada,1,38,7,"[forest, fire, near, la, ronge, sask., canada]",,0,0,0
2,5,,,all residents asked to 'shelter in place' are ...,1,133,22,"[all, residents, asked, to, 'shelter, in, plac...",,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,65,8,"[13,000, people, receive, #wildfires, evacuati...",#wildfires,1,0,0
4,7,,,just got sent this photo from ruby #alaska as ...,1,88,16,"[just, got, sent, this, photo, from, ruby, #al...",#alaska #wildfires,2,0,0


In [30]:
#MARCAMOS QUIENES USAN UBICACIÓN Y PALABRAS CLAVES (TRAIN).
original_train['conUbicacion'] = 0
original_train['conKeyword'] = 0
original_train.loc[original_train['location'] != 'vacio', ['conUbicacion']] = 1
original_train.loc[original_train['keyword'] != 'vacio', ['conKeyword']] = 1
#MARCAMOS QUIENES USAN UBICACIÓN Y PALABRAS CLAVES (TEST).
original_test['conUbicacion'] = 0
original_test['conKeyword'] = 0
original_test.loc[original_test['location'] != 'vacio', ['conUbicacion']] = 1
original_test.loc[original_test['keyword'] != 'vacio', ['conKeyword']] = 1

In [32]:
#HOT ENCODING PARA KEYWORD (TRAIN).
dummies = pd.get_dummies(original_train['keyword'], drop_first=False)
original_train = pd.concat([original_train, dummies], axis=1)
original_train.drop('keyword', 1, inplace = True)
#HOT ENCODING PARA KEYWORD (TEST).
dummies = pd.get_dummies(original_test['keyword'], drop_first=False)
original_test = pd.concat([original_test, dummies], axis=1)
original_test.drop('keyword', 1, inplace = True)

In [34]:
#ELIMINAMOS LAS COLUMNAS QUE NO SON NUMÉRICAS (TRAIN).
original_train.drop('words', 1, inplace = True)
original_train.drop('location', 1, inplace = True)
original_train.drop('hashtags', 1, inplace = True)
#ELIMINAMOS LAS COLUMNAS QUE NO SON NUMÉRICAS (TEST).
original_test.drop('words', 1, inplace = True)
original_test.drop('location', 1, inplace = True)
original_test.drop('hashtags', 1, inplace = True)

In [35]:
original_train.head()

Unnamed: 0,id,text,target,length,totalwords,hashtagsCantidad,preguntas,simbolos,conUbicacion,conKeyword,...,weapons,whirlwind,wild%20fires,wildfire,windstorm,wounded,wounds,wreck,wreckage,wrecked
0,1,our deeds are the reason of this #earthquake m...,1,69,13,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,4,forest fire near la ronge sask. canada,1,38,7,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,5,all residents asked to 'shelter in place' are ...,1,133,22,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,6,"13,000 people receive #wildfires evacuation or...",1,65,8,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,7,just got sent this photo from ruby #alaska as ...,1,88,16,2,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [36]:
original_train.to_csv('data/processed/train_to_keras.csv',index=False)
original_test.to_csv('data/processed/test_to_keras.csv',index=False)