In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import nltk
import string
import re
from os import path
from PIL import Image

from spellchecker import SpellChecker
from nltk import FreqDist
from collections import Counter
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer(strip_handles=True)



In [61]:
stop_words = stopwords.words('english')

In [62]:
df_train = pd.read_csv("dataset/train.csv")
df_test= pd.read_csv("dataset/test.csv")
pd.set_option('display.max_colwidth', -1) #elimino limite de truncado para visualizacion

  This is separate from the ipykernel package so we can avoid doing imports until


## Construccion de Features

In [63]:
# contador de palabras
df_train['word_count'] = df_train['text'].apply(lambda x: len(str(x).split()))
df_test['word_count'] = df_test['text'].apply(lambda x: len(str(x).split()))

# palabras unicas
df_train['unique_word_count'] = df_train['text'].apply(lambda x: len(set(str(x).split())))
df_test['unique_word_count'] = df_test['text'].apply(lambda x: len(set(str(x).split())))

# contador de stopwords
df_train['stop_word_count'] = df_train['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))
df_test['stop_word_count'] = df_test['text'].apply(lambda x: len([w for w in str(x).lower().split() if w in STOPWORDS]))

# contador de url
df_train['url_count'] = df_train['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))
df_test['url_count'] = df_test['text'].apply(lambda x: len([w for w in str(x).lower().split() if 'http' in w or 'https' in w]))

# extractor de emails
df_train['emails'] = df_train['text'].apply(lambda x: re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-]+)', x))
df_test['emails'] = df_test['text'].apply(lambda x: re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-]+)', x))

# contador de emails
df_train['emails_count'] = df_train['emails'].apply(lambda x: len(x))
df_test['emails_count'] = df_test['emails'].apply(lambda x: len(x))

# promedio del largo de palabras
df_train['mean_word_length'] = df_train['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_test['mean_word_length'] = df_test['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# contador de caracteres
df_train['char_count'] = df_train['text'].apply(lambda x: len(str(x)))
df_test['char_count'] = df_test['text'].apply(lambda x: len(str(x)))

# contador de numeros
df_train['number_count'] = df_train['text'].apply(lambda x: len([w for w in x.split() if w.isdigit()]))
df_test['number_count'] = df_test['text'].apply(lambda x: len([w for w in x.split() if w.isdigit()]))

# contador de mayuscuslas en palabras 
df_train['upper_count'] = df_train['text'].apply(lambda x: len([w for w in x.split() if w.isupper() and len(x)>3]))
df_test['upper_count'] = df_test['text'].apply(lambda x: len([w for w in x.split() if w.isupper() and len(x)>3]))

# contador de puntuaciones
df_train['punctuation_count'] = df_train['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
df_test['punctuation_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

# contador de hashtags
df_train['hashtag_count'] = df_train['text'].apply(lambda x: len([c for c in str(x) if c == '#']))
df_test['hashtag_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '#']))

# contador de menciones
df_train['mention_count'] = df_train['text'].apply(lambda x: len([c for c in str(x) if c == '@']))
df_test['mention_count'] = df_test['text'].apply(lambda x: len([c for c in str(x) if c == '@']))

## Limpieza del Texto

In [None]:
from pycontractions import Contractions
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

2020-08-03 23:53:44,562 : INFO : Creating /home/pablo/gensim-data




2020-08-04 00:08:29,615 : INFO : word2vec-google-news-300 downloaded
2020-08-04 00:08:29,701 : INFO : loading projection weights from /home/pablo/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [64]:
# con el fin de agilizar el tiempo para la construccion de algunos filtros se han recolectado de diferentes fuentes

contracciones = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "I will have",
  "i'm": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}
    
def convertir_contracciones(text):
    t=[]
    words=text.split()
    t = [contracciones[w.lower()] if w.lower() in contracciones.keys() else w for w in words]
    return ' '.join(t) 

df_train['text']=df_train['text'].apply(convertir_contracciones)
df_test['text']=df_test['text'].apply(convertir_contracciones)    



def limpieza(text):
            
    # elimino caracteres especiales
    text = re.sub(r"\x89Û_", "", text)
    text = re.sub(r"\x89ÛÒ", "", text)
    text = re.sub(r"\x89ÛÓ", "", text)
    text = re.sub(r"\x89ÛÏWhen", "When", text)
    text = re.sub(r"\x89ÛÏ", "", text)
    text = re.sub(r"China\x89Ûªs", "China's", text)
    text = re.sub(r"let\x89Ûªs", "let's", text)
    text = re.sub(r"\x89Û÷", "", text)
    text = re.sub(r"\x89Ûª", "", text)
    text = re.sub(r"\x89Û\x9d", "", text)
    text = re.sub(r"å_", "", text)
    text = re.sub(r"\x89Û¢", "", text)
    text = re.sub(r"\x89Û¢åÊ", "", text)
    text = re.sub(r"fromåÊwounds", "from wounds", text)
    text = re.sub(r"åÊ", "", text)
    text = re.sub(r"åÈ", "", text)
    text = re.sub(r"JapÌ_n", "Japan", text)    
    text = re.sub(r"Ì©", "e", text)
    text = re.sub(r"å¨", "", text)
    text = re.sub(r"SuruÌ¤", "Suruc", text)
    text = re.sub(r"åÇ", "", text)
    text = re.sub(r"å£3million", "3 million", text)
    text = re.sub(r"åÀ", "", text)
    
    # Reemplazo contracciones
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"We're", "We are", text)
    text = re.sub(r"That's", "That is", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"they're", "they are", text)
    text = re.sub(r"Can't", "Cannot", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"don\x89Ûªt", "do not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"What's", "What is", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"There's", "There is", text)
    text = re.sub(r"He's", "He is", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"You're", "You are", text)
    text = re.sub(r"I'M", "I am", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"wouldn't", "would not", text)
    text = re.sub(r"i'm", "I am", text)
    text = re.sub(r"I\x89Ûªm", "I am", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r"Isn't", "is not", text)
    text = re.sub(r"Here's", "Here is", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"you\x89Ûªve", "you have", text)
    text = re.sub(r"we're", "we are", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"we've", "we have", text)
    text = re.sub(r"it\x89Ûªs", "it is", text)
    text = re.sub(r"doesn\x89Ûªt", "does not", text)
    text = re.sub(r"It\x89Ûªs", "It is", text)
    text = re.sub(r"Here\x89Ûªs", "Here is", text)
    text = re.sub(r"who's", "who is", text)
    text = re.sub(r"I\x89Ûªve", "I have", text)
    text = re.sub(r"y'all", "you all", text)
    text = re.sub(r"can\x89Ûªt", "cannot", text)
    text = re.sub(r"would've", "would have", text)
    text = re.sub(r"it'll", "it will", text)
    text = re.sub(r"we'll", "we will", text)
    text = re.sub(r"wouldn\x89Ûªt", "would not", text)
    text = re.sub(r"We've", "We have", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"Y'all", "You all", text)
    text = re.sub(r"Weren't", "Were not", text)
    text = re.sub(r"Didn't", "Did not", text)
    text = re.sub(r"they'll", "they will", text)
    text = re.sub(r"they'd", "they would", text)
    text = re.sub(r"DON'T", "DO NOT", text)
    text = re.sub(r"That\x89Ûªs", "That is", text)
    text = re.sub(r"they've", "they have", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"should've", "should have", text)
    text = re.sub(r"You\x89Ûªre", "You are", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"Don\x89Ûªt", "Do not", text)
    text = re.sub(r"we'd", "we would", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"They're", "They are", text)
    text = re.sub(r"Can\x89Ûªt", "Cannot", text)
    text = re.sub(r"you\x89Ûªll", "you will", text)
    text = re.sub(r"I\x89Ûªd", "I would", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"you're", "you are", text)
    text = re.sub(r"i've", "I have", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"i'll", "I will", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"i'd", "I would", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"you'll", "you will", text)
    text = re.sub(r"I've", "I have", text)
    text = re.sub(r"Don't", "do not", text)
    text = re.sub(r"I'll", "I will", text)
    text = re.sub(r"I'd", "I would", text)
    text = re.sub(r"Let's", "Let us", text)
    text = re.sub(r"you'd", "You would", text)
    text = re.sub(r"It's", "It is", text)
    text = re.sub(r"Ain't", "am not", text)
    text = re.sub(r"Haven't", "Have not", text)
    text = re.sub(r"Could've", "Could have", text)
    text = re.sub(r"youve", "you have", text)  
    text = re.sub(r"donå«t", "do not", text)    
    
    return text


df_train['text']=df_train['text'].apply(lambda x: limpieza(x))
df_test['text']=df_test['text'].apply(lambda x: limpieza(x))


In [73]:
df_train.loc[500:510]

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,stop_word_count,url_count,emails,emails_count,mean_word_length,char_count,number_count,upper_count,punctuation_count,hashtag_count,mention_count
500,725,attacked,"LEALMAN, FLORIDA",Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/LHBZHWq4B9,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
501,726,attacked,"Los Angeles, CA",@envw98 @NickCoCoFree @JulieDiCaro @jdabe80 Why am I the worst person? Questioning how julie attacked him? Do you guys have no empathy?,0,21,21,9,0,[],0,5.47619,136,0,1,7,0,4
502,727,attacked,"San Francisco, CA",Kelly Osbourne attacked for racist Donald Trump remark about Latinos on The View http://t.co/7nAgdSAdWP,1,14,14,4,1,[],0,6.428571,103,0,0,5,0,0
503,728,attacked,#GDJB #ASOT,@eunice_njoki aiii she needs to chill and answer calmly its not like she is being attacked,0,15,15,8,0,[],0,5.0,89,0,0,3,0,1
504,729,attacked,"Groningen, Netherlands, Europe",Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/mXZ7yX8ld1,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
505,730,attacked,"Livingston, IL U.S.A.",Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/e4YDbM4Dx6,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
506,731,attacked,Arundel,Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/T1aa5Ov7Eg,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
507,732,attacked,,I attacked Robot-lvl 19 and i have earned a total of 6615434 free satoshis! http://t.co/DMLJ1aGoTw #robotcoingame #Bitcoin #FreeBitcoin,0,17,17,5,1,[],0,6.882353,133,2,1,11,3,0
508,734,attacked,America,Christian Attacked by Muslims at the Temple Mount after Waving Israeli Flag via Pamela Geller - ... http://t.co/EMDJNNltP0,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
509,735,attacked,"Anna Maria, FL",@christinalavv @lindsay_wynn3 I just saw these tweets and I feel really attacked,0,12,11,5,0,[],0,5.75,80,0,2,3,0,2


In [74]:
def eliminacion(text):
    # elimino Retweet RT
    text = re.sub(r'RT', "", text)
    
    # elimino urls
    text = re.sub(r"https?:\/\/t.co\/[A-Za-z0-9]+", "", text)
    
    # elimino puntuaciones
    text = re.sub('[^A-Z a-z 0-9-]+', '', text)
    
    # elimino espacios dobles
    text = re.sub(r"\s+", " ", text)
    
    # convierto a minisculas
    text = text.lower()
    
    
    '''
    # reemplazo caracteres atipicos
    text = re.sub(r"&gt;", " ", text)
    text = re.sub(r"&lt;", " ", text)
    text = re.sub(r"&amp;", " ", text)
    
    # elimino palabras contenidas en []
    text = re.sub('\[.*?\]', '', text)
    
    #Elimino las menciones, hashtag, urls.
    text = re.sub("@|&|#|https?\://\S+|www\.\S+", "", text)
    
    text = re.sub('<.*?>+', '', text)
    
    
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    
    #Elimino numeros y palabras que contengan numeros.
    text = re.sub('\w*\d\w*', '', text)
    
    '''
    
    return text


df_train['text']=df_train['text'].apply(lambda x: eliminacion(x))
df_test['text']=df_test['text'].apply(lambda x: eliminacion(x))

df_train.loc[500:510]

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,stop_word_count,url_count,emails,emails_count,mean_word_length,char_count,number_count,upper_count,punctuation_count,hashtag_count,mention_count
500,725,attacked,"LEALMAN, FLORIDA",christian attacked by muslims at the temple mount after waving israeli flag via pamela geller -,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
501,726,attacked,"Los Angeles, CA",envw98 nickcocofree juliedicaro jdabe80 why am i the worst person questioning how julie attacked him do you guys have no empathy,0,21,21,9,0,[],0,5.47619,136,0,1,7,0,4
502,727,attacked,"San Francisco, CA",kelly osbourne attacked for racist donald trump remark about latinos on the view,1,14,14,4,1,[],0,6.428571,103,0,0,5,0,0
503,728,attacked,#GDJB #ASOT,eunicenjoki aiii she needs to chill and answer calmly its not like she is being attacked,0,15,15,8,0,[],0,5.0,89,0,0,3,0,1
504,729,attacked,"Groningen, Netherlands, Europe",christian attacked by muslims at the temple mount after waving israeli flag via pamela geller -,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
505,730,attacked,"Livingston, IL U.S.A.",christian attacked by muslims at the temple mount after waving israeli flag via pamela geller -,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
506,731,attacked,Arundel,christian attacked by muslims at the temple mount after waving israeli flag via pamela geller -,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
507,732,attacked,,i attacked robot-lvl 19 and i have earned a total of 6615434 free satoshis robotcoingame bitcoin freebitcoin,0,17,17,5,1,[],0,6.882353,133,2,1,11,3,0
508,734,attacked,America,christian attacked by muslims at the temple mount after waving israeli flag via pamela geller -,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
509,735,attacked,"Anna Maria, FL",christinalavv lindsaywynn3 i just saw these tweets and i feel really attacked,0,12,11,5,0,[],0,5.75,80,0,2,3,0,2


In [77]:
def tokenizacion(text):
    
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenizado = tokenizer.tokenize(text)
    sin_stopwords = [w for w in tokenizado if w not in stop_words]
    combinado = ' '.join(sin_stopwords)
    
    return combinado
    
df_train['text']=df_train['text'].apply(lambda x: tokenizacion(x))
df_test['text']=df_test['text'].apply(lambda x: tokenizacion(x))

In [78]:
df_train.loc[500:510]

Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,stop_word_count,url_count,emails,emails_count,mean_word_length,char_count,number_count,upper_count,punctuation_count,hashtag_count,mention_count
500,725,attacked,"LEALMAN, FLORIDA",christian attacked muslims temple mount waving israeli flag via pamela geller,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
501,726,attacked,"Los Angeles, CA",envw98 nickcocofree juliedicaro jdabe80 worst person questioning julie attacked guys empathy,0,21,21,9,0,[],0,5.47619,136,0,1,7,0,4
502,727,attacked,"San Francisco, CA",kelly osbourne attacked racist donald trump remark latinos view,1,14,14,4,1,[],0,6.428571,103,0,0,5,0,0
503,728,attacked,#GDJB #ASOT,eunicenjoki aiii needs chill answer calmly like attacked,0,15,15,8,0,[],0,5.0,89,0,0,3,0,1
504,729,attacked,"Groningen, Netherlands, Europe",christian attacked muslims temple mount waving israeli flag via pamela geller,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
505,730,attacked,"Livingston, IL U.S.A.",christian attacked muslims temple mount waving israeli flag via pamela geller,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
506,731,attacked,Arundel,christian attacked muslims temple mount waving israeli flag via pamela geller,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
507,732,attacked,,attacked robot lvl 19 earned total 6615434 free satoshis robotcoingame bitcoin freebitcoin,0,17,17,5,1,[],0,6.882353,133,2,1,11,3,0
508,734,attacked,America,christian attacked muslims temple mount waving israeli flag via pamela geller,1,18,18,4,1,[],0,5.833333,122,0,0,9,0,0
509,735,attacked,"Anna Maria, FL",christinalavv lindsaywynn3 saw tweets feel really attacked,0,12,11,5,0,[],0,5.75,80,0,2,3,0,2


## Buscamos palabras comunes

In [8]:
palabrastrain = ' '.join(df_train['text'])
palabrastrain = palabrastrain.split()

palabrastest = ' '.join(df_test['text'])
palabrastest = palabrastest.split()

frecuenciastrain = pd.Series(palabrastrain).value_counts()
frecuenciastest = pd.Series(palabrastest).value_counts()

display(frecuenciastrain.head(50))
display(frecuenciastest.head(50))



-             779
like          345
amp           298
fire          251
get           229
new           226
via           220
people        196
news          194
one           193
us            187
would         175
video         165
2             159
emergency     157
disaster      153
police        141
still         129
body          125
burning       120
back          119
crash         119
california    117
storm         117
suicide       116
time          113
know          112
got           112
buildings     110
man           110
day           109
first         108
see           105
bomb          104
world         103
going         103
nuclear       102
love          101
cannot        99 
attack        99 
fires         99 
3             98 
youtube       98 
two           97 
killed        96 
dead          96 
go            96 
train         93 
full          92 
war           91 
dtype: int64

-            306
like         145
amp          136
fire         107
get          106
via          104
new          102
would        94 
news         89 
one          84 
people       83 
us           81 
2            74 
emergency    68 
attack       63 
disaster     61 
first        60 
video        60 
suicide      57 
police       56 
fires        52 
3            51 
still        50 
watch        50 
storm        49 
time         49 
hiroshima    49 
full         49 
crash        48 
burning      48 
got          48 
two          47 
going        46 
today        45 
body         44 
see          44 
love         44 
know         43 
make         43 
last         43 
go           43 
think        42 
life         41 
1            40 
buildings    40 
nuclear      40 
world        40 
cannot       39 
forest       39 
bomb         39 
dtype: int64

## Intersectamos palabras comunes de ambos target

In [9]:
#separo por grupos
df_real = df_train[df_train['target']==1]
df_falso = df_train[df_train['target']==0]

palabrasreal = ' '.join(df_real['text'])
palabrasreal = palabrasreal.split()

palabrasfalso = ' '.join(df_falso['text'])
palabrasfalso = palabrasfalso.split()

real35 = pd.Series(palabrasreal).value_counts()[:35]
falso35 = pd.Series(palabrasfalso).value_counts()[:35]

real35 = real35.keys().tolist()
falso35 = falso35.keys().tolist()

elementos_comunes_tipicos = list(set(real35).intersection(set(falso35)))
elementos_comunes_tipicos


['us',
 'like',
 'emergency',
 'get',
 '2',
 'via',
 'one',
 'fire',
 'people',
 '-',
 'video',
 'amp']

## Eliminamos estas palabras de ambos sets

In [10]:
# elimino palabras comunes
def palabras_comunes(text):
    text = " ".join([w for w in text.split() if w not in elementos_comunes_tipicos])
    return text
    
df_train['text']=df_train['text'].apply(lambda x: palabras_comunes(x))
df_test['text']=df_test['text'].apply(lambda x: palabras_comunes(x))


In [11]:
palabrastrain = ' '.join(df_train['text'])
palabrastrain = palabrastrain.split()

palabrastest = ' '.join(df_test['text'])
palabrastest = palabrastest.split()

frecuenciastrain = pd.Series(palabrastrain).value_counts()
frecuenciastest = pd.Series(palabrastest).value_counts()

display(frecuenciastrain.head(50))
display(frecuenciastest.head(50))

new           226
news          194
would         175
disaster      153
police        141
still         129
body          125
burning       120
back          119
crash         119
california    117
storm         117
suicide       116
time          113
got           112
know          112
buildings     110
man           110
day           109
first         108
see           105
bomb          104
going         103
world         103
nuclear       102
love          101
fires         99 
cannot        99 
attack        99 
youtube       98 
3             98 
two           97 
dead          96 
go            96 
killed        96 
train         93 
full          92 
war           91 
car           90 
good          89 
may           88 
could         88 
families      88 
today         87 
hiroshima     87 
accident      87 
life          87 
think         86 
say           86 
let           86 
dtype: int64

new           102
would         94 
news          89 
attack        63 
disaster      61 
first         60 
suicide       57 
police        56 
fires         52 
3             51 
still         50 
watch         50 
time          49 
storm         49 
hiroshima     49 
full          49 
crash         48 
got           48 
burning       48 
two           47 
going         46 
today         45 
body          44 
love          44 
see           44 
know          43 
last          43 
go            43 
make          43 
think         42 
life          41 
nuclear       40 
world         40 
1             40 
buildings     40 
could         39 
back          39 
need          39 
even          39 
dead          39 
forest        39 
cannot        39 
bomb          39 
right         38 
youtube       38 
bombing       38 
california    37 
car           37 
day           36 
wild          35 
dtype: int64

In [64]:
##ver palabras raras###

In [79]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV


In [103]:
# entrenamiento simple recuento

count_vectorizer = CountVectorizer(min_df=2 ,max_df=0.9, ngram_range=(1, 2))
train_vectors = count_vectorizer.fit_transform(df_train['text'])
test_vectors = count_vectorizer.transform(df_test['text'])

In [104]:
# Fitting a simple Naive Bayes on Counts
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB, train_vectors, df_train["target"], cv=5, scoring="f1")
scores

array([0.59000943, 0.57928803, 0.63225806, 0.61417323, 0.74475777])

In [87]:
# modelo Logistic Regression recuento
clf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf, train_vectors, df_train['target'], cv=5, scoring="f1")
scores

#v2 array([0.59827421, 0.48601399, 0.55480607, 0.49665712, 0.68870968]) mejor
# con limpieza 2 array([0.58981748, 0.50902837, 0.55148342, 0.51643192, 0.68803753])
# sin limpieza array([0.60114504, 0.48378615, 0.5655058 , 0.51943128, 0.6963434 ])

# probar ngram(1,1)
#array([0.60338346, 0.49913644, 0.55980066, 0.51180359, 0.68970935])

array([0.60338346, 0.49913644, 0.55980066, 0.51180359, 0.68970935])

In [90]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))
tfidf_train = tfidf.fit_transform(df_train['text'])

In [91]:
# entreno con Logistic Regresion y TF-IDF
clf_tfidf = LogisticRegression(C=1.0)
scores = model_selection.cross_val_score(clf_tfidf, tfidf_train, df_train['target'], cv=5, scoring="f1")
scores

# con limpieza 2 array([0.54845361, 0.463138  , 0.5093046 , 0.47167868, 0.66079295])
# sin limpieza array([0.56880734, 0.48210923, 0.5369863 , 0.45168067, 0.66311111])
# ngramas 1,1 array([0.58541459, 0.50686185, 0.5419708 , 0.50901804, 0.69296741])

array([0.58541459, 0.50686185, 0.5419708 , 0.50901804, 0.69296741])

In [92]:
# Fitting a simple Naive Bayes on Counts
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB, train_vectors, df_train["target"], cv=5, scoring="f1")
scores

# probar array([0.61818182, 0.60692771, 0.67601476, 0.63355049, 0.7303989 ])

array([0.61818182, 0.60692771, 0.67601476, 0.63355049, 0.7303989 ])

In [93]:
# Fitting a simple Naive Bayes on TFIDF
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_TFIDF, tfidf_train, df_train["target"], cv=5, scoring="f1")
scores

# sin limpieza array([0.56573705, 0.57193606, 0.59090909, 0.582397  , 0.73968254])
# ngramas 1,1 array([0.60528715, 0.61387755, 0.64395783, 0.61028771, 0.75353686])


array([0.60528715, 0.61387755, 0.64395783, 0.61028771, 0.75353686])

In [7]:

#v1 array([0.57471264, 0.47579299, 0.57738573, 0.54561879, 0.68725869])
#v2 array([0.59827421, 0.48601399, 0.55480607, 0.49665712, 0.68870968])

#array([0.62846227, 0.53043478, 0.61359867, 0.56859206, 0.71221865])
#array([0.60731949, 0.48692516, 0.57931034, 0.522158  , 0.6980198 ])

array([0.5986654 , 0.47776809, 0.56213511, 0.50332384, 0.69446672])

## Modelo definitivo de esta version

In [98]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 1))

tfidf_train = tfidf.fit_transform(df_train['text'])
tfidf_test = tfidf.transform(df_test['text'])

In [99]:
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_TFIDF, tfidf_train, df_train["target"], cv=5, scoring="f1")
scores

array([0.61818182, 0.60692771, 0.67601476, 0.63355049, 0.7303989 ])

array([0.60528715, 0.61387755, 0.64395783, 0.61028771, 0.75353686])

In [100]:
# entreno logistic regresion con tfidf con ngramas (1,1)

clf_NB_TFIDF.fit(tfidf_train, df_train["target"])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [101]:
# Submission
sample_submission = pd.read_csv("dataset/sample_submission.csv")
sample_submission['target'] = clf_NB_TFIDF.predict(tfidf_test)
sample_submission.to_csv("submission8.csv", index=False)

In [58]:
## lo que falto 

In [25]:
import spacy
import gensim
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [27]:
import gc
del wv
gc.collect()

0

In [28]:
from tqdm import tqdm

In [29]:
def crear_corpus(df):
    corpus=[]
    for tweet in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(tweet) if((word.isalpha()==1) & (word not in stop_words))]
        corpus.append(words)
    return corpus

In [31]:
corpus=crear_corpus(df_train)

100%|██████████| 7613/7613 [00:02<00:00, 2682.52it/s]


In [32]:
corpus

[['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive'],
 ['forest', 'near', 'la', 'ronge', 'sask', 'canada'],
 ['residents',
  'asked',
  'shelter',
  'place',
  'notified',
  'officers',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['receive', 'wildfires', 'evacuation', 'orders', 'california'],
 ['got',
  'sent',
  'photo',
  'ruby',
  'alaska',
  'smoke',
  'wildfires',
  'pours',
  'school'],
 ['rockyfire',
  'update',
  'california',
  'hwy',
  'closed',
  'directions',
  'due',
  'lake',
  'county',
  'cafire',
  'wildfires'],
 ['flood',
  'disaster',
  'heavy',
  'rain',
  'causes',
  'flash',
  'flooding',
  'streets',
  'manitou',
  'colorado',
  'springs',
  'areas'],
 ['top', 'hill', 'see', 'woods'],
 ['evacuation', 'happening', 'building', 'across', 'street'],
 ['afraid', 'tornado', 'coming', 'area'],
 ['three', 'died', 'heat', 'wave', 'far'],
 ['haha',
  'south',
  'tampa',
  'getting',
  'flooded',
  'wait',
  'second',
  'live',
  'south',
  

In [None]:
for i, word in enumerate(wv.vocab):
    if i == 10:
        break
    print(word)

In [None]:
## otra cosa

In [7]:
#separo por grupos
dfreal = df[df['target']==1]
dffalso = df[df['target']==0]

dfreal['words']

0       [deeds, reason, earthquake, may, allah, forgive, us]                                                                            
1       [forest, fire, near, la, ronge, sask, canada]                                                                                   
2       [residents, asked, shelter, place, notified, officers, evacuation, shelter, place, orders, expected]                            
3       [people, receive, wildfires, evacuation, orders, california]                                                                    
4       [got, sent, photo, ruby, alaska, smoke, wildfires, pours, school]                                                               
                                      ...                                                                                               
7608    [two, giant, cranes, holding, bridge, collapse, nearby, homes]                                                                  
7609    [ariaahrary, thetawniest, control

In [22]:
lista = df['words'].tolist()
lista = sum(lista, [])
lista

['deeds',
 'reason',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'us',
 'forest',
 'fire',
 'near',
 'la',
 'ronge',
 'sask',
 'canada',
 'residents',
 'asked',
 'shelter',
 'place',
 'notified',
 'officers',
 'evacuation',
 'shelter',
 'place',
 'orders',
 'expected',
 'people',
 'receive',
 'wildfires',
 'evacuation',
 'orders',
 'california',
 'got',
 'sent',
 'photo',
 'ruby',
 'alaska',
 'smoke',
 'wildfires',
 'pours',
 'school',
 'rockyfire',
 'update',
 'california',
 'hwy',
 'closed',
 'directions',
 'due',
 'lake',
 'county',
 'fire',
 'cafire',
 'wildfires',
 'flood',
 'disaster',
 'heavy',
 'rain',
 'causes',
 'flash',
 'flooding',
 'streets',
 'manitou',
 'colorado',
 'springs',
 'areas',
 'top',
 'hill',
 'see',
 'fire',
 'woods',
 'theres',
 'emergency',
 'evacuation',
 'happening',
 'building',
 'across',
 'street',
 'afraid',
 'tornado',
 'coming',
 'area',
 'three',
 'people',
 'died',
 'heat',
 'wave',
 'far',
 'haha',
 'south',
 'tampa',
 'getting',
 'flooded',
 '

In [24]:
listaset = set(lista)
listaset

{'bore',
 'listeria',
 'cafe',
 'platt',
 'ampor',
 'dublin',
 'dismisses',
 'saintrobinho86',
 'gum',
 'sneaks',
 'paranormal',
 'declared',
 'ruins',
 'shipwreck',
 'hopped',
 'preconditioning',
 'fuckfacewineisdumbcom',
 'prepared',
 'cityamp3others',
 'gregory',
 'atomicbomb',
 'contries',
 'ep016',
 'sweater',
 'aerospace',
 'walker',
 'biterelated',
 'cue',
 'bluedio',
 'thomassmonson',
 'wbcshirl2',
 'beth',
 'tonight',
 'pilloried',
 'metastatic',
 'sterlingknight',
 'ethical',
 'tedcruz2016',
 'withstand',
 'excessive',
 'blitzes',
 'butt',
 'bbcnews',
 '\x89û÷british',
 'avengers',
 'foodstand',
 'dye',
 'aesop',
 'humidity',
 'fx',
 'humboldt',
 'reason',
 '11st',
 'content',
 'throwback',
 'mean',
 'promoted',
 'firearm',
 'scandals',
 'christinalavv',
 'sundays',
 'bedroom',
 'obsolete',
 'transgender',
 'recently',
 'to\x89û',
 'bundled',
 'peale',
 'driveby',
 'postering',
 'bloggers',
 'motivator',
 'puppyshogun',
 'mental',
 'hug',
 'diarrhea',
 'homie',
 'pci',
 'grow

In [59]:
def creo_list_palabras(target):
    lista_palabras=[]
    
    for x in df[df['target']==target]['words']:
        for i in x:
            lista_palabras.append(i)
    return lista_palabras

# Dada una lista de palabras, devuelve un diccionario de
# pares de palabra-frecuencia.

def listaPalabrasDicFrec(listaPalabras):
    frecuenciaPalab = [listaPalabras.count(p) for p in listaPalabras]
    return dict(list(zip(listaPalabras,frecuenciaPalab)))

# Ordena un diccionario de pares palabra-frecuencia en
# orden de frecuencia descendente.

def ordenaDicFrec(dicfrec):
    aux = [(dicfrec[key], key) for key in dicfrec]
    aux.sort()
    aux.reverse()
    return aux

lista_palabras_reales = creo_list_palabras(0)
dic_frec_reales = ordenaDicFrec(listaPalabrasDicFrec(lista_palabras_reales))
dic_frec_reales

#lista_palabras_reales = creo_list_palabras(1)


[(253, 'like'),
 (168, 'new'),
 (112, 'body'),
 (99, 'via'),
 (91, 'people'),
 (90, 'video'),
 (89, 'love'),
 (85, 'know'),
 (84, 'back'),
 (83, 'us'),
 (83, 'time'),
 (83, 'got'),
 (82, 'see'),
 (81, 'full'),
 (81, 'emergency'),
 (81, 'cant'),
 (78, 'day'),
 (76, 'youtube'),
 (75, 'going'),
 (72, 'still'),
 (72, 'fire'),
 (67, 'want'),
 (67, 'good'),
 (65, 'think'),
 (64, '3'),
 (62, 'world'),
 (61, 'man'),
 (61, 'lol'),
 (60, 'rt'),
 (60, 'life'),
 (59, 'u'),
 (58, 'youre'),
 (58, 'first'),
 (56, 'news'),
 (56, 'last'),
 (55, 'way'),
 (55, 'really'),
 (55, 'need'),
 (55, 'burning'),
 (54, 'work'),
 (54, 'make'),
 (53, 'best'),
 (52, 'let'),
 (51, 'much'),
 (51, 'many'),
 (51, 'even'),
 (50, 'take'),
 (48, 'say'),
 (48, 'help'),
 (47, 'wreck'),
 (47, 'great'),
 (46, 'right'),
 (46, 'feel'),
 (46, 'content'),
 (45, 'hot'),
 (44, 'please'),
 (44, 'never'),
 (44, 'look'),
 (44, 'god'),
 (44, 'fear'),
 (44, 'every'),
 (43, 'could'),
 (43, '5'),
 (42, 'read'),
 (42, 'cross'),
 (42, 'bags')

In [14]:
#corrector de palabras
spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

df['text']=df['text'].apply(lambda x : correct_spellings(x))


'eunice joke iii she needs to chill and answer calmly its not like she s being attacked'