In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import naive_bayes

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [71]:
targets = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [72]:
df = train
df['text'] = df['text'].str.lower()   # pasamos a lowercase
df = df.drop(['id', 'location'], axis=1)

In [74]:
df.sample(8)

Unnamed: 0,keyword,text,target
5959,screaming,harshness follows us a\nbetter day\nby sarah c\nracing thoughts with screaming sirens\npacing back and forth for... http://t.co/prontouo91,0
290,apocalypse,dad bought a dvd that looks like a science doc on the front but i read the back and it's actually about the impending biblical apocalypse,1
6533,survived,@thedailyshow mahalo nui loa for making my 20s. my generation could not have survived the (w.) bush years without you. #jonvoyage #holomua,0
2155,deaths,fco minister @tobias_ellwood condemns attack at a mosque in saudi arabia that has resulted in at least 15 deaths http://t.co/c3w95h0ozz,1
4560,injured,@welles_7 he was injured. he is a pro bowl back.,0
3899,flattened,@kainyusanagi @grummz @pixelcanuck and flattened raynor. raynor was a balding imperfect biker marine not a emo generic western hero.,0
5408,panicking,@beauscoven nah man he's panicking. he just found out his brothers had it off with his now wife debbie is in hospital he's stressed,0
2112,death,i had no issues uploading death to smoochy or awakenings clips to @youtube but for some reason bicentennial man is being a pain in the ass.,0


In [75]:
# generamos una columna que indica la cantidad de links a  enlaces externos
df['link'] = df['text'].apply(lambda x: x.count('http'))

# generamos una columna que indica la cantidad de referencias a otras cuentas de twitter
df['contact'] = df['text'].apply(lambda x: x.count('@'))

# generamos una columna que indica la cantidad de hashtags
df['hashtag'] = df['text'].apply(lambda x: x.count('#'))

# generamos una columna que indica la cantidad de digitos
df['numerics'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

# calculamos la longitud del tweet andes de limpiar
df['length'] = df['text'].str.len()

In [76]:
# calculamos la cantidad de palabras antes de limpiar
def count_words(text):
    '''
    Funcion que toma un texto y devuelve la cantidad de palabras
    '''
    word_counts = len(text.split(' '))
    return word_counts

df['words'] = df['text'].apply(count_words)

In [77]:
# limpiamos el texto eliminando urls, cuentas, hashtags y numeros

def clean_str(string):
    string = re.sub(r'https?\://\S+', '', string)
    string = re.sub(r'http?\://\S+', '', string)
    string = re.sub(r'@\w*\s', '', string)
    string = re.sub(r'#\w*\s', '', string)
    string = re.sub(r'\d', '', string)
    return string

df['text_clean'] = df['text'].apply(lambda x: clean_str(str(x)))

In [78]:
# eliminamos stopwords

stop = stopwords.words('english')

df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [79]:
# eliminamos signos de puntuacion y caracteres especiales
df['text_clean'] = df['text_clean'].str.replace('[^\w\s]','')

In [80]:
df['word_tokenize'] = df['text_clean'].apply(lambda x: word_tokenize(x))

In [81]:
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text

In [82]:
# unificamos palabras que poseen la misma raiz aplicando la funcion word_lematizer
df['word_lemmatizer'] = df['word_tokenize'].apply(lambda x: word_lemmatizer(x))

In [83]:
# unificamos la lista de tokens para poder analizar el texto limpio
df['text_clean'] = df['word_lemmatizer'].str.join(' ')

In [84]:
# calculamos nuevamente la longuitud, pero ahora del texto limpio                       
df['length-clean'] = df['text_clean'].str.len()

In [85]:
df['words_clean'] = df['text_clean'].apply(count_words)

In [86]:
pd.set_option('display.max_colwidth', 150)

In [87]:
df['keyword'] = df.keyword.str.replace('%20', '_')
df['keyword'] = df.keyword.str.lower()

In [88]:
# aplicamos TF-IDF seteando un maximo de 1500 palabras
tfidf = TfidfVectorizer(max_features=1500, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))


train_vect = tfidf.fit_transform(df['text_clean'])                                             

# lo pasamos a dataframe
df_tf_idf = pd.DataFrame(data = train_vect.todense(), columns = tfidf.get_feature_names())

In [89]:
# como existe la columna target en el analisis tf-idf, la modifico para hacer el concat con 'target' y que no se duplique
df_tf_idf['targ'] = df_tf_idf['target']
df_tf_idf = df_tf_idf.drop(['target'], axis=1)

In [90]:
df['target'].value_counts()  # chequeamos si los labels estan balanceados

0    4342
1    3271
Name: target, dtype: int64

In [91]:
# eliminamos col que contienen texto y conservamos unicamente las numericas
df_num = df.drop([ 'keyword', 'text', 'text_clean', 'word_tokenize', 'word_lemmatizer'], axis=1)

In [92]:
df_train = pd.concat([df_num, df_tf_idf], axis=1)  # revisar porque aparecen dos columnas llamadas target

In [93]:
# separamos el target del resto de los features

y = df_train.target    
X = df_train.drop('target', axis=1)

In [99]:
# hacemos division entre train y test para cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)      

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5329, 1508), (2284, 1508), (5329,), (2284,))

In [102]:
# estandarizamos las features
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)

**Naive Bayes**

In [112]:
nb = naive_bayes.GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [113]:
nb_validation_predictions = nb.predict(X_test)
nb_training_predictions = nb.predict(X_train)

In [115]:
nb.score(X_train, y_train), nb.score(X_test,y_test)

(0.81478701444924, 0.7631348511383538)

**SVM**

In [116]:
from sklearn import svm

In [None]:
svm_ = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_.fit(X_train, y_train)

In [None]:
svm_.score(X_train, y_train), svm_.score(X_test,y_test)