In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/sample_submission.csv


In [2]:
import numpy as np
import pandas as pd
import string  


import nltk
import re

import scipy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


from sklearn import metrics

In [3]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [4]:
df = train
df['text'] = df['text'].str.lower()   # pasamos a lowercase
df = df.drop(['id', 'location'], axis=1)

In [5]:
# generamos una columna que indica la cantidad de links a  enlaces externos
df['link'] = df['text'].apply(lambda x: x.count('http'))

# generamos una columna que indica la cantidad de referencias a otras cuentas de twitter
df['contact'] = df['text'].apply(lambda x: x.count('@'))

# generamos una columna que indica la cantidad de hashtags
df['hashtag'] = df['text'].apply(lambda x: x.count('#'))

# generamos una columna que indica la cantidad de digitos
df['numerics'] = df['text'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

# calculamos la longitud del tweet andes de limpiar
df['length'] = df['text'].str.len()

In [6]:
# calculamos la cantidad de palabras antes de limpiar
def count_words(text):
    '''
    Funcion que toma un texto y devuelve la cantidad de palabras
    '''
    word_counts = len(text.split(' '))
    return word_counts

df['words'] = df['text'].apply(count_words)

In [7]:
# limpiamos el texto eliminando urls, cuentas, hashtags y numeros

def clean_str(string):
    string = re.sub(r'https?\://\S+', '', string)
    string = re.sub(r'http?\://\S+', '', string)
    string = re.sub(r'@\w*\s', '', string)
    string = re.sub(r'#\w*\s', '', string)
    string = re.sub(r'\d', '', string)
    return string

df['text_clean'] = df['text'].apply(lambda x: clean_str(str(x)))

In [8]:
# eliminamos stopwords

stop = stopwords.words('english')

df['text_clean'] = df['text_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [10]:
# eliminamos signos de puntuacion y caracteres especiales
df['text_clean'] = df['text_clean'].str.replace('[^\w\s]','')

In [11]:
df['word_tokenize'] = df['text_clean'].apply(lambda x: word_tokenize(x))

In [12]:
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i) for i in text]
    return lem_text

In [13]:
# unificamos palabras que poseen la misma raiz aplicando la funcion word_lematizer
df['word_lemmatizer'] = df['word_tokenize'].apply(lambda x: word_lemmatizer(x))

In [14]:
# unificamos la lista de tokens para poder analizar el texto limpio
df['text_clean'] = df['word_lemmatizer'].str.join(' ')

In [15]:
# calculamos nuevamente la longuitud, pero ahora del texto limpio                       
df['length-clean'] = df['text_clean'].str.len()

In [16]:
df['words_clean'] = df['text_clean'].apply(count_words)

In [17]:
df['keyword'] = df.keyword.str.replace('%20', '_')
df['keyword'] = df.keyword.str.lower()

In [18]:
# aplicamos TF-IDF seteando un maximo de 1500 palabras
tfidf = TfidfVectorizer(max_features=1500, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))


train_vect = tfidf.fit_transform(df['text_clean'])                                             

# lo pasamos a dataframe
df_tf_idf = pd.DataFrame(data = train_vect.todense(), columns = tfidf.get_feature_names())

In [19]:
# como existe la columna target en el analisis tf-idf, la modifico para hacer el concat con 'target' y que no se duplique
df_tf_idf['targ'] = df_tf_idf['target']
df_tf_idf = df_tf_idf.drop(['target'], axis=1)

In [20]:
# eliminamos col que contienen texto y conservamos unicamente las numericas
df_num = df.drop([ 'keyword', 'text', 'text_clean', 'word_tokenize', 'word_lemmatizer'], axis=1)

In [21]:
df_train = pd.concat([df_num, df_tf_idf], axis=1)  # revisar porque aparecen dos columnas llamadas target

In [22]:
# separamos el target del resto de los features

y = df_train.target    
X = df_train.drop('target', axis=1)

In [23]:
# hacemos division entre train y test para cross validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)      

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5329, 1508), (2284, 1508), (5329,), (2284,))

In [25]:
classifier = neighbors.KNeighborsClassifier()

In [26]:
classifier.fit(X_train,y_train)

KNeighborsClassifier()

In [27]:
classifier.score(X_train, y_train), classifier.score(X_test,y_test)

(0.7470444736348283, 0.6401050788091068)