# NLP Tweet Analyzer Kaggle Submission

In [47]:
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors

# NLP Preprocessing
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# NLP Viz
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Data Modelling - ML
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, recall_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
# Silence XGBoost logging for the whole session.
# set_config() changes the global configuration; config_context() only has an
# effect when used as a `with` block, so calling it bare changes nothing.
xgboost.set_config(verbosity=0)

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras.initializers import Constant
from keras.callbacks import EarlyStopping

# Monitoring progress
from tqdm.notebook import tqdm_notebook
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects)
tqdm_notebook.pandas()

# Fetch the NLTK stop-word corpus if it is not cached yet.
# quiet=True keeps the download log (which prints the local nltk_data path)
# out of the saved notebook output.
nltk.download('stopwords', quiet=True)

# Central place for every tunable value used by the cells below.
config = dict(
    # Data splitting
    val_size=0.2,
    CV_splits=5,
    seed=14,
    # Hardware
    n_cores=16,
    # Bag-of-words / TF-IDF vocabulary cap
    max_features=2000,
    # Embeddings and sequence shape for the deep-learning model
    refresh_embedding=False,
    max_tweet_length=100,
    embedding_dim=300,
    # Training schedule
    num_epochs=30,
    patience=3,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miguelcachosoblechero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# 1- Preprocessing and Feature Engineering
# Load dataset
input_path = "../input"
raw_tweets_train = pd.read_csv(os.path.join(input_path, "train.csv")).drop(['id'], axis=1)
raw_tweets_test = pd.read_csv(os.path.join(input_path, "test.csv"))

# Extract data and labels
X_train = raw_tweets_train.drop(['target'], axis=1)
y_train = raw_tweets_train.target.values
# Work on a copy: the token columns added below must not be written into
# raw_tweets_test, which is reused later to read the submission ids.
X_test = raw_tweets_test.copy()

# Use Twitter Tokenizer to tokenize tweets (lower-cased first).
# Series.apply on the text column avoids building a row object per tweet.
tokenizer = TweetTokenizer()
X_train['twitterTokens'] = X_train.text.apply(lambda text: tokenizer.tokenize(text.lower()))
X_test['twitterTokens'] = X_test.text.apply(lambda text: tokenizer.tokenize(text.lower()))

# Remove stop words
english_stopwords = set(stopwords.words('english'))
X_train['twitterTokens_noStop'] = X_train.twitterTokens.apply(
    lambda tokens: [tok for tok in tokens if tok not in english_stopwords])
X_test['twitterTokens_noStop'] = X_test.twitterTokens.apply(
    lambda tokens: [tok for tok in tokens if tok not in english_stopwords])

# Tokenize + Stop Words + BoW
# NOTE: the vectorizers below tokenize the raw `text` column themselves; they
# do not consume the twitterTokens columns created above.
# The vocabulary is fitted on the training tweets only and reused for test.
CountVec = CountVectorizer(stop_words='english', max_features=config['max_features'])
X_train_bow = CountVec.fit_transform(X_train.text)
X_test_bow = CountVec.transform(X_test.text)

# TfidfVectorizer can be used to perform this action in normal text
CountVecTFIDF = TfidfVectorizer(stop_words='english', max_features=config['max_features'])
X_train_tfidf = CountVecTFIDF.fit_transform(X_train.text)
X_test_tfidf = CountVecTFIDF.transform(X_test.text)

In [37]:
# Preprocess data for Deep Learning
# Tokenize using Keras interface (vocabulary is fitted on training text only)
keras_CountVect = Tokenizer()
keras_CountVect.fit_on_texts(X_train.text)
X_train_tokens = keras_CountVect.texts_to_sequences(X_train.text)
X_test_tokens = keras_CountVect.texts_to_sequences(X_test.text)
word_index = keras_CountVect.word_index # <- This is effectively a dictionary with all the required words, used later in the Embeddings
# +1 leaves row 0 of the embedding matrix free for the padding value
num_words = len(word_index)+1

# Pad the sequences to a fixed length
X_all_padded = pad_sequences(X_train_tokens, maxlen=config['max_tweet_length'])
X_test_padded = pad_sequences(X_test_tokens, maxlen=config['max_tweet_length'])

# One-hot encode the labels into a NEW variable. `y_train` is left untouched
# (1-D, in the original row order) so cells that pair it with the TF-IDF /
# BoW matrices keep working and this cell can be re-run safely.
y_all_cat = to_categorical(y_train, num_classes=2)

# Divide between train and validation.
# A single seeded, stratified split guarantees that:
#   * train and validation are disjoint,
#   * every sample stays paired with its own label,
#   * the train part holds (1 - val_size) of the data,
#   * the split is reproducible across kernel restarts.
X_train_padded, X_val_padded, y_train_cat, y_val_cat = train_test_split(
    X_all_padded,
    y_all_cat,
    test_size=config['val_size'],
    random_state=config['seed'],
    stratify=y_train,
)
assert len(X_train_padded) + len(X_val_padded) == len(X_all_padded)
assert len(X_train_padded) == len(y_train_cat) and len(X_val_padded) == len(y_val_cat)

# Now we are ready to load the associated Embeddings
# Download or load from local
if config['refresh_embedding']:
    # Download Google's pre-trained Word2Vec model
    word2vec = api.load('word2vec-google-news-300')
    # Save the model for future reuse
    word2vec.save_word2vec_format('../Word2Vec/word2vec_300.kv')
else:
    word2vec = KeyedVectors.load_word2vec_format('../Word2Vec/word2vec_300.kv')

# The embedding matrix below is allocated with config['embedding_dim'] columns,
# so the loaded vectors must have exactly that size.
assert word2vec.vector_size == config['embedding_dim'], "embedding_dim does not match the loaded vectors"

# Associate each index with its embedding
# Create an embedding matrix as big as the words available
embedding_matrix = np.zeros((num_words, config['embedding_dim']))
# For each word in the dictionary, populate the embedding matrix
for word, idx in word_index.items():
    # Words missing from the pre-trained vocabulary keep their all-zero row
    if word in word2vec:
        embedding_matrix[idx] = word2vec[word]

In [15]:
# 2- Data modelling - ML
# Select your model
# target_model = XGBClassifier(n_estimators=20, max_depth=50, random_state=config['seed'], n_jobs = config['n_cores'])
target_model = MultinomialNB()

# Take the labels straight from the raw training frame: they are 1-D and in
# the same row order as X_train_tfidf. The global `y_train` is not used here
# because the deep-learning preprocessing cell may replace it with shuffled
# one-hot labels.
ml_labels = raw_tweets_train.target.values

# Train your model
target_model.fit(X_train_tfidf, ml_labels)

# Generate predictions
results = target_model.predict(X_test_tfidf)

# Store results (create the output folder on first run)
submission_dir = "../submission"
os.makedirs(submission_dir, exist_ok=True)
pd.DataFrame({"id": raw_tweets_test.id,
              "target": results}).set_index("id").to_csv(os.path.join(submission_dir, "nlp_submission_ml.csv"))

<class 'numpy.ndarray'>


In [48]:
# Seed TensorFlow so weight initialisation, dropout and batch shuffling are
# repeatable between runs.
tf.random.set_seed(config['seed'])

# Define the embeddings to use in this training
# The layer is initialised from the pre-computed embedding_matrix and frozen.
embedding_layer = Embedding(num_words,
                            config['embedding_dim'],
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=config['max_tweet_length'],
                            trainable=False)

# Define trainable model
target_model = Sequential()
target_model.add(embedding_layer)
target_model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# The targets are one-hot vectors over two mutually exclusive classes, so the
# head is a softmax trained with categorical cross-entropy.
target_model.add(Dense(2, activation='softmax'))
target_model.compile(loss='categorical_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy', 'Recall', 'AUC'])

# restore_best_weights=True rolls the model back to the epoch with the best
# val_accuracy; without it, predictions would come from the last epoch, i.e.
# after `patience` epochs without improvement.
early_stopping = EarlyStopping(monitor="val_accuracy",
                               patience=config['patience'],
                               restore_best_weights=True)

target_model.fit(X_train_padded, y_train_cat,
                 batch_size=32,
                 epochs=config['num_epochs'],
                 validation_data=(X_val_padded, y_val_cat),
                 callbacks=[early_stopping])

# Generate predictions (one probability per class for each test tweet)
results = target_model.predict(X_test_padded)

# Store results: the predicted class is the index of the highest probability
submission_dir = "../submission"
os.makedirs(submission_dir, exist_ok=True)
pd.DataFrame({"id": raw_tweets_test.id,
              "target": np.argmax(results, axis=1)}).set_index("id").to_csv(os.path.join(submission_dir, "nlp_submission_dl.csv"))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
