# NLP Tweet Analyzer Kaggle Submission

In [6]:
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors

# NLP Preprocessing
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# NLP Viz
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Data Modelling
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, recall_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost
from xgboost import XGBClassifier
# Silence XGBoost. `config_context` is a context manager and has no effect
# unless entered with `with`; `set_config` changes the global setting.
xgboost.set_config(verbosity=0)

# Monitoring progress
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()  # registers `progress_apply` on pandas objects

# Fetch the NLTK stop-word corpus (no-op when already up to date)
nltk.download('stopwords')

# Notebook-wide settings. Only 'max_features' and 'refresh_embedding' are
# read by the cells visible below; the remaining keys are presumably meant
# for train/validation experiments -- confirm before relying on them.
config = {
    'test_size': 0.2,
    'CV_splits': 5,
    'seed': 14,
    'n_cores': 16,
    'max_features': 10000,       # vocabulary cap for the Count/Tfidf vectorizers
    'refresh_embedding': False   # True: re-download Word2Vec; False: load local copy
}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/miguelcachosoblechero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Generate Embeddings
def average_embedding(tweet, word_embeddings, missing_random=False, dimension=300):
    """Return the mean word vector of a tweet.

    Parameters
    ----------
    tweet : str, iterable of str, or None
        The tweet to embed. A raw string is split on whitespace first;
        iterating a string directly would yield single characters instead
        of words. Any other iterable is taken as an already tokenized
        tweet. ``None`` is treated as an empty tweet.
    word_embeddings : mapping-like
        Object supporting ``word in word_embeddings`` and
        ``word_embeddings[word]``, the latter returning a vector of length
        ``dimension``.
    missing_random : bool, default False
        If True, each out-of-vocabulary token contributes a vector drawn
        with ``np.random.rand``; otherwise it contributes nothing to the sum.
    dimension : int, default 300
        Length of the embedding vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dimension,)``: the sum of the token vectors divided
        by the total number of tokens. Out-of-vocabulary tokens still count
        in the denominator. An empty tweet yields a zero vector.
    """
    # Normalise the input to a concrete list of tokens
    if tweet is None:
        tokens = []
    elif isinstance(tweet, str):
        tokens = tweet.split()
    else:
        tokens = list(tweet)

    num_words = len(tokens)

    # Nothing to average
    if num_words == 0:
        return np.zeros(dimension)

    # Accumulate the vector of every token
    final_emb = np.zeros(dimension)
    for word in tokens:
        if word in word_embeddings:
            final_emb += word_embeddings[word]
        elif missing_random:
            final_emb += np.random.rand(dimension)
        # else: unknown token adds a zero vector, i.e. leaves the sum unchanged

    # Average embeddings
    return final_emb / num_words

In [8]:
# 1- Preprocessing and Feature Engineering
# Load dataset
input_path = "../input"
raw_tweets_train = pd.read_csv(os.path.join(input_path, "train.csv")).drop(['id'], axis=1)
raw_tweets_test = pd.read_csv(os.path.join(input_path, "test.csv"))

# Extract data and labels
X_train = raw_tweets_train.drop(['target'], axis=1)
y_train = raw_tweets_train.target.values
X_test = raw_tweets_test

# Tokenize + Stop Words + BoW (vocabulary is learned on the training set only)
CountVec = CountVectorizer(stop_words='english', max_features=config['max_features'])
X_train_bow = CountVec.fit_transform(X_train.text)
X_test_bow = CountVec.transform(X_test.text)

# Same pipeline with TF-IDF weighting instead of raw counts
CountVecTFIDF = TfidfVectorizer(stop_words='english', max_features=config['max_features'])
X_train_tfidf = CountVecTFIDF.fit_transform(X_train.text)
X_test_tfidf = CountVecTFIDF.transform(X_test.text)

# Download or load from local
word2vec_path = '../Word2Vec/word2vec_300.kv'
if config['refresh_embedding']:
    # Download Google's pre-trained Word2Vec model
    word2vec = api.load('word2vec-google-news-300')
    # Save the model for future reuse (create the target folder if needed)
    os.makedirs(os.path.dirname(word2vec_path), exist_ok=True)
    word2vec.save_word2vec_format(word2vec_path)
else:
    word2vec = KeyedVectors.load_word2vec_format(word2vec_path)

# Split every tweet into tokens first: average_embedding looks up each item
# it iterates over, and iterating a raw string yields single characters.
tweet_tokenizer = TweetTokenizer()
X_train_tokens = X_train.text.apply(tweet_tokenizer.tokenize)
X_test_tokens = X_test.text.apply(tweet_tokenizer.tokenize)

# Create embeddings by averaging the word vectors of each tweet
embedding_kwargs = {
    'word_embeddings': word2vec,
    'missing_random': False,
    'dimension': word2vec.vector_size,  # keep in sync with the loaded model
}
X_train_embeddings = pd.DataFrame(X_train_tokens.apply(average_embedding, **embedding_kwargs).tolist())
X_test_embeddings = pd.DataFrame(X_test_tokens.apply(average_embedding, **embedding_kwargs).tolist())

In [4]:
# 2- Data modelling
# Select your model
target_model = MultinomialNB()

# Select your features.
# MultinomialNB raises a ValueError on negative feature values, and averaged
# Word2Vec embeddings contain them, so the model is trained on the
# non-negative bag-of-words counts. To use X_train_embeddings /
# X_test_embeddings instead, pick a model that accepts real-valued input
# (e.g. RandomForestClassifier, SVC or XGBClassifier).
train_features = X_train_bow
test_features = X_test_bow

# Train your model
target_model.fit(train_features, y_train)

# Generate predictions
results = target_model.predict(test_features)

# Store results (create the output folder if it does not exist yet)
submission_path = "../submission/nlp_submission.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
pd.DataFrame({"id": raw_tweets_test.id,
              "target": results}).set_index("id").to_csv(submission_path)