### Imports

In [15]:
from __future__ import print_function, division
import itertools
import os
from IPython.core.debugger import set_trace

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# from keras.preprocessing import text, sequence

## Define text preprocessing functions

In [16]:
# Applying a first round of text cleaning techniques
import re, string
from bs4 import BeautifulSoup

def clean_text(text):
    text = BeautifulSoup(text, 'lxml').get_text()
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*"," ", text)    
    
    text = re.sub("/"," / ", text)
    text = re.sub('@(\w+)', '', text)
    
    text = re.sub('#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}', "<smile>", text)
    text = re.sub('#{eyes}#{nose}p+', "<lolface>", text)
    text = re.sub('#{eyes}#{nose}\(+|\)+#{nose}#{eyes}', "<sadface>", text)
    text = re.sub('#{eyes}#{nose}[\/|l*]', "<neutralface>", text)
    text = re.sub('<3',"<heart>", text)
    # numbers
    text = re.sub('[-+]?[.\d]*[\d]+[:,.\d]*', " ", text)
    
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    #text = re.sub('\[.*?\]', '', text)
    #text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('<', '').replace('>', '')), ' ', text)
    text = re.sub('\n', ' ', text)
    
    #text = re.sub(r"[^a-zA-Z]", ' ', text)
    text = ''.join(filter(lambda x: x in string.printable, text))
    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    
    #text = re.sub('\w*\d\w*', '', text)    
    
    return text

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def text_preprocessing(text):
   
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    
    lemmatizer = nltk.stem.WordNetLemmatizer() 
  
    nopunc = clean_text(text)
    
    tokenized_text = tokenizer.tokenize(nopunc)
    
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(remove_stopwords)]
    
    combined_text = ' '.join(lemmatized)
    return combined_text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marci\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Read data or preprocessed data if it exists

In [17]:
'''
try:
    train = pd.read_csv('../input/preprocessed_train.csv')
    print('Preprocessed training data shape: ', train.shape)
    test = pd.read_csv('../input/preprocessed_test.csv')
    print('Preprocessed testing data shape: ', test.shape)
    
except:
    train = pd.read_csv('../input/train.csv')
    print('Training data shape: ', train.shape)
    test = pd.read_csv('../input/test.csv')
    print('Testing data shape: ', test.shape)
    
    train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
    test['text'] = test['text'].apply(lambda x: text_preprocessing(x))
    train.to_csv('../input/preprocessed_train.csv')
    test.to_csv('../input/preprocessed_test.csv')

train.drop(["keyword", "location"], axis = 1, inplace=True)
test.drop(["keyword", "location"], axis = 1, inplace=True)
'''

'\ntry:\n    train = pd.read_csv(\'../input/preprocessed_train.csv\')\n    print(\'Preprocessed training data shape: \', train.shape)\n    test = pd.read_csv(\'../input/preprocessed_test.csv\')\n    print(\'Preprocessed testing data shape: \', test.shape)\n    \nexcept:\n    train = pd.read_csv(\'../input/train.csv\')\n    print(\'Training data shape: \', train.shape)\n    test = pd.read_csv(\'../input/test.csv\')\n    print(\'Testing data shape: \', test.shape)\n    \n    train[\'text\'] = train[\'text\'].apply(lambda x: text_preprocessing(x))\n    test[\'text\'] = test[\'text\'].apply(lambda x: text_preprocessing(x))\n    train.to_csv(\'../input/preprocessed_train.csv\')\n    test.to_csv(\'../input/preprocessed_test.csv\')\n\ntrain.drop(["keyword", "location"], axis = 1, inplace=True)\ntest.drop(["keyword", "location"], axis = 1, inplace=True)\n'

In [18]:
train = pd.read_csv('../input/train.csv')
print('Training data shape: ', train.shape)
test = pd.read_csv('../input/test.csv')
print('Testing data shape: ', test.shape)

train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
test['text'] = test['text'].apply(lambda x: text_preprocessing(x))

train.drop(["keyword", "location"], axis = 1, inplace=True)
test.drop(["keyword", "location"], axis = 1, inplace=True)

train.to_csv('../input/preprocessed_train.csv')
test.to_csv('../input/preprocessed_test.csv')

Training data shape:  (7613, 5)
Testing data shape:  (3263, 4)


### Tokenize

In [19]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])

### Define Models

In [20]:
from sklearn import metrics

### Naive Bayes

In [21]:
# Fitting a simple Naive Bayes on TFIDF
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.57666345, 0.59195894, 0.62900506, 0.61439114, 0.74363057])

In [22]:
clf_NB_TFIDF.fit(train_tfidf, train["target"])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### XGBoost

In [23]:
import xgboost as xgb
clf_xgb_TFIDF = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
scores = model_selection.cross_val_score(clf_xgb_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.49414271, 0.42026266, 0.45822994, 0.38115632, 0.59536542])

In [24]:
clf_xgb_TFIDF.fit(train_tfidf, train["target"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=10, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
clf_gb_TFIDF = GradientBoostingClassifier(n_estimators=100)
scores = model_selection.cross_val_score(clf_gb_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores
#print(metrics.classification_report(y_test, predicted))

array([0.44540541, 0.34349593, 0.43854996, 0.32488479, 0.51889683])

In [26]:
clf_gb_TFIDF.fit(train_tfidf, train["target"])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### Predict and Create submission

In [27]:
def submission(submission_file_path,model,test_vectors):
    sample_submission = pd.read_csv(submission_file_path)
    sample_submission["target"] = model.predict(test_vectors)
    sample_submission.to_csv("submission.csv", index=False)

In [28]:
submission_file_path = "../input/sample_submission.csv"
#submission(submission_file_path,clf_NB_TFIDF,test_tfidf)
submission(submission_file_path,clf_gb_TFIDF,test_tfidf)

In [29]:
def prec_rec_F1(labels, preds):
    # true positives
    tp = 0
    # false negatives
    fn = 0
    for label, pred in zip(labels, preds):
        if label == 1:
            if pred == 1:
                tp += 1
            else:
                fn += 1
                
    pospreds = sum(preds)
    precision = tp / pospreds
    recall = tp / (fn + tp)
    try:
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return (precision, recall, 0.0)
    return (precision, recall, f1)