### Imports

In [0]:
from __future__ import print_function, division
import itertools
import os
from IPython.core.debugger import set_trace

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# from keras.preprocessing import text, sequence

## Define text preprocessing functions

In [5]:
# Applying a first round of text cleaning techniques
import re, string
from bs4 import BeautifulSoup
tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer = nltk.stem.WordNetLemmatizer() 

def clean_text(text):
    text = BeautifulSoup(text, 'lxml').get_text()
    eyes = "[8:=;]"
    nose = "['`\-]?"
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*"," ", text)    
    
    text = re.sub("/"," / ", text)
    text = re.sub('@(\w+)', '', text)
    
    text = re.sub('#{eyes}#{nose}[)d]+|[)d]+#{nose}#{eyes}', "<smile>", text)
    text = re.sub('#{eyes}#{nose}p+', "<lolface>", text)
    text = re.sub('#{eyes}#{nose}\(+|\)+#{nose}#{eyes}', "<sadface>", text)
    text = re.sub('#{eyes}#{nose}[\/|l*]', "<neutralface>", text)
    text = re.sub('<3',"<heart>", text)
    # numbers
    text = re.sub('[-+]?[.\d]*[\d]+[:,.\d]*', " ", text)
    
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    #text = re.sub('\[.*?\]', '', text)
    #text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('<', '').replace('>', '')), ' ', text)
    text = re.sub('\n', ' ', text)
    
    #text = re.sub(r"[^a-zA-Z]", ' ', text)
    text = ''.join(filter(lambda x: x in string.printable, text))
    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    
    #text = re.sub('\w*\d\w*', '', text)    
    
    return text

import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def text_preprocessing(text):
  
    nopunc = clean_text(text)
    
    tokenized_text = tokenizer.tokenize(nopunc)
    
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    
    lemmatized = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v'] else lemmatizer.lemmatize(i) for i,j in pos_tag(remove_stopwords)]
    
    combined_text = ' '.join(lemmatized)
    return combined_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Read data or preprocessed data if it exists

In [7]:
train = pd.read_csv('drive/My Drive/NLP_data/train.csv')
print('Training data shape: ', train.shape)
test = pd.read_csv('drive/My Drive/NLP_data/test.csv')
print('Testing data shape: ', test.shape)

train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
test['text'] = test['text'].apply(lambda x: text_preprocessing(x))

train.drop(["keyword", "location"], axis = 1, inplace=True)
test.drop(["keyword", "location"], axis = 1, inplace=True)

train.to_csv('preprocessed_train.csv')
test.to_csv('preprocessed_test.csv')

Training data shape:  (7613, 5)
Testing data shape:  (3263, 4)


### Tokenize

In [0]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train['text'])
test_tfidf = tfidf.transform(test["text"])

In [9]:
train_tfidf

<7613x10449 sparse matrix of type '<class 'numpy.float64'>'
	with 76809 stored elements in Compressed Sparse Row format>

### Define Models

In [0]:
from sklearn import metrics

### Naive Bayes

In [14]:
# Fitting a simple Naive Bayes on TFIDF
clf_NB_TFIDF = MultinomialNB()
scores = model_selection.cross_val_score(clf_NB_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.57473481, 0.59176672, 0.62847515, 0.61439114, 0.74363057])

In [15]:
clf_NB_TFIDF.fit(train_tfidf, train["target"])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Tune NB

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [0]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB()),
 ])

In [51]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3)}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(train["text"], train["target"])
gs_clf.best_score_, gs_clf.best_params_

(0.6888253956202012,
 {'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)})

In [58]:
text_clf.set_params(clf__alpha=0.01, tfidf__use_idf=False, vect__ngram_range=(1, 1))
scores = model_selection.cross_val_score(text_clf, train['text'], train["target"], cv=5, scoring="f1")
scores

array([0.63237774, 0.61659514, 0.64705882, 0.62538226, 0.71080139])

In [59]:
text_clf.fit(train['text'], train["target"])

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=False)),
                ('clf',
                 MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))],
         verbose=False)

### XGBoost

In [13]:
import xgboost as xgb
clf_xgb_TFIDF = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
scores = model_selection.cross_val_score(clf_xgb_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.507431  , 0.39924314, 0.47359736, 0.38115632, 0.59536542])

In [0]:
clf_xgb_TFIDF.fit(train_tfidf, train["target"])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=10, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)

In [0]:
from sklearn.ensemble import GradientBoostingClassifier
clf_gb_TFIDF = GradientBoostingClassifier(n_estimators=100)
scores = model_selection.cross_val_score(clf_gb_TFIDF, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.44540541, 0.34349593, 0.43854996, 0.32488479, 0.51889683])

In [0]:
clf_gb_TFIDF.fit(train_tfidf, train["target"])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

Logistic Regression CV

In [42]:
from sklearn.linear_model import LogisticRegressionCV 
logregcv = LogisticRegressionCV(cv=5, max_iter=1000, random_state=42, class_weight='balanced').fit(train_tfidf, train["target"])
scores = model_selection.cross_val_score(logregcv, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.61896243, 0.59116466, 0.63057325, 0.56330275, 0.72684825])

In [23]:
logregcv.fit(train_tfidf, train["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression

In [41]:
from sklearn.linear_model import LogisticRegression 
logreg = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs', class_weight='balanced')
scores = model_selection.cross_val_score(logreg, train_tfidf, train["target"], cv=5, scoring="f1")
scores

array([0.62202643, 0.58578053, 0.62319939, 0.58562555, 0.71492537])

In [38]:
logreg.fit(train_tfidf, train["target"])

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### Predict and Create submission

In [0]:
def submission(submission_file_path,model,test_vectors):
    sample_submission = pd.read_csv(submission_file_path)
    sample_submission["target"] = model.predict(test_vectors)
    sample_submission.to_csv("submission.csv", index=False)

In [0]:
submission_file_path = "drive/My Drive/NLP_data/sample_submission.csv"
submission(submission_file_path,logreg,test_tfidf)