In [1]:
import os

import nltk
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, KFold

## Load data

In [2]:
# Directory holding the competition CSV files; every path below is built from it.
DATA_DIR = '../data/nlp-getting-started'

# Read the labelled training set, the unlabelled test set and the
# sample submission file in one go.
train, test, sample_submission = (
    pd.read_csv(f'{DATA_DIR}/train.csv'),
    pd.read_csv(f'{DATA_DIR}/test.csv'),
    pd.read_csv(f'{DATA_DIR}/sample_submission.csv'),
)

In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
# Preview the first rows of the sample submission.
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


## Text processing

In [6]:
# Fetch the NLTK resources used by `process`: the 'punkt' tokenizer data
# and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# English stop words that `process` removes from every text.
stop_words = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package punkt to /home/mykyta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mykyta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def process(text):
    """Normalise a raw text into a space-separated string of kept tokens.

    The text is lower-cased and tokenized with ``nltk.tokenize.word_tokenize``.
    A token is kept only if ``str.isalnum()`` is true for it and it is not in
    the module-level ``stop_words``.

    Args:
        text: The raw string to clean.

    Returns:
        The kept tokens joined by single spaces (empty string if none remain).
    """
    words = nltk.tokenize.word_tokenize(text.lower())
    kept = [
        word for word in words
        if word.isalnum() and word not in stop_words
    ]
    return " ".join(kept)

In [8]:
# Add the cleaned version of every text as a `processed_text` column
# to both the training and the test frame.
for frame in (train, test):
    frame['processed_text'] = frame['text'].apply(process)

## Modeling

In [9]:
# Hold-out split for the baseline model. The seed is fixed (same value the
# adversarial-validation split uses) so the split, and any score computed
# from it, is reproducible after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    train['processed_text'],
    train['target'],
    random_state=76,
)

In [10]:
# Baseline model: TF-IDF features fed into a logistic regression,
# both with their default settings.
classifier = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
])

# Fit on the training part of the hold-out split.
classifier.fit(X_train, y_train)

In [11]:
# Predict labels for the hold-out texts with the fitted baseline pipeline.
pred = classifier.predict(X_test)
# F1 on the hold-out split, using f1_score's default averaging.
f1_score(y_test, pred)

0.7479674796747967

## Checking LB consistency
Evaluate several models with cross-validation to see how well the CV F1 score correlates with the competition leaderboard score.

In [12]:
# Candidate models as (name, pipeline) pairs. Every pipeline shares the same
# TF-IDF front end and differs only in the final estimator.
models = [
    (name, Pipeline([
        ('vectorizer', TfidfVectorizer()),
        ('classifier', estimator),
    ]))
    for name, estimator in (
        ('lr', LogisticRegression()),
        ('svc', SVC()),
        ('gbc', GradientBoostingClassifier()),
    )
]

In [13]:
# Cross-validate every candidate pipeline and summarise its fold scores.
#
# Fold data lives in fold-local names so this cell does not overwrite the
# hold-out split (X_train / X_test / y_train / y_test) or `pred`.
# NOTE(review): scores are macro-averaged F1 here, while the hold-out cell
# uses f1_score's default averaging - confirm which matches the leaderboard.
texts, labels = train['processed_text'], train['target']
cv = KFold()  # one splitter with default settings, shared by all models

results = []
for model_name, model in models:
    scores = []
    for fold_train_ids, fold_valid_ids in cv.split(train):
        model.fit(texts.iloc[fold_train_ids], labels.iloc[fold_train_ids])
        fold_pred = model.predict(texts.iloc[fold_valid_ids])
        scores.append(f1_score(labels.iloc[fold_valid_ids], fold_pred, average='macro'))

    # Keep the unrounded statistics and round only for display, so the
    # 3-sigma bounds are not computed from already-rounded numbers.
    scores_avg, scores_std = np.mean(scores), np.std(scores)

    results.append({
        'Model': model_name,
        'Avg': round(scores_avg, 4),
        'Std': round(scores_std, 4),
        'Min_Max': (round(np.min(scores), 4), round(np.max(scores), 4)),
        'Lower_higher_bound': (
            round(scores_avg - 3 * scores_std, 4),
            round(scores_avg + 3 * scores_std, 4),
        ),
    })

    # Refit on the full training set so the model can predict the test set.
    model.fit(texts, labels)
pd.DataFrame(results)

Unnamed: 0,Model,Avg,Std,Min_Max,Lower_higher_bound
0,lr,0.7301,0.0247,"(0.7053, 0.7748)","(0.656, 0.8042)"
1,svc,0.7137,0.0278,"(0.6762, 0.7531)","(0.6303, 0.7971)"
2,gbc,0.6064,0.0187,"(0.5768, 0.6325)","(0.5503, 0.6625)"


### Kaggle submission

In [14]:
# Write one submission CSV per model into DATA_DIR/submissions.
# The directory is created first because to_csv does not create missing
# parent directories and would fail on a fresh checkout.
submissions_dir = f'{DATA_DIR}/submissions'
os.makedirs(submissions_dir, exist_ok=True)

for model_name, model in models:
    submission = pd.DataFrame({
        'id': test['id'],
        'target': model.predict(test['processed_text']),
    })
    submission.to_csv(f'{submissions_dir}/{model_name}_submission.csv', index=False)

![title](images/nlp_scores.png)

As we can see, the leaderboard score falls within the cross-validation bounds for logistic regression and SVC, but it is much higher than the cross-validation average for GBC.

The leaderboard score correlates with the CV score: the GBC score is the lowest, while the LR and SVC scores are similar.

## Adversarial validation
Train a classifier to predict whether a text comes from the train set or the test set.

In [15]:
# Build the adversarial-validation dataset: the processed texts of both sets,
# labelled 0 when the row comes from `train` and 1 when it comes from `test`.
av_train = train[['processed_text']].assign(target=0)
av_test = test[['processed_text']].assign(target=1)

# ignore_index gives a fresh 0..n-1 index without keeping the old row labels
# as a stray `index` column (which a bare reset_index() would add).
av_df = pd.concat([av_train, av_test], ignore_index=True)

In [16]:
# Seeded split of the combined train/test texts and their origin labels.
av_X_train, av_X_test, av_y_train, av_y_test = train_test_split(
    av_df['processed_text'],
    av_df['target'],
    random_state=76,
)

In [17]:
# Refit the baseline TF-IDF + logistic-regression pipeline on the
# adversarial task (label 0 = train row, 1 = test row).
# NOTE(review): this reuses the same `classifier` object that was fitted on
# the hold-out split earlier, so that earlier fit is replaced from here on.
classifier.fit(av_X_train, av_y_train)

In [18]:
# Second predict_proba column; with the 0/1 labels used here that is
# presumably the score for label 1 (rows from `test`) - verify via classes_.
pred = classifier.predict_proba(av_X_test)[:, 1]
# ROC AUC of the train-vs-test classifier on the held-out part.
roc_auc_score(av_y_test, pred)

0.4940137361038356

Since ROC AUC ≈ 0.5, the model cannot distinguish train rows from test rows, so there is no significant feature distribution shift between the two sets.