In [43]:
# ! conda install -q -y -c conda-forge spacy
# ! conda install -q -y -c anaconda nltk
# ! python -m spacy download en_core_web_md
# ! python -m spacy download es_core_news_md
# ! pip install --quiet langdetect

In [147]:
import spacy
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import fbeta_score, make_scorer, recall_score, precision_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, VectorizerMixin

In [None]:
# Fetch NLTK's averaged-perceptron POS-tagger data; presumably needed by the
# nltk.pos_tag call made later in this notebook.
nltk.download('averaged_perceptron_tagger')

In [47]:
import re
from re import sub, split, findall
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import strip_accents_unicode

def clean_html(s):
    """Strip URLs, @mentions and HTML markup from ``s`` and return plain text.

    The steps run in this order:
      1. every ``http://`` / ``https://`` URL is deleted,
      2. every ``@word`` mention is deleted,
      3. whatever remains is parsed with BeautifulSoup (html5lib) and only
         its text content is kept.

    This is best-effort: an empty remainder yields ``''`` without parsing,
    a ``UserWarning`` raised while cleaning yields ``''`` silently, and any
    other exception (for example a non-string ``s``) is printed and also
    yields ``''``.
    """
    try:
        without_urls = sub(r'https?://[^\s]+', '', s)
        without_mentions = sub(r'@\w+', '', without_urls)  # drop tweet @handles
        if not without_mentions:
            return ''
        return BeautifulSoup(without_mentions, 'html5lib').get_text()
    except UserWarning:
        return ''
    except Exception as e:
        print(e)
        return ''

def split_numbers(s):
    """Isolate digit runs that are glued to trailing non-digit characters.

    Each match of "one or more digits immediately followed by one or more
    characters that are neither digits nor whitespace" is replaced by the
    digits alone with a single space on each side. The glued suffix itself
    is dropped, e.g. ``'10km away'`` -> ``' 10  away'`` and
    ``'3.5'`` -> ``' 3 5'``. Text without such a match is returned unchanged.
    """
    # Raw string literal: in a plain literal '\d' and '\s' are invalid escape
    # sequences, which newer Python versions warn about at compile time.
    return ' '.join(split(r'(\d+)[^\d\s]+', s))

def round_numbers(m, lim = 300):
    """Map a regex match of a digit run onto a coarse size token.

    ``m`` is a match object whose group 1 holds the digits. Values below 1
    produce the empty string, values below ``lim`` produce ``'SMALLNUMBER'``
    and everything else produces ``'LARGENUMBER'``.
    """
    value = int(m.group(1))
    if value < 1:
        return ''
    return 'SMALLNUMBER' if value < lim else 'LARGENUMBER'

def tokenize_numbers(s):
    """Replace every run of digits in ``s`` with the token chosen by round_numbers.

    Each digit run is passed to ``round_numbers`` (with its default limit) as
    a match object and substituted by that function's return value.
    """
    # Raw string literal: '\d' in a plain literal is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    return sub(r'(\d+)', round_numbers, s)

def tokenize_short(s, lim = 5):
    """Flag short texts by prefixing them with ``'SHORTARTICLE '``.

    A token is a run of two or more word characters. When ``s`` contains
    fewer than ``lim`` such tokens the marker is prepended; otherwise ``s``
    is returned untouched.
    """
    word_count = len(re.findall(r"(?u)\b\w\w+\b", s))
    return s if word_count >= lim else 'SHORTARTICLE ' + s

def format_numbers(s):
    """Remove commas that sit directly between two digits.

    This joins comma-grouped numbers into a single digit run, e.g.
    ``'1,000,000'`` -> ``'1000000'``. Commas next to anything other than a
    digit on both sides (``'a, b'``, ``'1, 2'``) are left alone.
    """
    # Lookbehind/lookahead match only the comma, so the digits around it stay
    # available for the next match. The earlier '(\d+),(\d+)' pattern consumed
    # the digits after the comma and therefore left every second separator in
    # place ('1,000,000' became '1000,000').
    return sub(r'(?<=\d),(?=\d)', '', s)

def preprocessor(s):
    """Run the full text-normalisation pipeline on one raw document.

    Order of operations: clean_html, format_numbers, split_numbers,
    tokenize_numbers, lower-casing plus accent stripping, and finally
    tokenize_short. Because tokenize_short runs after lower-casing, its
    marker keeps its upper-case spelling while the number tokens inserted
    earlier are lower-cased.
    """
    for step in (clean_html, format_numbers, split_numbers, tokenize_numbers):
        s = step(s)
    return tokenize_short(strip_accents_unicode(s.lower()))

In [None]:
# Load the spaCy pipeline named 'en_core_web_md'; analyzer() reads this
# module-level `nlp` to parse text it detects as English.
nlp = spacy.load('en_core_web_md')

In [235]:
import re
from langdetect import detect
from sklearn.feature_extraction.text import VectorizerMixin

def ngrammer(tokens, ngram_range):
    """Expand a token list into space-joined word n-grams.

    Parameters
    ----------
    tokens : sequence of str
        Unigrams in document order.
    ngram_range : (min_n, max_n)
        Inclusive range of n-gram sizes.

    Returns
    -------
    list of str
        When ``min_n`` is 1 the unigrams come first, followed by all 2-grams,
        then all 3-grams, and so on up to ``max_n``. When ``max_n`` is 1 the
        tokens are returned as-is.

    This mirrors the output order of scikit-learn's word n-gram helper but is
    implemented locally, because the private ``VectorizerMixin`` class it used
    to borrow is no longer available in current scikit-learn releases.
    """
    min_n, max_n = ngram_range
    tokens = list(tokens)
    if max_n == 1:
        return tokens

    # Unigrams are copied through verbatim; larger n-grams are built below.
    ngrams = list(tokens) if min_n == 1 else []
    start = min_n + 1 if min_n == 1 else min_n
    # Never ask for n-grams longer than the document itself.
    stop = min(max_n, len(tokens))
    for n in range(start, stop + 1):
        ngrams.extend(' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
    return ngrams

def analyzer(s, ngram_range = (1,2)):
    """Turn one raw document into the n-gram features fed to the vectorizer.

    The text is first normalised with ``preprocessor``. If langdetect reports
    English, the text is parsed with the module-level spaCy pipeline ``nlp``;
    stop words and tokens whose dependency label is empty or ``'punct'`` are
    dropped and each remaining token becomes ``'<lemma>:<dependency>'``.
    For any other detected language the unigrams are simply the runs of two
    or more word characters. The unigrams are then expanded by ``ngrammer``
    using ``ngram_range``.
    """
    s = preprocessor(s)
    if detect(s) == 'en':
        unigrams = [
            ':'.join([t.lemma_, t.dep_])
            for t in nlp(s)
            if not t.is_stop and t.dep_ not in ['', 'punct']
        ]
    else:
        unigrams = re.findall(r"(?u)\b\w\w+\b", s)
    return ngrammer(unigrams, ngram_range)

    # tokens = pattern.findall(s)
    # with_pos = nltk.pos_tag(tokens)
    # return [':'.join(w) for w in with_pos]

In [236]:
import pandas as pd

# Parse the training CSV once; it supplies both the tweets and their labels
# (it was previously read from disk twice).
train_df = pd.read_csv('kaggle/train.csv')
X_train = train_df.tweet
y_train = train_df.label
# Test labels and test tweets come from two separate files.
y_test = pd.read_csv('kaggle/solution.csv').label
X_test = pd.read_csv('kaggle/test.csv').tweet

In [237]:
def vectorizer(X_train, X_test, analyzer = analyzer):
    """Build TF-IDF matrices for the train and test documents.

    Both inputs are concatenated (train rows first), a single
    ``TfidfVectorizer`` using ``analyzer`` is fit on the combined documents,
    and the resulting matrix is split back at the train/test boundary.

    Returns
    -------
    (train_matrix, test_matrix)
        Row order matches ``X_train`` and ``X_test`` respectively.

    NOTE(review): the vocabulary and IDF weights are learned from train and
    test documents together — confirm this is intended for the evaluation.
    """
    idx = X_train.shape[0]
    X = pd.concat([X_train, X_test])
    # fit_transform analyses every document once; the earlier
    # fit(X).transform(X) chain ran the analyzer over each document twice.
    vector = TfidfVectorizer(analyzer = analyzer).fit_transform(X)
    return vector[0:idx], vector[idx:]

In [238]:
# Vectorise the train and test tweets with the default analyzer; V_train and
# V_test keep the row order of X_train and X_test.
V_train, V_test = vectorizer(X_train, X_test)

In [239]:
from sklearn.naive_bayes import MultinomialNB

# `base_model` is not defined anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel. Use the MultinomialNB imported in this cell with
# its default priors (an earlier variant passed class_prior=[0.5, 0.5]).
# NOTE(review): confirm this is the classifier the recorded scores came from.
model = MultinomialNB()
model.fit(V_train, y_train)
# predict_proba returns one column per class. Keep column 1 so that the later
# `three_preds > .5` threshold yields one prediction per test row.
# Assumes binary labels where the second class is the positive one — TODO confirm.
three_preds = model.predict_proba(V_test)[:, 1]

In [241]:
from sklearn.svm import LinearSVC

# Second model: a linear SVM trained on the same TF-IDF features.
# tol=1e-5 is the same value that was previously spelled 10e-6.
model = LinearSVC(tol=1e-5, max_iter=8000)
# Signed decision scores for the test rows; a later cell thresholds them.
three_svc = model.fit(V_train, y_train).decision_function(V_test)

In [240]:
# Predictions from the first model: scores above 0.5 count as positive.
p = three_preds > .5
# Precision, recall and F1 against y_test, printed space-separated on one line.
print(*(metric(y_test, p) for metric in (precision_score, recall_score, f1_score)))
# Fraction of test rows predicted positive (shown as the cell's output).
p.sum() / len(y_test)

0.9375 0.759493670886 0.839160839161


0.25974025974025972

In [242]:
# Predictions from the SVM: decision scores above -0.3 count as positive.
p = three_svc > -.3
# Precision, recall and F1 against y_test, printed space-separated on one line.
print(*(metric(y_test, p) for metric in (precision_score, recall_score, f1_score)))
# Fraction of test rows predicted positive (shown as the cell's output).
p.sum() / len(y_test)

0.869918699187 0.812658227848 0.840314136126


0.29951298701298701

In [9]:

# Quick check of NLTK's part-of-speech tagger on a whitespace-split sentence.
nltk.pos_tag('a migrant was stabbed'.split())

[('a', 'DT'), ('migrant', 'NN'), ('was', 'VBD'), ('stabbed', 'VBN')]

In [None]:
# Load the pipeline by its package name: the bare 'en' shortcut link is not
# supported by current spaCy releases. Using the same model name as the earlier
# load also means re-running this cell no longer swaps the model that
# analyzer() picks up through the module-level `nlp`.
# NOTE(review): confirm which model the old 'en' link pointed to.
nlp = spacy.load('en_core_web_md')

In [223]:
from notebook.auth import passwd



'sha1:ed21921c94d1:2af52eda4765ea12514f45468fac48418c6c8ec5'