In [7]:
import numpy as np
import pandas as pd
from contextlib import contextmanager
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import time
import re
import string
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import gc
from collections import defaultdict
import os
import psutil

In [29]:
# Contraction replacement patterns
cont_patterns = [
    (b'(W|w)on\'t', b'will not'),
    (b'(C|c)an\'t', b'can not'),
    (b'(I|i)\'m', b'i am'),
    (b'(A|a)in\'t', b'is not'),
    (b'(\w+)\'ll', b'\g<1> will'),
    (b'(\w+)n\'t', b'\g<1> not'),
    (b'(\w+)\'ve', b'\g<1> have'),
    (b'(\w+)\'s', b'\g<1> is'),
    (b'(\w+)\'re', b'\g<1> are'),
    (b'(\w+)\'d', b'\g<1> would'),
]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

In [8]:
@contextmanager
def timer(name):
    """
    https://www.kaggle.com/lopuhin/mercari-golf-0-3875-cv-in-75-loc-1900-s
    """
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [16]:
def prepare_for_char_n_gram(text):
    """ Simple text clean up process"""
    # 1. Go to lower case (only good for english)
    # Go to bytes_strings as I had issues removing all \n in r""
    clean = bytes(text.lower(), encoding="utf-8")
    # 2. Drop \n and  \t
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")
    # 3. Replace english contractions
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop puntuation
    # I could have used regex package with regex.sub(b"\p{P}", " ")
    exclude = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    # 5. Drop numbers - as a scientist I don't think numbers are toxic ;-)
    clean = re.sub(b"\d+", b" ", clean)
    # 6. Remove extra spaces - At the end of previous operations we multiplied space accurences
    clean = re.sub(b'\s+', b' ', clean)
    # Remove ending space if any
    clean = re.sub(b'\s+$', b'', clean)
    # 7. Now replace words by words surrounded by # signs
    # e.g. my name is bond would become #my# #name# #is# #bond#
    # clean = re.sub(b"([a-z]+)", b"#\g<1>#", clean)
    clean = re.sub(b" ", b"# #", clean)  # Replace space
    clean = b"#" + clean + b"#"  # add leading and trailing #

    return str(clean, 'utf-8')

In [9]:
def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    return len(re.findall(regexp, text))

In [13]:
def get_indicators_and_clean_comments(df):
    """
    Check all sorts of content as it may help find toxic comment
    Though I'm not sure all of them improve scores
    """
    # Count number of \n
    df["ant_slash_n"] = df["description"].apply(lambda x: count_regexp_occ(r"\n", x))
    # Get length in words and characters
    df["raw_word_len"] = df["description"].apply(lambda x: len(x.split()))
    df["raw_char_len"] = df["description"].apply(lambda x: len(x))
    # Check number of upper case, if you're angry you may write in upper case
    df["nb_upper"] = df["description"].apply(lambda x: count_regexp_occ(r"[A-Z]", x))
    # Number of F words - f..k contains folk, fork,
    df["nb_fk"] = df["description"].apply(lambda x: count_regexp_occ(r"[Ff]\S{2}[Kk]", x))
    # Number of S word
    df["nb_sk"] = df["description"].apply(lambda x: count_regexp_occ(r"[Ss]\S{2}[Kk]", x))
    # Number of D words
    df["nb_dk"] = df["description"].apply(lambda x: count_regexp_occ(r"[dD]ick", x))
    # Number of occurence of You, insulting someone usually needs someone called : you
    df["nb_you"] = df["description"].apply(lambda x: count_regexp_occ(r"\W[Yy]ou\W", x))
    # Just to check you really refered to my mother ;-)
    df["nb_mother"] = df["description"].apply(lambda x: count_regexp_occ(r"\Wmother\W", x))
    # Just checking for toxic 19th century vocabulary
    df["nb_ng"] = df["description"].apply(lambda x: count_regexp_occ(r"\Wnigger\W", x))
    # Some Sentences start with a <:> so it may help
    df["start_with_columns"] = df["description"].apply(lambda x: count_regexp_occ(r"^\:+", x))
    # Check for time stamp
    df["has_timestamp"] = df["description"].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", x))
    # Check for dates 18:44, 8 December 2010
    df["has_date_long"] = df["description"].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x))
    # Check for date short 8 December 2010
    df["has_date_short"] = df["description"].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x))
    # Check for http links
    df["has_http"] = df["description"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
    # check for mail
    df["has_mail"] = df["description"].apply(
        lambda x: count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x)
    )
    # Looking for words surrounded by == word == or """" word """"
    df["has_emphasize_equal"] = df["description"].apply(lambda x: count_regexp_occ(r"\={2}.+\={2}", x))
    df["has_emphasize_quotes"] = df["description"].apply(lambda x: count_regexp_occ(r"\"{4}\S+\"{4}", x))

    # Now clean comments
    df["clean_comment"] = df["description"].apply(lambda x: prepare_for_char_n_gram(x))

    # Get the new length in words and characters
    df["clean_word_len"] = df["clean_comment"].apply(lambda x: len(x.split()))
    df["clean_char_len"] = df["clean_comment"].apply(lambda x: len(x))
    # Number of different characters used in a comment
    # Using the f word only will reduce the number of letters required in the comment
    df["clean_chars"] = df["clean_comment"].apply(lambda x: len(set(x)))
    df["clean_chars_ratio"] = df["clean_comment"].apply(lambda x: len(set(x))) / df["clean_comment"].apply(
        lambda x: 1 + min(99, len(x)))

In [22]:
with timer("Reading input files"):
    train = pd.read_csv('train.csv').fillna(' ')
    test = pd.read_csv('test.csv').fillna(' ')

[Reading input files] done in 1 s


In [28]:
test.head()

Unnamed: 0,index,country,description,designation,points,price,province,region_1,region_2,taster_name,...,has_date_short,has_http,has_mail,has_emphasize_equal,has_emphasize_quotes,clean_comment,clean_word_len,clean_char_len,clean_chars,clean_chars_ratio
0,41855,US,"Sweet new oak stands out, giving this Cabernet...",3D,95.036469,,California,St. Helena,Napa,,...,0,0,0,0,0,#sweet# #new# #oak# #stands# #out# #giving# #t...,58,447,25,0.25
1,10328,France,Charming raspberry aromas intertwine with scen...,Heluicum,90.966405,,Rhône Valley,Collines Rhôdaniennes,,,...,0,0,0,0,0,#charming# #raspberry# #aromas# #intertwine# #...,37,296,25,0.25
2,60094,US,"Ripe blackberry, leather and soy show on the n...",Estate,88.964358,,California,Santa Ynez Valley,Central Coast,Matt Kettmann,...,0,0,0,0,0,#ripe# #blackberry# #leather# #and# #soy# #sho...,55,405,25,0.25
3,48333,US,White flowers and wild anise give this a crisp...,,89.960356,,California,Sonoma Coast,Sonoma,Virginie Boone,...,0,0,0,0,0,#white# #flowers# #and# #wild# #anise# #give# ...,32,255,24,0.24
4,14498,US,Pinot Gris has been making inroads in Washingt...,,88.075501,,Washington,Columbia Valley (WA),Columbia Valley,Sean P. Sullivan,...,0,0,0,0,0,#pinot# #gris# #has# #been# #making# #inroads#...,35,268,24,0.24


In [24]:
with timer("Performing basic NLP"):
    get_indicators_and_clean_comments(train)
    get_indicators_and_clean_comments(test)

[Performing basic NLP] done in 53 s


In [26]:
train.columns

Index(['country', 'description', 'designation', 'points', 'price', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'id', 'ant_slash_n', 'raw_word_len',
       'raw_char_len', 'nb_upper', 'nb_fk', 'nb_sk', 'nb_dk', 'nb_you',
       'nb_mother', 'nb_ng', 'start_with_columns', 'has_timestamp',
       'has_date_long', 'has_date_short', 'has_http', 'has_mail',
       'has_emphasize_equal', 'has_emphasize_quotes', 'clean_comment',
       'clean_word_len', 'clean_char_len', 'clean_chars', 'clean_chars_ratio'],
      dtype='object')

In [27]:
# Scaling numerical features with MinMaxScaler though tree boosters don't need that
with timer("Creating numerical features"):
    num_features = [f_ for f_ in train.columns
                    if f_ not in ["comment_text", "clean_comment", "id", "remaining_chars",
                                      'has_ip_address', 'target','country', 'description', 'designation', 'points', 'province',
       'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title',
       'variety', 'winery', 'id']]
    
    skl = MinMaxScaler()
    train_num_features = csr_matrix(skl.fit_transform(train[num_features]))
    test_num_features = csr_matrix(skl.fit_transform(test[num_features]))
  
    
# Get TF-IDF features
train_text = train['description']
test_text = test['description']
all_text = pd.concat([train_text, test_text])
# First on real words

  return self.partial_fit(X, y)


ValueError: could not convert string to float: 

In [None]:
train_df = pd.read_pickle("data/train_df.pkl")