In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import nltk
import IPython
import scipy
import math
import seaborn as sns
import gensim.downloader as api
from collections import Counter
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier,RandomForestClassifier,ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,make_scorer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm import tqdm_notebook
from string import punctuation
# Fetch the NLTK resources this notebook relies on: 'stopwords' (stop-word
# list), 'punkt' (sentence/word tokenizers) and 'wordnet' (WordNetLemmatizer).
# All three now use the same flags, so a failed download stops the notebook
# here rather than when the resource is first used.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource, quiet=True, raise_on_error=True)

In [None]:
# Load the pickled train and test frames.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code;
# only point this at files from a trusted source.
train_df = pd.read_pickle("data/train_df.pkl")
test_df = pd.read_pickle("data/test_df.pkl")

In [None]:
# Peek at the first two rows of the freshly loaded training frame.
train_df.head(2)

## Preprocess

In [None]:
# Join each row's sequence of strings into one newline-separated string.
# NOTE: this cell is not idempotent -- re-running it joins the already-joined
# strings character by character.
train_df["text"] = train_df["text"].apply('\n'.join)
joined_keywords = train_df["keywords"].apply('\n'.join)
train_df["keywords"] = joined_keywords.replace('', "empty")

# Swap empty strings in the label columns for a per-column placeholder token.
for column in ("accepted_product", "accepted_function",
               "rejected_product", "rejected_function"):
    train_df[column] = train_df[column].replace("", "missing_" + column)

In [None]:
# Number of sentences per document according to NLTK's sentence tokenizer.
sentences_count = [
    len(nltk.sent_tokenize(document))
    for document in tqdm_notebook(train_df['text'])
]
train_df['sentences_count'] = pd.DataFrame(sentences_count)

In [None]:
# Per-document (token count, distinct token count).
# `tok not in punctuation` is a substring test against string.punctuation, so
# it drops single punctuation marks (and any token that happens to be a
# substring of that constant).
token_stats = []
for document in tqdm_notebook(train_df['text']):
    tokens = [tok for tok in nltk.word_tokenize(document) if tok not in punctuation]
    token_stats.append((len(tokens), len(set(tokens))))
words_count = [total for total, _ in token_stats]
unique_words_count = [distinct for _, distinct in token_stats]
train_df['words_count'] = pd.DataFrame(words_count)
train_df['unique_words_count'] = pd.DataFrame(unique_words_count)

In [None]:
# Non-punctuation token count of each row's keywords string.
words_keywords_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(train_df['keywords'])
]
train_df['words_keywords_count'] = pd.DataFrame(words_keywords_count)

In [None]:
# Non-punctuation token count of each row's accepted_function string.
words_accepted_function_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(train_df['accepted_function'])
]
train_df['words_accepted_function_count'] = pd.DataFrame(words_accepted_function_count)

In [None]:
# Non-punctuation token count of each row's rejected_function string.
words_rejected_function_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(train_df['rejected_function'])
]
train_df['words_rejected_function_count'] = pd.DataFrame(words_rejected_function_count)

In [None]:
# Non-punctuation token count of each row's accepted_product string.
words_accepted_product_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(train_df['accepted_product'])
]
train_df['words_accepted_product_count'] = pd.DataFrame(words_accepted_product_count)

In [None]:
# Non-punctuation token count of each row's rejected_product string.
words_rejected_product_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(train_df['rejected_product'])
]
train_df['words_rejected_product_count'] = pd.DataFrame(words_rejected_product_count)

In [None]:
# Character length of each string column, stored under the paired feature name.
for source_column, feature_column in (
    ('text', 'character_count'),
    ('accepted_function', 'accepted_function_character_count'),
    ('rejected_function', 'rejected_function_character_count'),
    ('accepted_product', 'accepted_product_character_count'),
    ('rejected_product', 'rejected_product_character_count'),
):
    train_df[feature_column] = train_df[source_column].apply(len)

In [None]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader
# (imported above as `api`); used below for keyword/label similarity features.
wordvecm = api.load('word2vec-google-news-300')

In [None]:
# Quick sanity check that the loaded vectors answer similarity queries.
wordvecm.similarity('france', 'spain')

In [None]:
# Average word2vec similarity over (keyword, accepted function) pairs, per row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on train_df having a 0..N-1 index.
accepted_function_similarity = []
row_pairs = zip(train_df['keywords'], train_df['accepted_function'])
for keywords, functions in tqdm_notebook(row_pairs, total=len(train_df)):
    keyword_tokens = keywords.split()
    function_names = functions.split(',')
    # NOTE(review): the denominator counts whitespace-separated tokens while the
    # pairs come from a comma split. Kept unchanged to preserve the feature's
    # scale -- confirm which tokenisation is intended.
    denominator = len(keyword_tokens) * len(functions.split())
    sum_sim = 0
    for keyword in keyword_tokens:
        for function_name in function_names:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and function_name in wordvecm:
                sum_sim += wordvecm.similarity(keyword, function_name)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    accepted_function_similarity.append(sum_sim / denominator if denominator else 0.0)
train_df['accepted_function_similarity'] = accepted_function_similarity
sns.distplot(train_df['accepted_function_similarity'])

In [None]:
# Average word2vec similarity over (keyword, rejected function) pairs, per row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on train_df having a 0..N-1 index.
rejected_function_similarity = []
row_pairs = zip(train_df['keywords'], train_df['rejected_function'])
for keywords, functions in tqdm_notebook(row_pairs, total=len(train_df)):
    keyword_tokens = keywords.split()
    function_names = functions.split(',')
    # NOTE(review): the denominator counts whitespace-separated tokens while the
    # pairs come from a comma split. Kept unchanged to preserve the feature's
    # scale -- confirm which tokenisation is intended.
    denominator = len(keyword_tokens) * len(functions.split())
    sum_sim = 0
    for keyword in keyword_tokens:
        for function_name in function_names:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and function_name in wordvecm:
                sum_sim += wordvecm.similarity(keyword, function_name)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    rejected_function_similarity.append(sum_sim / denominator if denominator else 0.0)
train_df['rejected_function_similarity'] = rejected_function_similarity
sns.distplot(train_df['rejected_function_similarity'])

In [None]:
# Average word2vec similarity over (keyword, accepted product token) pairs, per row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on train_df having a 0..N-1 index.
accepted_product_similarity = []
row_pairs = zip(train_df['keywords'], train_df['accepted_product'])
for keywords, products in tqdm_notebook(row_pairs, total=len(train_df)):
    keyword_tokens = keywords.split()
    # Whitespace tokens with surrounding commas stripped before the vocabulary lookup.
    product_tokens = [token.strip(',') for token in products.split()]
    denominator = len(keyword_tokens) * len(product_tokens)
    sum_sim = 0
    for keyword in keyword_tokens:
        for product_token in product_tokens:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and product_token in wordvecm:
                sum_sim += wordvecm.similarity(keyword, product_token)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    accepted_product_similarity.append(sum_sim / denominator if denominator else 0.0)
train_df['accepted_product_similarity'] = accepted_product_similarity
sns.distplot(train_df['accepted_product_similarity'])

In [None]:
# Average word2vec similarity over (keyword, rejected product token) pairs, per row.
# Scores are collected in (and assigned from) rejected_product_similarity; the
# earlier version appended to accepted_product_similarity, which made this
# column a copy of the accepted-product feature.
rejected_product_similarity = []
row_pairs = zip(train_df['keywords'], train_df['rejected_product'])
for keywords, products in tqdm_notebook(row_pairs, total=len(train_df)):
    keyword_tokens = keywords.split()
    # Whitespace tokens with surrounding commas stripped before the vocabulary lookup.
    product_tokens = [token.strip(',') for token in products.split()]
    denominator = len(keyword_tokens) * len(product_tokens)
    sum_sim = 0
    for keyword in keyword_tokens:
        for product_token in product_tokens:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and product_token in wordvecm:
                sum_sim += wordvecm.similarity(keyword, product_token)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    rejected_product_similarity.append(sum_sim / denominator if denominator else 0.0)
# Plain-list assignment is positional and raises on a length mismatch.
train_df['rejected_product_similarity'] = rejected_product_similarity
sns.distplot(train_df['rejected_product_similarity'])

In [None]:
# Inspect the first two rows again, now with the engineered columns added.
train_df.head(2)

# Vectorization

In [None]:
# English stop words in two forms: a set for membership tests, and the list
# re-tokenized with word_tokenize (passed to TfidfVectorizer as stop_words).
english_stop_words = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_words)
tokenized_stop_words = nltk.word_tokenize(' '.join(english_stop_words))

In [None]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``data[key]`` from its input.

    Used as the first step of each feature pipeline to pick one column
    (a string key) or several columns (a list of keys) out of the frame.
    """
  
    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, x, y=None):
        # Nothing is learned; returns self so the step can sit in a Pipeline.
        return self

    def transform(self, data_dict):
        # Plain subscript lookup; the result type depends on the input container and key.
        return data_dict[self.key]

In [None]:
class Tokenizer(object):
    """Callable tokenizer: NLTK word tokenization, then lemmatize-and-stem per token."""

    def __init__(self):
        self.stemmer = nltk.stem.PorterStemmer()
        self.lemmer = nltk.stem.WordNetLemmatizer()

    def _stem(self, token):
        """Return the Porter stem of the token's WordNet lemma; stop words pass through."""
        # Stop words are left untouched, which avoids sklearn's
        # "UserWarning: Your stop_words may be inconsistent with your preprocessing."
        if token not in stop_words:
            lemma = self.lemmer.lemmatize(token)
            return self.stemmer.stem(lemma)
        return token

    def __call__(self, line):
        """Tokenize `line` and return the list of processed tokens."""
        return [self._stem(token) for token in nltk.word_tokenize(line)]

In [None]:
# Hand-crafted numeric columns, in the order they are fed to the scaler.
# NOTE(review): 'character_count' is listed twice and 'words_keywords_count' is
# never selected. Left as-is because changing the list changes the width of the
# feature matrix (and any cached matrices built from it) -- confirm intent.
numeric_columns = [
    'sentences_count', 'character_count', 'unique_words_count', 'words_count',
    'words_accepted_function_count', 'words_rejected_function_count',
    'words_accepted_product_count', 'words_rejected_product_count',
    'character_count', 'accepted_function_character_count',
    'rejected_function_character_count', 'accepted_product_character_count',
    'rejected_product_character_count', 'accepted_function_similarity',
    'rejected_function_similarity', 'accepted_product_similarity',
    'rejected_product_similarity',
]


def _comma_separated_counts(column):
    """Pipeline selecting `column` and counting its comma-delimited entries."""
    return Pipeline([
        ('selector', ItemSelector(key=column)),
        # Step keeps the name 'tfidf' although it is a CountVectorizer.
        ('tfidf', CountVectorizer(ngram_range=(1, 1), token_pattern='[^,]+')),
    ])


counts_pipeline = Pipeline([
    ('selector', ItemSelector(key=numeric_columns)),
    ('normalized', MinMaxScaler()),
])

text_pipeline = Pipeline([
    ('selector', ItemSelector(key='text')),
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=300000,
        tokenizer=Tokenizer(),
        stop_words=tokenized_stop_words,
    )),
])

keywords_pipeline = Pipeline([
    ('selector', ItemSelector(key='keywords')),
    ('tfidf', TfidfVectorizer(ngram_range=(1, 1), max_features=30000)),
])

label_columns = ('accepted_function', 'accepted_product',
                 'rejected_function', 'rejected_product')

# Feature blocks are concatenated in this order: numeric, text, keywords, labels.
vectorizer = FeatureUnion(
    [
        ('counts', counts_pipeline),
        ('text', text_pipeline),
        ('keywords', keywords_pipeline),
    ]
    + [(column, _comma_separated_counts(column)) for column in label_columns]
)

# Model training

In [None]:
# Cached training feature matrix. To rebuild the cache, fit the vectorizer and dump the result:
# train_X_vectorized_full = vectorizer.fit_transform(train_df.drop(["id", "html", "target"], axis=1))
# with open('train_X_vectorized_full.pkl','wb') as f: pkl.dump(train_X_vectorized_full, f)
# NOTE: allow_pickle=True lets np.load unpickle the file (presumably a pickle
# written as above, not an .npy array); only load a cache you trust.
train_X_vectorized_full = np.load('train_X_vectorized_full.pkl',allow_pickle=True)

In [None]:
# Load the previously fitted vectorizer from disk. To refresh the cache:
# with open('vectorizer.pkl', 'wb') as f: pkl.dump(vectorizer, f)
# The module is imported as `pkl`; the earlier `pickle.load(...)` raised a
# NameError. The `with` block also closes the file handle.
# SECURITY: unpickling executes arbitrary code -- only load a file you produced.
# NOTE(review): presumably needs ItemSelector and Tokenizer to be defined in
# this kernel before loading -- confirm against how the pickle was written.
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pkl.load(f)


In [None]:
# Per-class weights shared by the learners: class 2 counts double.
balance_target = {0: 1, 1: 1, 2: 2}

# Base learners of the stack.
SVC = LinearSVC(
    C=0.6,
    max_iter=2000,
    class_weight=balance_target,
    random_state=42,
)
logistic = LogisticRegression(
    C=0.6,
    max_iter=2000,
    class_weight=balance_target,
    n_jobs=-1,
    random_state=42,
)
extratree = ExtraTreesClassifier(
    n_estimators=100,
    n_jobs=-1,
    class_weight=balance_target,
    random_state=42,
)

# Constructed but not part of `estimators` below.
boosting = CatBoostClassifier(
    iterations=1000,
    class_weights=[1, 1, 2],
    random_state=42,
)

# Final estimator of the stack. NOTE(review): no random_state is set here.
boosting_lgb = lgb.LGBMClassifier(
    num_iterations=100,
    class_weight=balance_target,
    n_jobs=-1,
)

estimators = [
    ('logistic', logistic),
    ('SVC', SVC),
    ('ET', extratree),
]

# passthrough=True feeds the original features to the final estimator alongside
# the base learners' predictions.
model_stacked_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=boosting_lgb,
    stack_method='predict',
    n_jobs=-1,
    passthrough=True,
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
)

model_stacked_clf.fit(train_X_vectorized_full, train_df['target'])

# Test preprocessing

In [None]:
# Same preprocessing as the training frame: join each row's sequence of strings
# into one newline-separated string.
# NOTE: this cell is not idempotent -- re-running it joins the already-joined
# strings character by character.
test_df["text"] = test_df["text"].apply('\n'.join)
joined_test_keywords = test_df["keywords"].apply('\n'.join)
test_df["keywords"] = joined_test_keywords.replace('', "empty")

# Swap empty strings in the label columns for a per-column placeholder token.
for column in ("accepted_product", "accepted_function",
               "rejected_product", "rejected_function"):
    test_df[column] = test_df[column].replace("", "missing_" + column)

In [None]:
# Number of sentences per test document according to NLTK's sentence tokenizer.
test_sentences_count = [
    len(nltk.sent_tokenize(document))
    for document in tqdm_notebook(test_df['text'])
]
test_df['sentences_count'] = pd.DataFrame(test_sentences_count)

In [None]:
# Per-document (token count, distinct token count) for the test texts.
# `tok not in punctuation` is a substring test against string.punctuation.
test_token_stats = []
for document in tqdm_notebook(test_df['text']):
    tokens = [tok for tok in nltk.word_tokenize(document) if tok not in punctuation]
    test_token_stats.append((len(tokens), len(set(tokens))))
test_words_count = [total for total, _ in test_token_stats]
test_unique_words_count = [distinct for _, distinct in test_token_stats]
test_df['words_count'] = pd.DataFrame(test_words_count)
test_df['unique_words_count'] = pd.DataFrame(test_unique_words_count)

In [None]:
# Character length of each test document.
test_text = test_df['text']
test_df['character_count'] = test_text.apply(len)

In [None]:
# Average word2vec similarity over (keyword, accepted function) pairs, per test row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on test_df having a 0..N-1 index.
accepted_function_similarity = []
row_pairs = zip(test_df['keywords'], test_df['accepted_function'])
for keywords, functions in tqdm_notebook(row_pairs, total=len(test_df)):
    keyword_tokens = keywords.split()
    function_names = functions.split(',')
    # NOTE(review): the denominator counts whitespace-separated tokens while the
    # pairs come from a comma split. Kept unchanged to preserve the feature's
    # scale -- confirm which tokenisation is intended.
    denominator = len(keyword_tokens) * len(functions.split())
    sum_sim = 0
    for keyword in keyword_tokens:
        for function_name in function_names:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and function_name in wordvecm:
                sum_sim += wordvecm.similarity(keyword, function_name)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    accepted_function_similarity.append(sum_sim / denominator if denominator else 0.0)
test_df['accepted_function_similarity'] = accepted_function_similarity
# sns.distplot(test_df['accepted_function_similarity'])

In [None]:
# Average word2vec similarity over (keyword, rejected function) pairs, per test row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on test_df having a 0..N-1 index.
rejected_function_similarity = []
row_pairs = zip(test_df['keywords'], test_df['rejected_function'])
for keywords, functions in tqdm_notebook(row_pairs, total=len(test_df)):
    keyword_tokens = keywords.split()
    function_names = functions.split(',')
    # NOTE(review): the denominator counts whitespace-separated tokens while the
    # pairs come from a comma split. Kept unchanged to preserve the feature's
    # scale -- confirm which tokenisation is intended.
    denominator = len(keyword_tokens) * len(functions.split())
    sum_sim = 0
    for keyword in keyword_tokens:
        for function_name in function_names:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and function_name in wordvecm:
                sum_sim += wordvecm.similarity(keyword, function_name)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    rejected_function_similarity.append(sum_sim / denominator if denominator else 0.0)
test_df['rejected_function_similarity'] = rejected_function_similarity
sns.distplot(test_df['rejected_function_similarity'])

In [None]:
# Non-punctuation token count of each test row's keywords string.
words_keywords_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(test_df['keywords'])
]
test_df['words_keywords_count'] = pd.DataFrame(words_keywords_count)

In [None]:
# Non-punctuation token count of each test row's accepted_function string.
words_accepted_function_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(test_df['accepted_function'])
]
test_df['words_accepted_function_count'] = pd.DataFrame(words_accepted_function_count)

In [None]:
# Non-punctuation token count of each test row's rejected_function string.
words_rejected_function_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(test_df['rejected_function'])
]
test_df['words_rejected_function_count'] = pd.DataFrame(words_rejected_function_count)

In [None]:
# Non-punctuation token count of each test row's accepted_product string.
words_accepted_product_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(test_df['accepted_product'])
]
test_df['words_accepted_product_count'] = pd.DataFrame(words_accepted_product_count)

In [None]:
# Non-punctuation token count of each test row's rejected_product string.
words_rejected_product_count = [
    sum(1 for tok in nltk.word_tokenize(entry) if tok not in punctuation)
    for entry in tqdm_notebook(test_df['rejected_product'])
]
test_df['words_rejected_product_count'] = pd.DataFrame(words_rejected_product_count)

In [None]:
# Average word2vec similarity over (keyword, accepted product token) pairs, per test row.
# Rows are walked positionally and the result is assigned as a plain list, so
# the feature does not depend on test_df having a 0..N-1 index.
accepted_product_similarity = []
row_pairs = zip(test_df['keywords'], test_df['accepted_product'])
for keywords, products in tqdm_notebook(row_pairs, total=len(test_df)):
    keyword_tokens = keywords.split()
    # Whitespace tokens with surrounding commas stripped before the vocabulary lookup.
    product_tokens = [token.strip(',') for token in products.split()]
    denominator = len(keyword_tokens) * len(product_tokens)
    sum_sim = 0
    for keyword in keyword_tokens:
        for product_token in product_tokens:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and product_token in wordvecm:
                sum_sim += wordvecm.similarity(keyword, product_token)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    accepted_product_similarity.append(sum_sim / denominator if denominator else 0.0)
test_df['accepted_product_similarity'] = accepted_product_similarity
sns.distplot(test_df['accepted_product_similarity'])

In [None]:
# Average word2vec similarity over (keyword, rejected product token) pairs, per test row.
# Two fixes over the earlier version: the product tokens are read from test_df
# (not train_df), and the scores are collected in and assigned from
# rejected_product_similarity (not accepted_product_similarity).
rejected_product_similarity = []
row_pairs = zip(test_df['keywords'], test_df['rejected_product'])
for keywords, products in tqdm_notebook(row_pairs, total=len(test_df)):
    keyword_tokens = keywords.split()
    # Whitespace tokens with surrounding commas stripped before the vocabulary lookup.
    product_tokens = [token.strip(',') for token in products.split()]
    denominator = len(keyword_tokens) * len(product_tokens)
    sum_sim = 0
    for keyword in keyword_tokens:
        for product_token in product_tokens:
            # `in wordvecm` is the KeyedVectors membership test; it replaces the
            # deprecated `wordvecm.wv.vocab` lookup, which gensim 4 removed.
            if keyword in wordvecm and product_token in wordvecm:
                sum_sim += wordvecm.similarity(keyword, product_token)
    # Whitespace-only fields yield no tokens; score them 0 instead of dividing by zero.
    rejected_product_similarity.append(sum_sim / denominator if denominator else 0.0)
# Plain-list assignment is positional and raises on a length mismatch.
test_df['rejected_product_similarity'] = rejected_product_similarity
sns.distplot(test_df['rejected_product_similarity'])

In [None]:
# Character length of each label column of the test frame.
# Every feature is derived from test_df itself; the earlier version computed
# 'rejected_function_character_count' from train_df, which filled the test
# column with training-row values aligned by index.
for source_column in ('accepted_function', 'rejected_function',
                      'accepted_product', 'rejected_product'):
    test_df[source_column + '_character_count'] = test_df[source_column].apply(len)

# Getting Results

In [None]:
# Cached test feature matrix. To rebuild the cache, transform with the fitted vectorizer and dump the result:
# test_df_vectorized = vectorizer.transform(test_df.drop(["id","html"],axis=1))
# with open('test_df_vectorized.pkl','wb') as f: pkl.dump(test_df_vectorized, f)
# NOTE: allow_pickle=True lets np.load unpickle the file; only load a cache you trust.
test_df_vectorized = np.load('test_df_vectorized.pkl',allow_pickle=True)

In [None]:
# Predict a target label for every row of the test feature matrix with the fitted stack.
pred = model_stacked_clf.predict(test_df_vectorized)

In [None]:
# Pair each test id with its predicted target, side by side.
# NOTE(review): concat(axis=1) aligns on index, so this assumes test_df has the
# default 0..N-1 index -- confirm.
predicted_targets = pd.DataFrame(pred, columns=["target"])
result = pd.concat([test_df["id"], predicted_targets], axis=1)

In [None]:
# Write the id/target table to disk; index=False keeps the row index out of the CSV.
result.to_csv("result.csv", index=False)