In [0]:
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [0]:
data_url = 'https://raw.githubusercontent.com/TatianaShavrina/hse_ml_m1/master/ensembles/complaints.csv'
data = pd.read_csv(data_url, sep='\t')
y = data["PRODUCT_ID"]
X = data["cleaned_text"]

In [0]:
import re
def normalize(data):
  res = []
  for item in data:
    res.append(' '.join([x for x in item.split() if len(x) > 3]))
  return res

In [0]:
X_norm = normalize(X)

# Voting

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.1, random_state=42)

In [0]:
def the_model(weights):
  clf1 = LogisticRegression(multi_class='multinomial',
                            solver='lbfgs',
                            random_state=1,
                            max_iter=300)
  clf2 = ExtraTreesClassifier(n_estimators=50,
                              random_state=1,
                              criterion='entropy',
                              max_depth=30,
                              min_samples_split=3)
  clf3 = GaussianNB()

  clf4 = MultinomialNB(alpha=0.1, fit_prior=True)

  clf5 = KNeighborsClassifier(n_neighbors=2) 

  eclf = VotingClassifier(
          estimators=[('lr', clf1), 
                      ('etc', clf2), 
                      ('gnb', clf3), 
                      ('mnb', clf4),
                      ('knc', clf5)], 
          voting='hard',
          weights=weights)

  voting = Pipeline([
      ('vect', CountVectorizer(analyzer='word', max_features=450)),
      ('tfidf', TfidfTransformer(sublinear_tf=True)),
      ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
      ('clf', eclf),
      ])
  voting = voting.fit(X_train, y_train)
  predictions = voting.predict(X_test)
  print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

In [124]:
the_model([1,2,1,1,1])

Precision:   0.74
Recall:   0.72
F1-measure:   0.72
Accuracy:   0.72


In [0]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz && gunzip cc.en.300.bin.gz

In [0]:
!pip install fasttext

In [77]:
import fasttext
ft = fasttext.load_model('cc.en.300.bin')




In [0]:
from collections import Counter,defaultdict
def get_embedding(text, model, dim):
    text = text.split()
    words = Counter(text)
    total = len(text)
    vectors = np.zeros((len(words), dim))
    
    for i,word in enumerate(words):
        try:
            v = model[word]
            vectors[i] = v*(words[word]/total) 
        except (KeyError, ValueError):
            continue
    
    if vectors.any():
        vector = np.average(vectors, axis=0)
    else:
        vector = np.zeros((dim))
    
    return vector

In [0]:
dim = 300
X_ft = np.zeros((len(X_norm), dim))
for i in range(len(X_norm)):
    X_ft[i] = get_embedding(X_norm[i], ft, dim)

X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_ft, y, test_size=0.1, random_state=42)

In [0]:
def the_model2(weights):
  clf1 = LogisticRegression(multi_class='multinomial',
                            solver='lbfgs',
                            random_state=1,
                            max_iter=300)
  clf2 = ExtraTreesClassifier(n_estimators=50,
                              random_state=1,
                              criterion='entropy',
                              max_depth=30,
                              min_samples_split=3)
  clf3 = GaussianNB()

  clf4 = KNeighborsClassifier(n_neighbors=2) 

  eclf = VotingClassifier(
          estimators=[('lr', clf1), 
                      ('etc', clf2), 
                      ('gnb', clf3), 
                      ('knc', clf4)], 
          voting='hard',
          weights=weights)

  voting = Pipeline([
      # ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
      ('clf', eclf),
      ])
  voting = voting.fit(X_train_2, y_train_2)
  predictions = voting.predict(X_test_2)
  print("Precision: {0:6.2f}".format(precision_score(y_test_2, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test_2, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test_2, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test_2, predictions)))

In [158]:
the_model2([1,1,1,1])

Precision:   0.50
Recall:   0.42
F1-measure:   0.37
Accuracy:   0.43


# Bagging

In [0]:
from sklearn.ensemble import BaggingClassifier

def the_model3(base_clf):
  clf = BaggingClassifier(base_estimator=base_clf,
                          n_estimators=20,
                          max_samples=0.7,
                          max_features=0.8,
                          bootstrap=False,
                          bootstrap_features=False)

  pipeline = Pipeline([
      ('vect', CountVectorizer(analyzer='word', max_features=450)),
      ('tfidf', TfidfTransformer(sublinear_tf=True)),
      ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
      ('clf', clf),
      ])
  voting = pipeline.fit(X_train, y_train)
  predictions = pipeline.predict(X_test)
  print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

In [183]:
clfs = [LogisticRegression(multi_class='multinomial',
                          solver='lbfgs',
                          random_state=1,
                          max_iter=300),
        ExtraTreesClassifier(n_estimators=50,
                            random_state=1,
                            criterion='entropy',
                            max_depth=30,
                            min_samples_split=3),
        GaussianNB(),
        MultinomialNB(alpha=0.1, fit_prior=True),
        KNeighborsClassifier(n_neighbors=2)]

for clf in clfs:
  the_model3(clf)
  print()

Precision:   0.68
Recall:   0.68
F1-measure:   0.68
Accuracy:   0.68

Precision:   0.74
Recall:   0.73
F1-measure:   0.73
Accuracy:   0.73

Precision:   0.61
Recall:   0.61
F1-measure:   0.60
Accuracy:   0.61

Precision:   0.67
Recall:   0.66
F1-measure:   0.66
Accuracy:   0.66

Precision:   0.66
Recall:   0.63
F1-measure:   0.63
Accuracy:   0.63



# Boosting

In [0]:
from sklearn.ensemble import AdaBoostClassifier
def the_model4(base_clf):
  clf = AdaBoostClassifier(base_estimator=base_clf, n_estimators=20)

  pipeline = Pipeline([
      ('vect', CountVectorizer(analyzer='word', max_features=450)),
      ('tfidf', TfidfTransformer(sublinear_tf=True)),
      ('to_dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
      ('clf', clf),
      ])
  voting = pipeline.fit(X_train, y_train)
  predictions = pipeline.predict(X_test)
  print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
  print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
  print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
  print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))

In [189]:
clfs = [LogisticRegression(multi_class='multinomial',
                          solver='lbfgs',
                          random_state=1,
                          max_iter=300),
        ExtraTreesClassifier(n_estimators=50,
                            random_state=1,
                            criterion='entropy',
                            max_depth=30,
                            min_samples_split=3),
        GaussianNB(),
        MultinomialNB(alpha=0.1, fit_prior=True),
        DecisionTreeClassifier(criterion='entropy', max_depth=1)]

for clf in clfs:
  the_model4(clf)
  print()

Precision:   0.70
Recall:   0.54
F1-measure:   0.50
Accuracy:   0.55

Precision:   0.72
Recall:   0.71
F1-measure:   0.71
Accuracy:   0.71

Precision:   0.46
Recall:   0.44
F1-measure:   0.43
Accuracy:   0.43

Precision:   0.66
Recall:   0.64
F1-measure:   0.65
Accuracy:   0.64

Precision:   0.61
Recall:   0.60
F1-measure:   0.60
Accuracy:   0.60

