In [1]:
import os
import re
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, BaggingClassifier, AdaBoostClassifier, VotingClassifier, VotingRegressor
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import sparse
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Doc2Vec

In [2]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
docs = df['doc_text']
y = df['labels']
X = docs

In [5]:
def get_results(X,y):
    values = {}
    scoring = {
        'acc':'accuracy',
        'f1':'f1_macro'
    }
    clf = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(512, ))
    res = cross_validate(clf, X, y, cv=10,  return_train_score=True, scoring=scoring) 
    acc = res['test_acc']
    f1 = res['test_f1']
    print("mlp Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
    print("mlp f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
    print()
    values['mlp_acc'] = acc.mean()
    values['mlp_f1'] = f1.mean()

    clf = BernoulliNB()
    res = cross_validate(clf, X, y, cv=10,  return_train_score=True, scoring=scoring)  
    acc = res['test_acc']
    f1 = res['test_f1']
    print("NB Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
    print("NB f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
    print()
    values['nb_acc'] = acc.mean()
    values['nb_f1'] = f1.mean()

    
    clf = LogisticRegression(max_iter=1000)
    res = cross_validate(clf, X, y, cv=10,  return_train_score=True, scoring=scoring) 
    acc = res['test_acc']
    f1 = res['test_f1']
    print("logistic reg Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
    print("logistic reg f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
    print()
    values['lr_acc'] = acc.mean()
    values['lr_f1'] = f1.mean()

    
    clf = svm.SVC(kernel='poly', degree=2)
    res = cross_validate(clf, X, y, cv=10,  return_train_score=True, scoring=scoring)  
    acc = res['test_acc']
    f1 = res['test_f1']
    print("svm Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
    print("svm f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
    print()
    values['svm_acc'] = acc.mean()
    values['svm_f1'] = f1.mean()

    
    clf = RandomForestClassifier(n_estimators=1000,n_jobs=-1)
    res = cross_validate(clf, X, y, cv=10,  return_train_score=True, scoring=scoring)  
    acc = res['test_acc']
    f1 = res['test_f1']
    print("random forest Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
    print("random forest f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))
    print()
    values['rf_acc'] = acc.mean()
    values['rf_f1'] = f1.mean()

    return values
    
    

In [6]:
def get_reg_results(X, y):
    regr = LinearRegression(normalize=True)
#     res = cross_validate(clf, X, y, cv=10)
    scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=10)
    print("linear regression RMSE:",-scores.mean())
    print()
    
    regr = RandomForestRegressor()
#     res = cross_validate(clf, X, y, cv=10) 
    scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=10)
    print("Random Forest reg RMSE:",-scores.mean())
    print()
    
    
    from sklearn import svm
    regr = svm.SVR()
#     res = cross_validate(clf, X, y, cv=10) 
    scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=10)
    print("SVM RMSE:",-scores.mean())
    print()
    
    regr = MLPRegressor(max_iter=10000,learning_rate='constant')
#     res = cross_validate(clf, X, y, cv=10) 
    scores = cross_val_score(regr, X, y, scoring='neg_root_mean_squared_error', cv=10)
    print("MLP RMSE:",-scores.mean())
    print()

In [15]:
#word2vec
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.82 (+/- 0.09)
mlp f1: 0.82 (+/- 0.09)

NB Accuracy: 0.75 (+/- 0.09)
NB f1: 0.74 (+/- 0.09)

logistic reg Accuracy: 0.74 (+/- 0.08)
logistic reg f1: 0.74 (+/- 0.08)

svm Accuracy: 0.82 (+/- 0.10)
svm f1: 0.82 (+/- 0.10)

random forest Accuracy: 0.82 (+/- 0.09)
random forest f1: 0.82 (+/- 0.09)



{'mlp_acc': 0.8227272727272729,
 'mlp_f1': 0.8171922521922523,
 'nb_acc': 0.7481818181818182,
 'nb_f1': 0.7423265623265622,
 'lr_acc': 0.739090909090909,
 'lr_f1': 0.7378787878787879,
 'svm_acc': 0.8218181818181819,
 'svm_f1': 0.8182822732822734,
 'rf_acc': 0.8227272727272729,
 'rf_f1': 0.8207264957264957}

In [6]:
#word2vec
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.83 (+/- 0.07)
mlp f1: 0.83 (+/- 0.08)

NB Accuracy: 0.76 (+/- 0.12)
NB f1: 0.75 (+/- 0.12)

logistic reg Accuracy: 0.79 (+/- 0.12)
logistic reg f1: 0.78 (+/- 0.12)

svm Accuracy: 0.81 (+/- 0.09)
svm f1: 0.80 (+/- 0.09)

random forest Accuracy: 0.81 (+/- 0.12)
random forest f1: 0.80 (+/- 0.12)



{'mlp_acc': 0.8327272727272726,
 'mlp_f1': 0.82519314019314,
 'nb_acc': 0.759090909090909,
 'nb_f1': 0.7549550449550448,
 'lr_acc': 0.7863636363636364,
 'lr_f1': 0.7835372960372962,
 'svm_acc': 0.8054545454545454,
 'svm_f1': 0.8036557886557885,
 'rf_acc': 0.8054545454545454,
 'rf_f1': 0.8035120435120435}

In [7]:
#glove
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/glove.6B.100d.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.80 (+/- 0.11)
mlp f1: 0.79 (+/- 0.11)

NB Accuracy: 0.67 (+/- 0.08)
NB f1: 0.65 (+/- 0.09)

logistic reg Accuracy: 0.74 (+/- 0.13)
logistic reg f1: 0.74 (+/- 0.13)

svm Accuracy: 0.76 (+/- 0.14)
svm f1: 0.76 (+/- 0.15)

random forest Accuracy: 0.82 (+/- 0.12)
random forest f1: 0.81 (+/- 0.13)



{'mlp_acc': 0.7963636363636364,
 'mlp_f1': 0.7895773670773669,
 'nb_acc': 0.6672727272727274,
 'nb_f1': 0.6512873237873238,
 'lr_acc': 0.74,
 'lr_f1': 0.7364685314685314,
 'svm_acc': 0.759090909090909,
 'svm_f1': 0.7558624708624707,
 'rf_acc': 0.8154545454545454,
 'rf_f1': 0.8125213675213676}

In [8]:
#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.6B.100d.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.84 (+/- 0.10)
mlp f1: 0.84 (+/- 0.11)

NB Accuracy: 0.69 (+/- 0.15)
NB f1: 0.69 (+/- 0.15)

logistic reg Accuracy: 0.70 (+/- 0.10)
logistic reg f1: 0.70 (+/- 0.10)

svm Accuracy: 0.79 (+/- 0.15)
svm f1: 0.78 (+/- 0.15)

random forest Accuracy: 0.81 (+/- 0.13)
random forest f1: 0.81 (+/- 0.13)



{'mlp_acc': 0.8427272727272725,
 'mlp_f1': 0.8371750471750472,
 'nb_acc': 0.6936363636363636,
 'nb_f1': 0.6864485514485514,
 'lr_acc': 0.7027272727272728,
 'lr_f1': 0.6969355644355644,
 'svm_acc': 0.7872727272727273,
 'svm_f1': 0.7838691863691863,
 'rf_acc': 0.8136363636363635,
 'rf_f1': 0.8122649572649572}

In [9]:
#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.840B.300d.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.80 (+/- 0.06)
mlp f1: 0.79 (+/- 0.06)

NB Accuracy: 0.82 (+/- 0.09)
NB f1: 0.81 (+/- 0.09)

logistic reg Accuracy: 0.75 (+/- 0.13)
logistic reg f1: 0.75 (+/- 0.13)

svm Accuracy: 0.80 (+/- 0.12)
svm f1: 0.79 (+/- 0.12)

random forest Accuracy: 0.84 (+/- 0.12)
random forest f1: 0.84 (+/- 0.13)



{'mlp_acc': 0.7954545454545455,
 'mlp_f1': 0.7883888333888335,
 'nb_acc': 0.8154545454545454,
 'nb_f1': 0.8136033411033411,
 'lr_acc': 0.75,
 'lr_f1': 0.7456615606615606,
 'svm_acc': 0.7963636363636364,
 'svm_f1': 0.7939588189588189,
 'rf_acc': 0.8418181818181818,
 'rf_f1': 0.8392385392385391}

In [10]:
#fasttext
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.86 (+/- 0.11)
mlp f1: 0.86 (+/- 0.11)

NB Accuracy: 0.83 (+/- 0.12)
NB f1: 0.83 (+/- 0.12)

logistic reg Accuracy: 0.77 (+/- 0.12)
logistic reg f1: 0.76 (+/- 0.12)

svm Accuracy: 0.81 (+/- 0.14)
svm f1: 0.80 (+/- 0.14)

random forest Accuracy: 0.84 (+/- 0.13)
random forest f1: 0.84 (+/- 0.13)



{'mlp_acc': 0.860909090909091,
 'mlp_f1': 0.8577580752580752,
 'nb_acc': 0.8336363636363636,
 'nb_f1': 0.8322649572649572,
 'lr_acc': 0.7681818181818182,
 'lr_f1': 0.7631152181152181,
 'svm_acc': 0.8054545454545454,
 'svm_f1': 0.8038383838383838,
 'rf_acc': 0.8418181818181818,
 'rf_f1': 0.8406487956487956}

In [11]:
#fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


mlp Accuracy: 0.83 (+/- 0.08)
mlp f1: 0.83 (+/- 0.08)

NB Accuracy: 0.81 (+/- 0.15)
NB f1: 0.81 (+/- 0.15)

logistic reg Accuracy: 0.78 (+/- 0.12)
logistic reg f1: 0.77 (+/- 0.12)

svm Accuracy: 0.79 (+/- 0.12)
svm f1: 0.79 (+/- 0.12)

random forest Accuracy: 0.81 (+/- 0.14)
random forest f1: 0.81 (+/- 0.15)



{'mlp_acc': 0.8327272727272728,
 'mlp_f1': 0.8280142080142079,
 'nb_acc': 0.8136363636363637,
 'nb_f1': 0.8125679875679875,
 'lr_acc': 0.7763636363636364,
 'lr_f1': 0.774067599067599,
 'svm_acc': 0.7945454545454546,
 'svm_f1': 0.792082362082362,
 'rf_acc': 0.8127272727272727,
 'rf_f1': 0.8113636363636363}

In [12]:
#elmo
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))



mlp Accuracy: 0.81 (+/- 0.09)
mlp f1: 0.81 (+/- 0.09)

NB Accuracy: 0.79 (+/- 0.13)
NB f1: 0.79 (+/- 0.14)

logistic reg Accuracy: 0.82 (+/- 0.12)
logistic reg f1: 0.82 (+/- 0.12)

svm Accuracy: 0.81 (+/- 0.14)
svm f1: 0.81 (+/- 0.14)

random forest Accuracy: 0.82 (+/- 0.13)
random forest f1: 0.82 (+/- 0.13)



{'mlp_acc': 0.8145454545454545,
 'mlp_f1': 0.8115112665112664,
 'nb_acc': 0.7881818181818182,
 'nb_f1': 0.7856099456099456,
 'lr_acc': 0.8236363636363636,
 'lr_f1': 0.822012432012432,
 'svm_acc': 0.8145454545454545,
 'svm_f1': 0.8126029526029525,
 'rf_acc': 0.8236363636363636,
 'rf_f1': 0.8224747474747474}

In [13]:
#elmo
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_results(x,y)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))



mlp Accuracy: 0.85 (+/- 0.08)
mlp f1: 0.85 (+/- 0.08)

NB Accuracy: 0.79 (+/- 0.14)
NB f1: 0.78 (+/- 0.14)

logistic reg Accuracy: 0.85 (+/- 0.10)
logistic reg f1: 0.85 (+/- 0.10)

svm Accuracy: 0.83 (+/- 0.12)
svm f1: 0.83 (+/- 0.13)

random forest Accuracy: 0.84 (+/- 0.14)
random forest f1: 0.84 (+/- 0.15)



{'mlp_acc': 0.850909090909091,
 'mlp_f1': 0.8478163503163503,
 'nb_acc': 0.7854545454545454,
 'nb_f1': 0.7833255633255634,
 'lr_acc': 0.8509090909090908,
 'lr_f1': 0.8478671328671329,
 'svm_acc': 0.8327272727272726,
 'svm_f1': 0.8300038850038849,
 'rf_acc': 0.8427272727272728,
 'rf_f1': 0.8402564102564101}

In [14]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
docs = df['doc_text']
y = df['mmse']
X = docs

In [15]:
#word2vec
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_reg_results(x,y)

#word2vec
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_reg_results(x,y)

#glove
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/glove.6B.100d.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_reg_results(x,y)

#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.6B.100d.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_reg_results(x,y)

#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.840B.300d.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_reg_results(x,y)

#fasttext
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_reg_results(x,y)

#fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_reg_results(x,y)

#elmo
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

x = avg_glove(X)
# x_test = avg_glove(test)

get_reg_results(x,y)

#elmo
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")

x = docs

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
x = tfidf_glove(X)
# x_test = tfidf_glove(test)

get_reg_results(x,y)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 5.406521775295223

Random Forest reg RMSE: 5.234289316501789

SVM RMSE: 7.00252550820454

MLP RMSE: 5.065448647663843



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 5.149768098763781

Random Forest reg RMSE: 5.1688211668239985

SVM RMSE: 6.966570331976636

MLP RMSE: 4.728106000379271



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 15.962882994611814

Random Forest reg RMSE: 4.881603892305163

SVM RMSE: 7.36628077963842





MLP RMSE: 5.4584604422641245



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 23.92389723077356

Random Forest reg RMSE: 5.176854151995177

SVM RMSE: 7.354001145463114





MLP RMSE: 5.588874347232781



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 7.019259460628136

Random Forest reg RMSE: 5.141349213688855

SVM RMSE: 7.190926116104729

MLP RMSE: 5.620737409912522



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 5.085972760797771

Random Forest reg RMSE: 4.6988517154620215

SVM RMSE: 7.42920338887552

MLP RMSE: 4.815987303617524



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))


linear regression RMSE: 5.289475788462676

Random Forest reg RMSE: 5.245529855939969

SVM RMSE: 7.40265455156096

MLP RMSE: 4.834287443452469



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))



linear regression RMSE: 4.806956793257003

Random Forest reg RMSE: 4.738004736807781

SVM RMSE: 6.96667593701756

MLP RMSE: 7.9392103562164476



Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))



linear regression RMSE: 4.644592240580037

Random Forest reg RMSE: 4.853849857758906

SVM RMSE: 7.057633946704486

MLP RMSE: 7.8317692131751375



In [10]:
X

2      okay.there s a little boy and he s getting he ...
36     we ll start with the girl.she s going to the.h...
11     all of the action you see going on.okay.this i...
30     the children are getting into the cookie jar w...
103    well the boy on the chair stool s r is uh fall...
                             ...                        
53     alright.the little boy girl s reaching up ther...
27     hm exc 1963_5292 touching lip.raising arm.lea ...
71     um t takin g some cookies.and f fallin g over....
66     oh yes.a little girl a and the little boy is g...
67     and I will tell you what s g.oh boy.well the l...
Name: doc_text, Length: 108, dtype: object

In [14]:
# bert
from sentence_transformers import SentenceTransformer
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
docs = df['doc_text']
y = df['labels']
X = docs

In [16]:
    model = SentenceTransformer('../downloads/bert-base-nli-mean-tokens/')
    x = model.encode(X)
clf = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1000, ))
# res = cross_validate(clf, X, y, cv=10,  return_train_score=True) 
scores = cross_val_score(clf, x, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

Accuracy: 0.46 (+/- 0.12)
