In [1]:
import os
import re
import sklearn
import numpy as np
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, BaggingClassifier, AdaBoostClassifier, VotingClassifier, VotingRegressor
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn import svm
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import sparse
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import Doc2Vec

# Classification

# Tfidf and count vectorization

In [10]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
test = pd.read_csv('../csv/test108.csv')

X_train_original = np.array(df['whole_text'])
y_train_original = np.array(df['labels'])
y_train_original_mmse = np.array(df['mmse'])

X_test_original = np.array(test['whole_text'])
test_ids = np.array(test['ID'])

In [11]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-tfidf-rf.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0
 1 1 0 0 1 0 1 0 1 0 0]
23


In [12]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-countvectorizer-rf.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 1 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 1 0 0 1 0 1 0 1 0 0]
23


In [13]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = BernoulliNB()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-tfidf-nb.txt', index=False, sep=';')

[0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 1 0 0 1 0 1 0 0 0 0]
18


In [14]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = BernoulliNB()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-countvectorizer-nb.txt', index=False, sep=';')

[0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 1 0 0 1 0 1 0 0 0 0]
18


In [15]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = MLPClassifier(hidden_layer_sizes=(1024,), max_iter=1000)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-tfidf-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0
 0 0 0 0 1 1 1 0 0 0 0]
19


In [16]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original

X_test = bow.transform(X_test_original)

clf_final = MLPClassifier(hidden_layer_sizes=(1024,), max_iter=1000)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-countvectorizer-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 1 1 1 0
 1 1 0 1 1 1 1 0 1 0 0]
28


# Embedding models

In [2]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
test = pd.read_csv('../csv/test108.csv')

X_train_original = np.array(df['doc_text'])
y_train_original = np.array(df['labels'])
y_train_original_mmse = np.array(df['mmse'])

X_test_original = np.array(test['doc_text'])
test_ids = np.array(test['ID'])

In [18]:
#word2vec
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original
X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [19]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-word2vec-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-word2vec-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0
 1 0 0 1 0 0 1 0 0 0 0]
22
[0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
20


In [20]:
#word2vec
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [21]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-word2vec-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-word2vec-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
21
[0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
19


In [22]:
#glove
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/glove.6B.100d.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original

X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [23]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-glove-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-glove-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
19
[0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
20


In [24]:
#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.6B.100d.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [26]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-glove-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(512, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-glove-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0
 1 0 0 1 0 0 1 0 0 0 0]
21
[0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
17


In [27]:
#fasttext
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original
X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [28]:
scoring = {
        'acc':'accuracy',
        'f1':'f1_macro'
    }
clf = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(512, ))
res = cross_validate(clf, X_train, y_train, cv=10,  return_train_score=True, scoring=scoring) 
acc = res['test_acc']
f1 = res['test_f1']
print("mlp Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std()))
print("mlp f1: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std()))

mlp Accuracy: 0.82 (+/- 0.08)
mlp f1: 0.82 (+/- 0.08)


In [29]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-fasttext-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-fasttext-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
18
[0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
20


In [30]:
#fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original 

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [31]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-fasttext-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-fasttext-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
18
[0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 1 0 0 1 0 0 1 0 0 0 0]
19


In [3]:
#elmo
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original

X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))





HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [4]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-elmo-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-elmo-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
21
[0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
20


In [5]:
#elmo
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original
x_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))





HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [6]:
clf_final = RandomForestClassifier(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(x_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-elmo-rf.txt', index=False, sep=';')

clf_final = MLPClassifier(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(x_test)
print(y_pred)
print(sum(y_pred))
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-classif-weighted-elmo-mlp.txt', index=False, sep=';')

[0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
19
[0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]
18


In [None]:
[0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 0 0
 0 0 0 1 0 0 1 0 0 0 0]

# Regression

# Tfidf and count vectorization

In [7]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
test = pd.read_csv('../csv/test108.csv')

X_train_original = np.array(df['whole_text'])
y_train_original = np.array(df['labels'])
y_train_original_mmse = np.array(df['mmse'])

X_test_original = np.array(test['whole_text'])
test_ids = np.array(test['ID'])

In [8]:
print("mean of training data:",y_train_original_mmse.mean())
print("var of training data:", y_train_original_mmse.var())

mean of training data: 23.083333333333332
var of training data: 51.46527777777778


In [9]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = RandomForestRegressor(n_estimators=1000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-tfidf-rf.txt', index=False, sep=';')

[21.028 19.363 19.379 21.357 20.282 13.489 19.515 19.188 21.132 19.104
 20.781 18.218 19.651 19.666 20.498 17.434 20.412 20.774 22.161 18.456
 20.034 17.858 15.045 21.812 22.034 19.081 21.415 20.154 17.326 20.037
 17.851 20.393 14.796 20.969 19.87  19.257 21.25  19.012 19.786 19.124
 21.877 18.989 19.948 18.995 20.976 19.357 18.392 20.437]
mean: 19.540895833333334  var: 3.1395066349826384


In [10]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = MLPRegressor(hidden_layer_sizes=(1024,), max_iter=1000)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-tfidf-mlp.txt', index=False, sep=';')

[23.22396607 24.05393434 22.99739546 24.20423308 20.59984462 17.44973396
 30.29208134 25.40182672 23.7587051  20.10167448 21.70227804 21.7181237
 27.78019898 23.73746153 25.0745588  22.549161   22.21114266 25.99136838
 25.09543138 20.13808709 21.8277838  22.05911662 18.69088782 24.43719651
 27.13721625 21.652904   26.01512141 22.89185579 20.88405105 25.4039164
 20.15083167 21.13407986 18.68578428 24.57261269 19.98092132 22.87935821
 26.02129916 22.81869947 21.69249964 24.33764775 24.52426373 22.07274288
 22.96183247 20.28962485 23.36127444 23.85981635 24.64285482 24.98236804]
mean: 23.08437016778949  var: 6.121295670282696


In [11]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = LinearRegression()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-tfidf-lr.txt', index=False, sep=';')

[25.45048259 31.2002547  27.79348385 27.24813238 22.74220967  9.09288567
 29.62031917 25.34405974 29.48622394 19.79578578 24.6649664  21.54673566
 30.82866471 26.17315584 29.85774169 23.47093252 24.74345061 30.79619198
 32.45895015 20.42783572 27.14198488 17.03482363 15.37344784 27.58285553
 26.05819165 22.83160719 30.99531989 18.29557837 22.24466674 26.53896752
 21.25297896 20.28623748 13.41358013 25.42263734 18.90478022 22.16515892
 31.42422788 25.90309997 27.65201267 25.66989985 28.10567905 21.30092479
 21.76888914 21.70184226 26.0247695  25.48577556 28.98748467 27.92034687]
mean: 24.58812981785864  var: 24.032193215833303


In [12]:
bow = TfidfVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = svm.SVR()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-tfidf-svm.txt', index=False, sep=';')

[28.62102418 28.85136733 28.18430809 28.68693069 25.89420067 22.32616727
 27.99209374 26.71766642 28.01547958 25.77394128 26.39011551 24.81147093
 28.72097246 26.74107236 28.21502382 26.42931075 26.19883333 27.60253902
 29.14707612 25.29056168 28.64828231 24.17393935 23.80508594 27.47280757
 27.18020311 26.62095304 28.5561051  25.21536779 23.69195684 27.57086782
 24.84397541 25.87341929 23.85977033 27.37234849 25.67971208 26.208984
 27.75939007 25.99967004 26.5403426  28.05341507 27.24221142 26.34914547
 25.66978492 26.53267205 28.18534826 26.7339995  27.63412157 26.39244761]
mean: 26.67659338058003  var: 2.386795739565862


In [13]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = RandomForestRegressor(n_estimators=1000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-countvectorizer-rf.txt', index=False, sep=';')

[19.133 20.418 20.613 21.255 19.378 13.794 20.874 19.724 20.231 18.771
 20.55  19.452 22.059 20.576 21.081 17.482 19.374 21.335 21.963 19.25
 20.542 19.804 14.929 21.737 21.197 18.112 22.17  19.167 19.355 19.456
 19.181 19.31  16.684 21.058 19.207 20.383 21.459 19.231 19.575 19.605
 21.499 19.271 20.018 19.956 20.295 20.842 19.214 19.844]
mean: 19.800291666666666  var: 2.563135831597222


In [14]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = MLPRegressor((1024,), max_iter=1000)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-countvectorizer-mlp.txt', index=False, sep=';')

[17.02060887 27.77065713 26.65304977 35.60660183 12.87946382 10.19295297
 62.80976709 42.21673482 15.15356409 16.54960387 11.45195894 12.06703264
 33.22582942 33.60642784 39.5544455  27.01194678 20.44297592 31.82155516
 27.17651423 14.49876755 20.25894888 18.92892996 20.4125328  30.98335291
 30.71942743 23.81837339 28.44753243 33.38159316 12.45811652 20.37066125
 16.73057761 16.36881607 14.36422191 29.09821569 13.58474064 17.57067747
 23.79406631 18.74363387 17.23166786 19.55668189 17.92859105 18.08809421
 23.67032715 15.04989582 18.3926459  22.55594181 27.86082262 17.0691623 ]
mean: 23.023931399049804  var: 93.11854104445148


In [15]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

X_test = bow.transform(X_test_original)

clf_final = LinearRegression()
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-countvectorizer-lr.txt', index=False, sep=';')

[23.33356664 30.30877072 25.14285005 29.87797459 23.72013165  4.49604871
 50.10883928 36.58370541 24.66703337 21.82643761 23.20873106 20.76764368
 30.57355268 19.04694817 26.51084288 27.67155858 25.29602057 37.16677116
 31.39899152 25.47671442 27.66913108 16.14387125 15.66022711 32.92446057
 27.17201727 21.58217886 30.93322729 14.50440104 20.7648871  27.04196957
 21.59990188 16.56096341  9.0412114  28.37892582 21.24935221 20.62159046
 31.05463716 22.630284   27.82129365 21.51293842 24.41476518 18.50979042
 24.03895025 21.34896734 20.93082672 21.31751705 30.8382623  23.14663882]
mean: 24.512423341723927  var: 52.39346676475787


In [16]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

regr = LinearRegression()
scores = cross_val_score(regr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=10)
print("linear regression RMSE:",-scores.mean())
print()

linear regression RMSE: 5.287097678521727



In [19]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train_original)
y_train = y_train_original_mmse

regr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
scores = cross_val_score(regr, X_train, y_train, scoring='neg_root_mean_squared_error', cv=10)
print("linear regression RMSE:",-scores.mean())
print()

linear regression RMSE: 3.47084324459179



# Embedding models

In [2]:
df = pd.read_csv('../csv/train108.csv').sample(frac=1)
test = pd.read_csv('../csv/test108.csv')

X_train_original = np.array(df['whole_text'])
y_train_original = np.array(df['labels'])
y_train_original_mmse = np.array(df['mmse'])

X_test_original = np.array(test['whole_text'])
test_ids = np.array(test['ID'])

In [21]:
#word2vec
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original_mmse
X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [22]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-word2vec-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-word2vec-mlp.txt', index=False, sep=';')

[27.6278 26.22   26.3331 27.2032 24.1251 13.0482 27.0483 25.307  26.7375
 22.2441 22.0749 18.9017 26.3544 26.2076 27.4268 20.9427 18.8964 27.372
 28.7723 16.7413 28.3678 15.8033 17.2518 28.0524 28.5875 23.3665 28.3071
 20.1072 23.6839 26.8731 17.4093 19.6079 15.952  27.5747 21.7941 22.3015
 26.6356 24.7533 23.2642 28.5621 22.4904 25.9641 22.1362 22.7265 27.5145
 23.177  27.3725 23.5092]
mean: 23.806877083333333  var: 16.085943473016496
[15.74325671 31.04541919 32.74391188 28.20660022 22.67949258  7.09400227
 28.80457641 23.79914471 18.47643953 27.28219289 14.03827422 18.14358057
 30.55176898 23.80664554 31.10383242 20.31030635 19.36395828 34.56050833
 28.88161841 13.78313287 26.99479905 17.98936802 13.87980854 35.95629771
 27.74627119 27.33588776 30.90104511 13.13633708 25.93098223 30.28798822
 14.93916807 15.62325822 16.37871663 26.39369753 23.67793276 26.32665889
 32.00666093 19.92615949 29.6526657  25.2428434  24.18581003 20.09259741
 23.09319371 18.17654442 23.04596168 23.85895882 

In [23]:
#word2vec
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/GoogleNews-vectors-negative300.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original_mmse

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [24]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-word2vec-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-word2vec-mlp.txt', index=False, sep=';')

[26.5109 28.2381 26.8282 27.9719 20.2253 14.6821 27.2086 20.1042 26.3847
 21.3049 22.942  22.5017 27.8883 27.4986 28.3613 21.1772 20.4582 27.3992
 28.6358 17.4854 27.0657 15.7811 16.7285 26.727  28.0187 26.9211 27.5214
 22.8449 20.8939 26.853  16.9217 23.5316 15.9747 28.5145 19.4344 24.7141
 26.8025 20.6879 24.1911 27.9781 23.8925 25.8035 23.7391 23.8995 27.7421
 26.0449 28.7561 23.3184]
mean: 24.0647625  var: 15.881278873593748
[20.50022818 32.71644168 33.15748628 31.10351373 19.9636521   9.84109367
 29.94851111 22.95430499 26.50620503 20.05670303 11.78195482 23.47373142
 29.77050752 20.82207457 33.76100016 17.9304     24.33703091 33.45544441
 31.00273059 17.89621457 26.00744913 15.07427893 10.53785147 34.05879133
 25.59532981 28.37586042 32.47773226 13.67446111 25.0163853  36.23921623
 15.80843832 17.86926602 18.62645422 26.69970549 23.79896235 27.71179146
 33.53673152 23.01946188 26.26140898 26.82082875 24.49833262 21.7576383
 19.37124013 19.9305141  21.8401854  27.80221852 28.67743

In [25]:
#glove
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/glove.6B.100d.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original_mmse

X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [28]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-glove-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=50000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-glove-mlp.txt', index=False, sep=';')

[27.0125 28.206  26.4107 28.4232 22.9973 13.5031 28.1776 27.5729 26.7951
 26.2245 25.5131 16.6334 27.3635 27.1559 27.5733 22.2387 19.1529 27.056
 29.0302 16.6753 28.2167 17.7264 20.9431 29.0126 28.9777 25.0294 28.4659
 19.827  18.0718 27.4169 16.6081 19.9919 16.9277 28.8946 22.984  23.8999
 27.2927 24.4495 22.2257 27.7036 23.0559 26.8565 22.5759 23.0408 27.291
 26.6247 27.9479 24.7239]
mean: 24.38535416666667  var: 17.20319508456597
[30.39428487 29.57087411 25.81028037 30.40474933 17.35494322  7.46016762
 28.04104471 19.92157292 29.4227466  28.51583928 14.28217523 31.88434866
 32.70715054 29.54680075 40.61341111 20.97347064 21.65414812 31.61558185
 44.80028695 26.4643519  26.01638596 13.6111706  25.5503585  37.41915406
 30.92031298 25.7124544  42.68457324 21.73770458 21.73829536 25.08361112
 21.33091776 24.65118849 21.48056689 28.15595291 22.1233893  28.32579147
 41.22714742 32.56921067 23.71465954 23.56379909 16.49452024 25.24772164
 27.58398176 15.42517645 33.01164812 27.25167356 36.

In [29]:
#glove
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/glove.6B.100d.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original_mmse

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [31]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-glove-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=20000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-glove-mlp.txt', index=False, sep=';')

[25.4628 27.9056 27.1876 27.2621 19.4166 11.6513 28.5818 26.1225 25.5107
 24.2419 26.6661 15.1175 27.3214 27.7269 27.22   22.6998 19.3509 27.0267
 28.3007 18.5764 26.8666 19.3311 18.4755 27.9357 28.6537 26.2654 28.4165
 21.5433 17.2156 27.5782 15.9002 19.6847 16.2257 27.142  21.6482 22.3155
 27.917  21.1772 21.166  27.0156 24.5154 27.0464 25.2983 21.0292 26.932
 24.7721 28.7546 25.4005]
mean: 23.94940625  var: 18.446534070585937
[22.12889726 26.99771831 28.6250016  21.27404831 15.75234851 19.60183595
 26.78215693 18.93720627 37.88929521 20.01667253  6.22256281 40.04826455
 32.36032004 34.80563599 37.29670835 18.22658013 19.03748543 24.88746015
 46.65748825 34.48092104 25.20345991 13.11716749 26.43041364 34.32381557
 37.74293818 29.13466935 52.22082197 14.93804379 10.84782033 16.19180913
 10.98891698 31.43789385 29.0041963  32.59742885 21.17222264 29.77205378
 42.9089373  25.89606585 21.53272163 28.52717255 33.32080475 18.31188316
 21.7704765  16.8185991  45.97937311 30.94304028 31.7108



In [32]:
#fasttext
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y = y_train_original_mmse
X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [33]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-fasttext-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=50000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-fasttext-mlp.txt', index=False, sep=';')

[27.2487 28.396  27.2943 27.8541 21.9934 11.784  27.7404 24.965  25.9957
 22.9048 24.0456 19.0577 27.5824 26.093  28.1004 24.7665 17.7207 26.726
 28.7578 22.0197 27.6576 17.8168 18.4451 28.8464 28.587  25.8902 28.8916
 18.6211 21.4525 26.2319 17.3329 20.6039 15.3619 28.3752 21.8633 26.13
 26.002  20.6207 25.8764 27.9508 24.4974 22.4966 25.1874 23.0558 26.3878
 25.585  28.3105 25.4683]
mean: 24.26233958333333  var: 16.25347234030816
[23.13114349 28.52304778 27.80231483 25.03670623 25.21613991  6.41413038
 32.89206393 26.78212229 19.1825563  25.36874932 16.75650813 17.90969317
 33.4415994  28.55872432 32.94699962 17.91990757 13.72156386 36.39871687
 34.23622667 11.27591043 27.74516503 13.15907774 16.82477136 31.87376862
 28.90455912 23.89135762 35.02076465 15.663727   18.6481814  20.75624922
 10.43526458 18.81820205  8.02159474 28.00280289 13.71913955 24.557964
 31.82310875 19.27138902 24.2673708  25.82136846 18.78633979 18.7379787
 23.01829942 12.14864266 26.69506146 18.46636221 34.7874

In [34]:
#fasttext
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/wiki-news-300d-1M.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original_mmse

X_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [36]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-fasttext-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=50000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-fasttext-mlp.txt', index=False, sep=';')

[27.6347 28.4943 27.5656 27.5597 18.6418 13.0903 27.0444 24.3485 25.3618
 25.4244 23.149  19.2554 28.1654 27.2258 28.5095 22.2691 17.5828 27.3321
 28.9878 20.9583 28.0252 20.3312 19.3579 27.2327 28.3061 24.8461 28.683
 18.9421 21.3008 24.9903 17.5516 22.3339 13.9414 27.8987 21.5222 25.1113
 25.9366 22.1267 21.1187 27.8389 24.8009 23.0535 26.5647 24.2066 27.5814
 27.1709 28.4707 25.9943]
mean: 24.24664791666667  var: 15.994276556662326
[21.67704723 30.87036942 32.05482924 23.85783799 21.35597667  8.72025815
 30.85138984 23.48113909 22.83846036 27.7425067  22.4212616  21.00079719
 31.18380117 27.91639741 28.60518028 18.24228392 16.68781585 35.46283501
 29.38575315 12.32788126 23.9386614  17.29170068 18.49076789 29.19936383
 26.37532461 24.23839507 32.04106901 16.29864057 18.20951537 22.23677731
 12.48193781 25.85612636  8.75895796 22.60886925  9.87462211 26.22616188
 26.9214138  17.93845484 18.12187213 24.5767461  22.9516492  19.60345933
 24.52515898 13.20885495 25.50529129 19.73448775 3

In [3]:
#elmo
from tqdm import tqdm_notebook
from nltk import word_tokenize
from pymagnitude import *


glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")
def avg_glove(x):
    vectors = []
    for title in tqdm_notebook(x):
        vectors.append(np.average(glove.query(word_tokenize(title)), axis = 0))
    return np.array(vectors)

X_train = avg_glove(X_train_original)
y_train = y_train_original_mmse

X_test = avg_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))





HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [4]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-elmo-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(X_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-elmo-mlp.txt', index=False, sep=';')

[28.1312 27.1946 26.7422 28.1489 23.4991 11.2258 27.0739 23.5474 27.5716
 23.0396 23.6637 19.7451 27.8756 27.3296 27.6363 23.6215 17.8627 26.6088
 28.9908 19.2187 27.5717 17.508  17.1535 27.3237 27.6698 23.7393 28.8373
 18.6968 19.5908 26.1295 16.8099 18.2889 15.516  28.562  21.5781 28.1392
 27.1082 20.093  24.1373 28.2391 26.5928 26.5634 25.3138 21.2452 27.7015
 25.0368 27.776  24.2731]
mean: 24.081704166666665  var: 18.436942899565974
[25.91492558 32.67417498 31.42840083 28.66325568 24.45810121 10.19165275
 28.95426673 27.16058975 24.03318951 22.71585687 22.61923058 22.71810554
 38.01581625 24.14930762 30.16858462 22.6027473  17.6995533  33.86894929
 31.47215703 20.58383381 24.33339015 16.02535936 13.87984919 33.32007466
 23.99039483 23.68294548 30.19035181 13.28151324 21.55058049 28.13690295
 10.70353019 18.38052156  4.09703198 29.86608636 12.87824439 28.61192671
 27.61973771 27.56477469 25.40775122 25.46572729 25.9252139  22.54842856
 20.19938166 15.16737175 25.42100347 28.712247  

In [3]:
#elmo
from sklearn.feature_extraction.text import TfidfVectorizer
glove = Magnitude("../downloads/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude")

x = X_train_original

tfidf = TfidfVectorizer()
tfidf.fit(x)
# Now lets create a dict so that for every word in the corpus we have a corresponding IDF value
idf_dict = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# Same as Avg Glove except instead of doing a regular average, we'll use the IDF values as weights.
def tfidf_glove(df):
    vectors = []
    for title in tqdm_notebook(df):
        glove_vectors = glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis = 0, weights = weights))
    return np.array(vectors)
X_train = tfidf_glove(X_train_original)
y_train = y_train_original_mmse
x_test = tfidf_glove(X_test_original)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=108.0), HTML(value='')))

  index_range = sequence_lengths.new_tensor(torch.arange(0, len(sequence_lengths)))





HBox(children=(FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [5]:
clf_final = RandomForestRegressor(n_estimators=10000,n_jobs=-1)
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(x_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-elmo-rf.txt', index=False, sep=';')

clf_final = MLPRegressor(n_iter_no_change=50, max_iter=10000,hidden_layer_sizes=(1024, ))
clf_final.fit(X_train, y_train)
y_pred = clf_final.predict(x_test)
print(y_pred)
print("mean:",y_pred.mean()," var:", y_pred.var())
result = pd.DataFrame(data = {'ID':test_ids, 'Prediction':y_pred})
result.to_csv('../csv/results/test_results-regression-weighted-elmo-mlp.txt', index=False, sep=';')

[27.3706 27.8041 26.4957 26.578  25.1518 11.629  27.5352 24.3447 27.1
 24.0636 24.5109 19.1744 27.63   27.8045 26.975  23.0581 17.7163 27.5261
 28.2862 17.0293 26.5085 18.9476 18.6038 27.7584 27.7218 23.9221 28.2839
 21.5488 18.6877 24.9471 17.0193 18.6276 15.0573 28.6642 19.9991 26.9557
 27.4372 22.143  24.6933 27.5088 26.5127 26.2415 24.4385 20.6836 26.4136
 25.78   27.3901 25.7366]
mean: 24.083652083333334  var: 16.98614676207899
[26.02010737 32.37059581 28.58815032 29.2989035  26.09187233  8.69713476
 28.14676967 27.45226733 30.424902   20.68079218 22.04863991 25.10128845
 37.70410736 25.2344056  31.38109564 19.36548801 20.84476827 37.142973
 33.20749251 17.29680202 25.44091692 13.6805731  13.68048228 32.68033684
 21.50557052 21.83260105 32.56148248 14.00650101 20.22172539 31.98072562
 11.48238156 22.16025642  7.38584751 26.48753845 13.0698846  25.97065769
 29.29868857 26.97443311 24.60265832 28.48376486 28.58258679 19.85156649
 19.70058061 13.75173517 26.13457463 27.4496626  33.39