In [None]:
"""
Prediction models for an imbalanced dataset using word embeddings

Step1:  Import of the pre-trained word2vec model
Step2:  Creation of mean vectors
Step3:  Division of the data into train and test sets
Step4:  Perform 10-fold Cross Validation
Step5:  Classification (Logistic Regression, CART, Naive Bayes, Linear SVM)
Step6:  Create a word2vec model from the dataset
Step7:  Creation of mean vectors
Step8:  Division of the data into train and test sets
Step9:  Perform 10-fold Cross Validation
Step10: Classification (Logistic Regression, CART, Naive Bayes, Linear SVM)
Step11: PCA and plotting of our model

"""

In [None]:
import pandas as pd
import re
import pickle as pkl
import spacy
import el_core_news_sm
nlp = el_core_news_sm.load()
import string 
import nltk
import numpy as np
import zipfile
import gensim

from gensim.models import Word2Vec
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix

from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB

In [None]:
# Load the preprocessed dataset; later cells read its `Label` and
# `NoLaughTokens_l` columns (presumably a pandas DataFrame -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code, so only load a
# Preprocessed.pkl that comes from a trusted source.
with open('Preprocessed.pkl', 'rb') as handle:
    Preprocessed = pkl.load(handle)

In [None]:
class MeanEmbeddingVectorizer(object):
    """Represent each token sequence by the mean of its word embeddings.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding, a NumPy array. All embeddings are
        expected to share one shape; the shape of an arbitrary entry is
        stored in ``self.dim``. The mapping must not be empty.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Shape tuple of a single embedding (e.g. ``(100,)``), used to build
        # the all-zero fallback vector in ``transform``.
        self.dim = next(iter(word2vec.values())).shape

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is optional, following the scikit-learn ``fit(X, y=None)``
        convention, so the vectorizer can be fitted without labels.
        """
        return self

    def transform(self, X):
        """Return one mean embedding per token sequence in ``X``.

        Words missing from the mapping are skipped. A sequence with no known
        word (including an empty sequence) is mapped to a zero vector of
        shape ``self.dim``.
        """
        fallback = [np.zeros(self.dim)]
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            # An empty list is falsy, so sequences without any known word
            # fall back to the zero vector instead of averaging nothing.
            rows.append(np.mean(known or fallback, axis=0))
        return np.array(rows)

In [None]:
from sklearn.preprocessing import LabelBinarizer
# Encode the Label column with a LabelBinarizer.
# In the cells visible here the encoded labels are only handed to the
# vectorizers' fit(), which ignores its arguments; the classifiers are trained
# on the raw Preprocessed.Label values instead.
encoder = LabelBinarizer()
transfomed_label = encoder.fit_transform(Preprocessed.Label)

In [None]:
def model_accuracy(X_train, Y_train):
    """Print 10-fold cross-validated accuracy for four baseline classifiers.

    The classifiers are Logistic Regression, a decision tree (CART), Gaussian
    Naive Bayes and a linear SVM. For each one the mean and standard deviation
    of the per-fold accuracy are printed.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, one row per sample.
    Y_train : array-like
        Target labels aligned with ``X_train``.

    Returns
    -------
    dict
        Maps each model's short name to the array of its per-fold accuracy
        scores, so callers can compare or plot them.
    """
    models = [
        ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', LinearSVC()),
    ]

    seed = 7
    scoring = 'accuracy'

    # random_state only matters when shuffle=True; scikit-learn >= 0.24 raises
    # ValueError when it is set while shuffle is left at False. With an integer
    # seed the splitter yields the same folds on every split() call, so one
    # instance can be shared by all models.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

    results = {}
    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
        results[name] = cv_results
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg,"\n\n")

    return results

In [None]:
def prediction(X_train, Y_train, X_test, Y_test):
    """Fit four baseline classifiers and print their test-set metrics.

    Each of Logistic Regression, a decision tree (CART), Gaussian Naive Bayes
    and a linear SVM is trained on the training split and evaluated once on
    the test split. Accuracy, the confusion matrix and the classification
    report are printed for every model.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training features and labels.
    X_test, Y_test : array-like
        Held-out features and labels used for evaluation.
    """
    # Each classifier appears exactly once, so no model is trained and
    # reported twice.
    models = [
        ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', LinearSVC()),
    ]

    for name, model in models:

        print("Prediction for ",name)

        model.fit(X_train, Y_train)
        predictions = model.predict(X_test)

        print("Accuracy Score \n",accuracy_score(Y_test, predictions),"\n")
        print("Confusion Matrix \n",confusion_matrix(Y_test, predictions),"\n")
        print("Classification Report \n",classification_report(Y_test, predictions),"\n\n")

# Pre-trained Word Embeddings

In [None]:
# Pre-trained word vectors; 46.zip was downloaded from http://vectors.nlpl.eu/repository/

# Parse the vectors while the archive is still open, and close the member
# stream once loading has finished. Reading the stream in a later cell, after
# the archive's `with` block had already exited, left the stream unclosed and
# relied on the archive's file handle outliving the ZipFile object.
with zipfile.ZipFile("46.zip", "r") as archive:
    with archive.open("model.txt") as stream:
        model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')

In [None]:
# Pair every entry of index2word with the matching row of syn0 to obtain the
# plain word -> vector dict that MeanEmbeddingVectorizer takes.
# NOTE(review): index2word / syn0 look like gensim 3.x attribute names --
# confirm the installed gensim version before upgrading.
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [None]:
# Wrap the pre-trained word -> vector dict in the mean-embedding vectorizer.
a = MeanEmbeddingVectorizer(w2v)

# One entry per row of the dataset -- presumably a list of tokens each; TODO confirm.
transfomed_data = list(Preprocessed.NoLaughTokens_l)

# fit() does nothing and returns the vectorizer; the labels are not used here.
a.fit(transfomed_data,transfomed_label)
# Despite its name, `vectorizer` holds the transformed data: one mean
# embedding per entry of transfomed_data.
vectorizer = a.transform(transfomed_data)

In [None]:
# Hold out 30% of the rows for testing; random_state fixes the split.
# Targets are the raw Preprocessed.Label values, not transfomed_label.
# NOTE(review): no `stratify` argument is passed, so the class proportions of
# the two splits may differ -- consider stratifying if the imbalance matters.
X_train, X_test, Y_train, Y_test = train_test_split(vectorizer, Preprocessed.Label, test_size=0.3, random_state=123)

In [None]:
# Cross-validated accuracy of each baseline classifier on the training split
# (pre-trained embeddings).
model_accuracy(X_train, Y_train)

# Fit each classifier on the training split and print its metrics on the
# held-out test split.
prediction(X_train, Y_train, X_test, Y_test)

# Our Word Embeddings

In [None]:
# Train a Word2Vec model on the dataset's own token lists: skip-gram (sg=1),
# 100-dimensional vectors, context window of 5, and min_count=1 so that no
# word is dropped for being rare.
# The model sees every row of the dataset, including rows that a later cell
# assigns to the test split.
# NOTE(review): `size` looks like the gensim 3.x keyword -- confirm the
# installed gensim version before upgrading.
model2 = gensim.models.Word2Vec(Preprocessed["NoLaughTokens_l"], min_count = 1, size = 100, window = 5, sg = 1) 

In [None]:
# Pair every entry of index2word with the matching row of syn0 to obtain the
# plain word -> vector dict for the embeddings trained on this dataset.
# NOTE(review): index2word / syn0 look like gensim 3.x attribute names --
# confirm the installed gensim version before upgrading.
b = dict(zip(model2.wv.index2word, model2.wv.syn0))

In [None]:
# Wrap the dataset-trained word -> vector dict in the mean-embedding vectorizer.
mean_m2 = MeanEmbeddingVectorizer(b)

# One entry per row of the dataset -- presumably a list of tokens each; TODO confirm.
transfomed_data = list(Preprocessed.NoLaughTokens_l)

# fit() does nothing and returns the vectorizer; the labels are not used here.
mean_m2.fit(transfomed_data,transfomed_label)
# Rebinds `vectorizer` to the features built from the dataset-trained
# embeddings: one mean embedding per entry of transfomed_data.
vectorizer = mean_m2.transform(transfomed_data)

In [None]:
# Same 70/30 split settings as for the pre-trained embeddings (same
# random_state), applied to the new features. This rebinds X_train, X_test,
# Y_train and Y_test.
# NOTE(review): no `stratify` argument is passed, so the class proportions of
# the two splits may differ -- consider stratifying if the imbalance matters.
X_train, X_test, Y_train, Y_test = train_test_split(vectorizer, Preprocessed.Label, test_size=0.3, random_state=123)

In [None]:
# Cross-validated accuracy of each baseline classifier on the training split
# (embeddings trained on this dataset).
model_accuracy(X_train, Y_train)

# Fit each classifier on the training split and print its metrics on the
# held-out test split.
prediction(X_train, Y_train, X_test, Y_test)

# Dimensionality Reduction and plot of word embeddings

In [None]:
# Build the word list once so that row i of the PCA result and label i are
# guaranteed to refer to the same word.
words = list(model2.wv.vocab)

# Look the vectors up through model2.wv: indexing the Word2Vec model itself
# (model2[...]) is deprecated in gensim 3.x in favour of the KeyedVectors.
X = model2.wv[words]

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
result = pca.fit_transform(X)

fig= plt.figure(figsize=(30,25))

plt.scatter(result[:, 0], result[:, 1])

# Label every point with its word.
for i, word in enumerate(words):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))

plt.title("Word2Vec embeddings projected onto the first two principal components")
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.show()