In [1]:
import pandas as pd
import numpy as np

import utils as u

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV


from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

from gensim.models.fasttext import load_facebook_vectors

In [2]:
# Pretrained fastText binary loaded through gensim's Facebook-format reader.
# A second model was previously loaded from
# '../../semester03/DAT620/datafiles/81/parameters.bin'.
fasttext_model_path = '../datafiles/80/parameters.bin'
fbkv = load_facebook_vectors(fasttext_model_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Main corpus frame; later cells read its 'agg_label' and 'lemma_delivered' columns.
corpus_pickle_path = '../pandas/lemma_delivered_merged_df.pkl'
df = pd.read_pickle(corpus_pickle_path)

In [4]:
# Attach the precomputed 'is_nn' flags as a column of df.
# NOTE(review): assumes the pickled object lines up with df's rows - confirm.
is_nn_flags = pd.read_pickle('../pandas/is_nn_full.pkl')
df['is_nn'] = is_nn_flags

In [5]:
# Row masks shared by both subsets. An 'agg_label' of -1 separates the
# unlabeled rows from the labeled ones; rows flagged 'is_nn' are excluded
# from both.
has_label = df['agg_label'] != -1
lacks_label = df['agg_label'] == -1
not_nn = df['is_nn'] == False

labeled_corpus = df[has_label & not_nn]
unlabeled_corpus = df[lacks_label & not_nn]

# Earlier variants (kept for reference) selected on 'agg_label' alone,
# optionally restricted to labels < 90, without the 'is_nn' filter.

In [6]:
# First split: 60 % of the labeled rows for training, 40 % held out
# (vali_X / vali_y), stratified on the label.
stratify_labels = labeled_corpus['agg_label']
train_X, vali_X, train_y, vali_y = train_test_split(
    labeled_corpus, stratify_labels,
    test_size=0.4, random_state=1, stratify=stratify_labels)

# Second split: the held-out part is halved into test and validation sets,
# again stratified on the label.
test_X, validation_X, test_y, validation_y = train_test_split(
    vali_X, vali_y,
    test_size=0.5, random_state=1, stratify=vali_y)

In [7]:
def _vocabulary_in_column_order(vectorizer):
    """Return the fitted terms in column order on both old and new scikit-learn."""
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        # scikit-learn < 1.0 only offers the older accessor.
        getter = vectorizer.get_feature_names
    return getter()


def _tfidf_weighted_average(ft, vectorizer, vectorized):
    """Average fastText word vectors per document, weighted by `vectorized`.

    Parameters
    ----------
    ft : object exposing ``get_vector(word)`` (and optionally ``vector_size``)
    vectorizer : fitted vectorizer whose columns match ``vectorized``
    vectorized : sparse (n_docs, n_terms) weight matrix

    Returns
    -------
    list of 1-D numpy arrays, one per document. A document whose weights sum
    to zero (no in-vocabulary term) gets an all-zero vector instead of the
    NaN vector a 0/0 division would produce.
    """
    # Use the model's own width when it exposes one; 100 is the legacy default.
    dim = getattr(ft, 'vector_size', 100)

    embedding_matrix = np.zeros((vectorized.shape[1], dim))
    for column, word in enumerate(_vocabulary_in_column_order(vectorizer)):
        embedding_matrix[column] = ft.get_vector(word)

    # One sparse product yields every per-document weighted sum without
    # densifying rows of vocabulary width.
    weighted_sums = np.asarray(vectorized.dot(embedding_matrix))
    weight_totals = np.asarray(vectorized.sum(axis=1)).reshape(-1)

    averaged = np.zeros_like(weighted_sums)
    has_weight = weight_totals != 0
    averaged[has_weight] = weighted_sums[has_weight] / weight_totals[has_weight, np.newaxis]

    return [row for row in averaged]


def get_fasttext_tfidf_weighted(ft, train, corpus):
    """TF-IDF weighted average of fastText vectors for every document in `corpus`.

    The TF-IDF vocabulary and idf values are fitted on `train` (case preserved,
    terms occurring in more than 25 % of the documents dropped) and then applied
    to `corpus`.

    Returns a list with one vector per document of `corpus`.
    """
    vectorizer = TfidfVectorizer(lowercase=False, max_df=0.25)
    vectorizer.fit(train)
    vectorized = vectorizer.transform(corpus)

    return _tfidf_weighted_average(ft, vectorizer, vectorized)


def get_fasttext_tfidf_weighted_chi2xidf(ft, t_X, t_y, train, corpus):
    """Like `get_fasttext_tfidf_weighted`, with idf multiplied by chi2 scores.

    The vectorizer is fitted on `train`; chi2 scores of its features are
    computed on the labeled pair (`t_X`, `t_y`) and multiplied into the idf
    vector before `corpus` is transformed.

    Returns a list with one vector per document of `corpus`.
    """
    vectorizer = TfidfVectorizer(lowercase=False, max_df=0.25)
    vectorizer.fit(train)

    scores, _pvals = chi2(vectorizer.transform(t_X), t_y)
    # A term fitted on `train` but never seen in `t_X` has zero observed and
    # expected counts, so chi2 reports NaN for it. Treat such terms as
    # carrying no weight rather than letting NaN spread through every
    # document that contains them.
    scores = np.where(np.isnan(scores), 0.0, scores)

    vectorizer.idf_ = vectorizer.idf_ * scores

    vectorized = vectorizer.transform(corpus)

    return _tfidf_weighted_average(ft, vectorizer, vectorized)


def get_fasttext_tfidf_weighted_small_vocab(ft, vocab, corpus):
    """TF-IDF weighted fastText average restricted to the fixed vocabulary `vocab`.

    The vectorizer is fitted and applied on `corpus` itself, with its columns
    limited to `vocab`.

    Returns a list with one vector per document of `corpus`.
    """
    vectorizer = TfidfVectorizer(lowercase=False, max_df=0.25, tokenizer=None, vocabulary=vocab)
    # Accept a pandas Series (as before) or any plain iterable of documents.
    documents = getattr(corpus, 'values', corpus)
    vectorized = vectorizer.fit_transform(documents)

    return _tfidf_weighted_average(ft, vectorizer, vectorized)

def FT_to_matrix(data, dim=None):
    """Stack a sequence of equally sized vectors into a 2-D float matrix.

    Parameters
    ----------
    data : list, array or pandas Series of vectors
        Pandas objects are read positionally, so a Series whose index does
        not run 0..n-1 (e.g. after rows were dropped) is handled correctly.
    dim : int, optional
        Number of columns. When omitted it is taken from the first vector,
        falling back to 100 for empty or scalar input.

    Returns
    -------
    numpy.ndarray of shape (len(data), dim)
    """
    if hasattr(data, 'to_numpy'):
        # Positional values; plain data[i] would be a label lookup on a Series.
        rows = list(data.to_numpy())
    else:
        rows = [data[index] for index in range(len(data))]

    if dim is None:
        if rows and np.ndim(rows[0]) > 0:
            dim = np.shape(rows[0])[-1]
        else:
            dim = 100

    M = np.zeros((len(rows), dim))
    for index, row in enumerate(rows):
        M[index] = row

    return M


In [None]:
# Fit the TF-IDF vocabulary on every row outside the hold-out split vali_X,
# then embed the whole corpus with it.
outside_holdout = ~df.index.isin(vali_X.index)
tfidf_fit_docs = df.loc[outside_holdout]['lemma_delivered']
full_vocab_vectors = get_fasttext_tfidf_weighted(fbkv, tfidf_fit_docs, df['lemma_delivered'])
df['FT_TFIDF_full_vocab'] = full_vocab_vectors

In [8]:
# Same embedding, but the TF-IDF vocabulary comes from the training split only.
labeled_vocab_vectors = get_fasttext_tfidf_weighted(
    fbkv,
    train_X['lemma_delivered'],
    df['lemma_delivered'],
)
df['FT_TFIDF_lemma_labeled_vocab'] = labeled_vocab_vectors

In [9]:
# Persist only the embedding column so it can be re-attached later.
labeled_vocab_column = df['FT_TFIDF_lemma_labeled_vocab']
labeled_vocab_column.to_pickle('../pandas/FT_TFIDF_lemma_labeled_vocab.pkl')

In [None]:
# chi2-weighted variant: chi2 scores come from the labeled training split,
# while the TF-IDF vocabulary is fitted on every row outside the hold-out vali_X.
outside_holdout = ~df.index.isin(vali_X.index)
chixidf_vectors = get_fasttext_tfidf_weighted_chi2xidf(
    fbkv,
    train_X['lemma_delivered'],
    train_y,
    df.loc[outside_holdout]['lemma_delivered'],
    df['lemma_delivered'],
)
df['FT_TFIDF_lemma_chixidf'] = chixidf_vectors

In [None]:
# Shape of the frame once the hold-out rows (vali_X) are excluded.
outside_holdout = ~df.index.isin(vali_X.index)
df.loc[outside_holdout].shape

In [None]:
# Shape of the hold-out split produced by the first train_test_split call.
vali_X.shape

In [None]:
# Save the whole frame, including the embedding columns added above.
merged_output_path = '../pandas/lemma_delivered_merged_FT_TFIDF_lemma_full_vocab.pkl'
df.to_pickle(merged_output_path)

In [None]:
# Build per-class vocabularies on the training split and merge them by chi2
# score (the helpers live in the project's `utils` module; 2000 is passed
# through as the size argument).
class_vocabs = u.get_vocabs(train_X['lemma_delivered'], train_y, CountVectorizer(**{'tokenizer': None, 'lowercase': False, 'max_df': 0.25}))
merged_vocabs = u.merge_vocabs_by_score(class_vocabs, train_X['lemma_delivered'], train_y,
                                  CountVectorizer, {'tokenizer': None, 'lowercase': False, 'max_df': 0.25},
                                  chi2, 2000)

# Count the merged-vocabulary terms in every document of the corpus.
vec = CountVectorizer(**{'lowercase': False, 'max_df': 0.25, 'vocabulary': merged_vocabs})
vectorized = vec.fit_transform(df['lemma_delivered'])

# Positions of documents that contain none of the vocabulary terms. A single
# sparse row sum replaces the per-row densifying loop.
term_totals = np.asarray(vectorized.sum(axis=1)).reshape(-1)
indices = np.flatnonzero(term_totals == 0).tolist()
counter = len(indices)

print(counter)

In [None]:
# Term counts over the whole corpus, restricted to the merged vocabulary.
vec = CountVectorizer(lowercase=False, max_df=0.25, vocabulary=merged_vocabs)
vectorized = vec.fit_transform(df['lemma_delivered'])

In [None]:
# Positions of documents that contain none of the vocabulary terms. A single
# sparse row sum replaces the per-row densifying loop.
term_totals = np.asarray(vectorized.sum(axis=1)).reshape(-1)
indices = np.flatnonzero(term_totals == 0).tolist()
counter = len(indices)

print(counter)

In [None]:
# Remove the documents found to have no vocabulary term; `indices` holds
# row positions, so translate them to index labels first.
empty_doc_labels = df.index[indices]
df_dropped = df.drop(index=empty_doc_labels)

In [None]:
# Embed the remaining documents using only the merged vocabulary.
small_vocab_vectors = get_fasttext_tfidf_weighted_small_vocab(
    fbkv,
    merged_vocabs,
    df_dropped['lemma_delivered'],
)
df_dropped['FT_TFIDF_train_2000'] = small_vocab_vectors

In [None]:
# Persist the small-vocabulary embedding column on its own.
small_vocab_column = df_dropped['FT_TFIDF_train_2000']
small_vocab_column.to_pickle('../pandas/FT_TFIDF_train_2000.pkl')

In [None]:
# Plain TF-IDF baseline features, fitted on the training documents.
vec = TfidfVectorizer(lowercase=False, max_df=0.25)
train_docs = train_X['lemma_delivered']
vec_train_X = vec.fit_transform(train_docs)

In [None]:
# Scale every term's idf by its chi2 score against the training labels.
# NOTE: run this cell once per fit of `vec`; re-running multiplies the
# scores into idf_ a second time.
scores, pvals = chi2(vec_train_X, train_y)
vec.idf_ = scores * vec.idf_

# vec_train_X was built with the unscaled idf. Rebuild it so the training
# matrix lives in the same feature space as anything transformed from here
# on (e.g. the validation matrix).
vec_train_X = vec.transform(train_X['lemma_delivered'])

In [None]:
# Project the hold-out documents with the vectorizer's current idf weights.
vali_docs = vali_X['lemma_delivered']
vec_vali_X = vec.transform(vali_docs)

In [None]:
# Linear SVM wrapped in a probability calibrator, trained on the sparse
# TF-IDF features and scored on the hold-out split.
svm_params = {'C': 7, 'class_weight': None, 'loss': 'hinge', 'tol': 0.0001, 'max_iter': 5000}
svm = LinearSVC(**svm_params)
model = CalibratedClassifierCV(svm)

model.fit(vec_train_X, train_y)
svm_preds = model.predict(vec_vali_X)

# Hold-out accuracy.
prediction_hits = svm_preds == vali_y
np.mean(prediction_hits)

In [None]:
# Same classifier, trained on the small-vocabulary fastText embeddings.
svm_params = {'C': 7, 'class_weight': None, 'loss': 'hinge', 'tol': 0.0001, 'max_iter': 5000}
svm = LinearSVC(**svm_params)
model = CalibratedClassifierCV(svm)

# df_dropped lost some rows, so membership in each split is looked up by index.
train_rows = df_dropped.loc[df_dropped.index.isin(train_X.index)]
holdout_rows = df_dropped.loc[df_dropped.index.isin(vali_X.index)]

train_features = np.stack(train_rows['FT_TFIDF_train_2000'].to_numpy())
holdout_features = np.stack(holdout_rows['FT_TFIDF_train_2000'].to_numpy())

model.fit(train_features, train_rows['agg_label'])
svm_preds = model.predict(holdout_features)

# Hold-out accuracy.
np.mean(svm_preds == holdout_rows['agg_label'])

In [None]:
# Same classifier, trained on the full-vocabulary fastText embeddings.
svm = LinearSVC(**{'C': 7, 'class_weight': None, 'loss': 'hinge', 'tol': 0.0001, 'max_iter': 5000})
model = CalibratedClassifierCV(svm)

# train_X / vali_X were split off before any embedding column was added to
# df, so they do not carry the column; read it from df by index instead.
# This notebook stores the column as 'FT_TFIDF_full_vocab'. The name used
# previously is still honoured in case df was loaded from a pickle that
# already contains it.
if 'FT_TFIDF_lemma_full_vocab' in df.columns:
    full_vocab_col = 'FT_TFIDF_lemma_full_vocab'
else:
    full_vocab_col = 'FT_TFIDF_full_vocab'

train_features = np.stack(df.loc[train_X.index, full_vocab_col].to_numpy())
vali_features = np.stack(df.loc[vali_X.index, full_vocab_col].to_numpy())

model.fit(train_features, train_X['agg_label'])
svm_preds = model.predict(vali_features)

# Hold-out accuracy.
np.mean(svm_preds == vali_X['agg_label'])

In [None]:
# Number of hold-out rows that survived the drop of empty documents.
in_holdout = df_dropped.index.isin(vali_X.index)
in_holdout.sum()

In [None]:
# Labels of the hold-out rows still present in df_dropped.
in_holdout = df_dropped.index.isin(vali_X.index)
df_dropped.loc[in_holdout, 'agg_label']