In [1]:
import pandas as pd
import glob, os
import matplotlib.pyplot as plt
from transformers import pipeline

from __future__ import unicode_literals
import spacy,en_core_web_sm
from spacy.lang.en import English
from spacy.matcher import Matcher
import textacy
import string
from wordcloud import WordCloud, STOPWORDS 
import numpy as np
import nltk
import locationtagger
from difflib import SequenceMatcher
import pickle
import gensim
from gensim.models import Word2Vec
from gensim import models

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
class MlSkillsOne:
    
    def __init__(self):
        pass
        self.all_records_raw_df = self.read_data_individual_topics()
        self.all_records_cleaned_df = self.data_cleaner()
        self.target_phrase = "Jeu de Paume is an excellent art gallery in Paris"        

    def read_data_individual_topics(self):

        path = 'individualTopics_27-01-22/'
        all_records = []

        for fname in glob.glob(path + '*.pickle'):
            obj = pd.read_pickle(fname)
            record = [obj['id'],obj['name'],obj['audience_size'],
                      obj['country'],obj['topic']]
            all_records = all_records + [record]
        
        all_records_df = pd.DataFrame.from_records(all_records)
        all_records_df.columns = ['id','name','audience_size','country','topic']
        
        return all_records_df
    
    def number_of_verb(self, string):
        verbs = []
        pattern = [{'POS': 'VERB', 'OP': '?'},\
               {'POS': 'VERB', 'OP': '+'}]
        doc = textacy.make_spacy_doc(string, lang='en_core_web_sm')
        lists = textacy.extract.matches.token_matches(doc, [pattern])
        for list in lists:
            verbs.append(list.text)
            
        return len(verbs)
    
    def number_letters(self, string):
        return len([i for i in string if i.isalpha()])
    
    def location(self, string):
        place_entity = locationtagger.find_locations(text = string)
        countries = place_entity.countries
        regions = place_entity.regions
        cities = place_entity.cities
        X = countries + regions + cities
        return X

    def similarity(self, row):
        return SequenceMatcher(None, row, "Jeu de Paume is an excellent art gallery in Paris").ratio()
    
    def data_cleaner(self):

        self.all_records_raw_df["name_cleaned"] = self.all_records_raw_df.name\
            .apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))
        """
        self.all_records_raw_df["number_of_verb"] = self.all_records_raw_df.name_cleaned\
            .apply(lambda row: self.number_of_verb(row))
        
        self.all_records_raw_df["number_words"] = self.all_records_raw_df.name_cleaned\
            .apply(lambda row: len(row.split()))
            
        self.all_records_raw_df["number_letters"] = self.all_records_raw_df.name_cleaned\
            .apply(lambda row: self.number_letters(row))
        
        self.all_records_raw_df["number_letters_words_verbs"] = self.all_records_raw_df.number_letters.\
            apply(lambda row: [row]) + self.all_records_raw_df.number_words.apply(lambda row: [row]) + \
            self.all_records_raw_df.number_of_verb.apply(lambda row: [row])
         
        self.all_records_raw_df["name_entity"] =  self.all_records_raw_df.name_cleaned\
            .apply(lambda row: self.location(row))  
        """ 
        self.all_records_raw_df["similarity"] =  self.all_records_raw_df.name_cleaned\
            .apply(lambda row: self.similarity(row))
        
        self.all_records_raw_df["target_phrase"] = "Jeu de Paume is an excellent art gallery in Paris"
        
        return self.all_records_raw_df
        
    def word_cloud(self):
        string = self.all_records_raw_df.name_cleaned
        comment_words = ''
        stopwords = set(STOPWORDS)
        for val in string:

            # typecaste each val to string
            val = str(val)

            # split the value
            tokens = val.split()

            # Converts each token into lowercase
            for i in range(len(tokens)):
                tokens[i] = tokens[i].lower()

            comment_words += " ".join(tokens)+" "

        wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = stopwords,
                    min_font_size = 10).generate(comment_words)

        # plot the WordCloud image                      
        plt.figure(figsize = (8, 8), facecolor = None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad = 0)

        plt.show()   

In [48]:
class Predictor:

    def __init__(self) -> None:
        super().__init__()
        self.instance_data = self.read_instances()
        self.variables_df = self.read_variables()
        self.instances_list = self.transform_instance()
        self.flat_instance = self.concatenation()
        self.corpus = self.flat_instance['element'].map(lambda row: [row]).tolist()


    @staticmethod
    def read_instances() -> list:

        with open("data.pickle", "rb") as file:
            instances = pickle.load(file)

        return instances

    @staticmethod
    def read_variables() -> pd.DataFrame:

        with open("variables.pickle", "rb") as file:
            variables = pickle.load(file)

        return pd.DataFrame(variables, columns=('var1', 'var2', 'var3', 'var4', 'var5'))

    def transform_instance(self) -> list:
        instances_list = []
        for instance in range(0, len(self.instance_data[:])):

            elements = []

            for element in range(len(self.instance_data[instance][0])):
                elements.append("ele" + str(element))
            total_df = pd.DataFrame(self.instance_data[instance], columns=elements)
            instances_list.append(total_df)

        return instances_list

    def concatenation(self) -> pd.DataFrame:
        instance = []
        for instances_df in range(0, len(self.instances_list)):
            for column in self.instances_list[instances_df].columns:
                instance.append(pd.concat([self.transform_instance()[instances_df][column], self.variables_df], axis=1)
                                .rename(columns={column: 'element', 'var1': 'var1',
                                                 'var2': 'var2', 'var3': 'var3', 'var4': 'var4', 'var5': 'var5'})
                                )

        instance = pd.concat(instance)
        final_instance = instance[instance["element"].str.contains("nan") == False].reset_index().drop("index", axis=1)

        return final_instance
    
#     def tagged_document(self):
#         for i, list_of_words in enumerate(self.corpus):
#             yield gensim.models.doc2vec.TaggedDocument(self.corpus, [i])
     
    
#     @staticmethod
#     def model(corpus):
#         data_for_training = list(tagged_document(corpus))
#         model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
#         model.build_vocab(data_for_training)
#         model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)
#         print(model.infer_vector(corpus))
    



In [49]:
to_trantor = Predictor()

In [None]:
to_trantor.tagged_document

In [None]:
to_trantor.flat_instance.element

In [None]:
((to_trantor.flat_instance.element.apply(lambda x:  len(x.split(' '))) == 1)*1).sum()

In [57]:
def tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

In [58]:
data_for_training = list(tagged_document(to_trantor.corpus))

In [59]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)

In [60]:
model.build_vocab(data_for_training)

In [61]:
model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)

In [62]:
print(model.infer_vector(to_trantor.corpus[89]))

[ 0.01404235 -0.01076202  0.00224154 -0.0122733  -0.01026767 -0.01080231
 -0.00098621 -0.01383078  0.01792398  0.00841621  0.00301563 -0.00473961
 -0.00157053  0.00707933 -0.00764521 -0.01211109  0.00362035 -0.00669797
  0.00892563  0.00897956  0.01361649 -0.00078748  0.021214    0.00093932
 -0.01260184  0.01536596 -0.00510094 -0.01180312 -0.00275389 -0.00702473
 -0.00436177 -0.00317256 -0.00048458 -0.00087576 -0.00447983 -0.00936608
 -0.00803289  0.01026737  0.00045672  0.00395049]


In [76]:
corpus_df = to_trantor.flat_instance.element.apply(lambda x: list(x.split()))

In [84]:
corpus_vectorize = pd.concat([corpus_df.apply(lambda x:(model.infer_vector(x)))\
                              , to_trantor.flat_instance], axis = 1)
corpus_vectorize

Unnamed: 0,element,element.1,var1,var2,var3,var4,var5
0,"[-0.013511683, 0.011431031, 0.0070340713, 0.00...",Mercadillo,90.0,30.0,65.0,20.0,5.0
1,"[-0.0010123005, 0.011090787, 0.005197478, -0.0...",Primark,99.0,25.0,65.0,80.0,25.0
2,"[-0.0033996762, -0.008600545, -0.0040092757, 0...",Donde puedo,97.0,1.0,20.0,20.0,50.0
3,"[-0.012319372, -0.009114322, 0.0065697506, 0.0...",Mercados ambulantes,75.0,65.0,85.0,40.0,90.0
4,"[0.0013197904, -0.0015707306, 0.011561666, 0.0...",No compro,99.0,10.0,96.0,40.0,1.0
...,...,...,...,...,...,...,...
34781,"[-0.002646622, -0.011680683, 0.0013522878, 0.0...",Snooker,99.0,1.0,35.0,10.0,5.0
34782,"[-0.0065622395, 0.00517406, 0.001957188, -0.00...",Zumba,15.0,20.0,80.0,15.0,30.0
34783,"[0.008511874, -0.0074084178, 0.0012616575, 0.0...",Bouzuki,70.0,15.0,85.0,10.0,20.0
34784,"[0.0031171367, 0.0098148165, 0.010872327, -0.0...",Timple,99.0,20.0,70.0,5.0,70.0


In [68]:
# def vectorize(corpus):
#     data_for_training = list(tagged_document(corpus))
#     model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
#     model.build_vocab(data_for_training)
#     model.train(data_for_training, total_examples=model.corpus_count, epochs=model.epochs)
#     print(model.infer_vector(corpus))

In [69]:
# vectorize(corpus[89])