In [1]:
import pandas as pd
import glob, os
import matplotlib.pyplot as plt
from transformers import pipeline

from __future__ import unicode_literals
import spacy,en_core_web_sm
from spacy.lang.en import English
from spacy.matcher import Matcher
import textacy
import string
from wordcloud import WordCloud, STOPWORDS 
import numpy as np
import nltk
import locationtagger
from difflib import SequenceMatcher
import pickle



  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
class MlSkillsOne:
    """Load per-topic pickle records and derive text features from their names.

    On construction the class reads every ``*.pickle`` file in the data
    directory into ``all_records_raw_df`` and then runs :meth:`data_cleaner`,
    which adds the derived columns to that same frame.

    Attributes:
        target_phrase: Reference sentence each cleaned name is compared with.
        all_records_raw_df: Frame with one row per pickle file. Note that
            :meth:`data_cleaner` adds its columns to this frame in place.
        all_records_cleaned_df: The object returned by :meth:`data_cleaner`;
            it is the same DataFrame object as ``all_records_raw_df``.
    """

    # Reference sentence used when the caller does not supply one.
    DEFAULT_TARGET_PHRASE = "Jeu de Paume is an excellent art gallery in Paris"
    # Directory scanned for record files when the caller does not supply one.
    DEFAULT_DATA_PATH = 'individualTopics_27-01-22/'
    # Keys read from every unpickled object, and the resulting column names.
    RECORD_COLUMNS = ['id', 'name', 'audience_size', 'country', 'topic']

    # Class-level fallback so similarity()/data_cleaner() always have a phrase,
    # even on an instance whose __init__ has not (yet) assigned one.
    target_phrase = DEFAULT_TARGET_PHRASE

    def __init__(self, target_phrase=None, path=None):
        """Read the records and compute the derived columns.

        Args:
            target_phrase: Sentence to compare names with; defaults to
                ``DEFAULT_TARGET_PHRASE``.
            path: Directory holding the ``*.pickle`` files; defaults to
                ``DEFAULT_DATA_PATH``.
        """
        # The phrase must be set before data_cleaner() runs, because the
        # similarity column is computed from it.
        self.target_phrase = (
            self.DEFAULT_TARGET_PHRASE if target_phrase is None else target_phrase
        )
        self.all_records_raw_df = self.read_data_individual_topics(path)
        self.all_records_cleaned_df = self.data_cleaner()

    def read_data_individual_topics(self, path=None):
        """Collect one record per ``*.pickle`` file found in ``path``.

        Args:
            path: Directory to scan; defaults to ``DEFAULT_DATA_PATH``.

        Returns:
            DataFrame with the columns in ``RECORD_COLUMNS``. It is empty (but
            still has those columns) when no file matches.
        """
        if path is None:
            path = self.DEFAULT_DATA_PATH

        records = []
        # Sorted so the row order does not depend on the file system's
        # directory listing order.
        for fname in sorted(glob.glob(os.path.join(path, '*.pickle'))):
            # NOTE(security): unpickling executes code embedded in the file;
            # only point this at trusted data.
            obj = pd.read_pickle(fname)
            records.append([obj[column] for column in self.RECORD_COLUMNS])

        # Passing the column names to the constructor keeps the schema intact
        # even when `records` is empty.
        return pd.DataFrame(records, columns=self.RECORD_COLUMNS)

    def number_of_verb(self, string):
        """Count the verb-sequence matches textacy finds in ``string``.

        Args:
            string: Text to analyse.

        Returns:
            Number of matches of the pattern "optional VERB, then one or more
            VERB" reported by ``textacy.extract.matches.token_matches``.
        """
        pattern = [{'POS': 'VERB', 'OP': '?'},
                   {'POS': 'VERB', 'OP': '+'}]
        doc = textacy.make_spacy_doc(string, lang='en_core_web_sm')
        matches = textacy.extract.matches.token_matches(doc, [pattern])
        return sum(1 for _ in matches)

    def number_letters(self, string):
        """Return how many characters of ``string`` are alphabetic."""
        return sum(1 for char in string if char.isalpha())

    def location(self, string):
        """Return the countries, regions and cities found in ``string``.

        Returns:
            A single list: countries first, then regions, then cities, as
            reported by ``locationtagger.find_locations``.
        """
        place_entity = locationtagger.find_locations(text=string)
        return place_entity.countries + place_entity.regions + place_entity.cities

    def similarity(self, row):
        """Return the ``SequenceMatcher`` ratio of ``row`` vs. ``target_phrase``."""
        return SequenceMatcher(None, row, self.target_phrase).ratio()

    def data_cleaner(self, include_text_features=False):
        """Add the derived columns to ``all_records_raw_df`` and return it.

        The frame is modified in place; the returned object is the same
        DataFrame, not a copy.

        Args:
            include_text_features: When True, also compute the verb/word/letter
                counts and the location entities. These were disabled in the
                original notebook, so they stay off by default.

        Returns:
            ``all_records_raw_df`` with ``name_cleaned``, ``similarity`` and
            ``target_phrase`` columns (plus the optional feature columns).
        """
        frame = self.all_records_raw_df

        # Build the translation table once instead of once per row.
        strip_punctuation = str.maketrans('', '', string.punctuation)
        frame["name_cleaned"] = frame["name"].apply(
            lambda name: name.translate(strip_punctuation))

        if include_text_features:
            cleaned = frame["name_cleaned"]
            frame["number_of_verb"] = cleaned.apply(self.number_of_verb)
            frame["number_words"] = cleaned.apply(lambda name: len(name.split()))
            frame["number_letters"] = cleaned.apply(self.number_letters)
            # Adding Series of one-element lists concatenates them row by row,
            # giving [letters, words, verbs] per record.
            frame["number_letters_words_verbs"] = (
                frame["number_letters"].apply(lambda value: [value])
                + frame["number_words"].apply(lambda value: [value])
                + frame["number_of_verb"].apply(lambda value: [value])
            )
            frame["name_entity"] = cleaned.apply(self.location)

        frame["similarity"] = frame["name_cleaned"].apply(self.similarity)
        frame["target_phrase"] = self.target_phrase

        return frame

    def word_cloud(self):
        """Plot a word cloud built from the lower-cased cleaned names."""
        names = self.all_records_raw_df.name_cleaned
        stopwords = set(STOPWORDS)

        # One space-terminated chunk per name, tokens lower-cased.
        comment_words = ''.join(
            " ".join(token.lower() for token in str(name).split()) + " "
            for name in names
        )

        wordcloud = WordCloud(width=800, height=800,
                              background_color='white',
                              stopwords=stopwords,
                              min_font_size=10).generate(comment_words)

        # plot the WordCloud image
        plt.figure(figsize=(8, 8), facecolor=None)
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.tight_layout(pad=0)

        plt.show()

In [29]:
class MlSkillsTwo:
    """Pair the first element column of each pickled instance with variables.

    Attributes:
        instance_data: Object unpickled from ``data.pickle``. The code indexes
            it as a sequence of instances, each a sequence of equally long rows.
        variables: DataFrame built from ``variables.pickle`` with the columns
            ``var1`` .. ``var5``.
    """

    def __init__(self):
        """Load both pickle files from the current working directory."""
        self.instance_data = self.read_instances()
        self.variables = pd.DataFrame(
            self.read_variables(),
            columns=('var1', 'var2', 'var3', 'var4', 'var5'))

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at ``path``."""
        # NOTE(security): pickle.load runs code embedded in the file; only use
        # it on data you trust.
        with open(path, "rb") as file:
            return pickle.load(file)

    def read_instances(self):
        """Return the object stored in ``data.pickle``."""
        return self._load_pickle("data.pickle")

    def read_variables(self):
        """Return the object stored in ``variables.pickle``."""
        return self._load_pickle("variables.pickle")

    def transform_instance(self):
        """Convert every instance into a DataFrame.

        Returns:
            List with one DataFrame per instance. The columns are named
            ``ele0``, ``ele1``, ... with one column per item in the instance's
            first row.
        """
        frames = []
        for rows in self.instance_data:
            # The width of the first row decides how many columns are named.
            columns = ["ele" + str(position) for position in range(len(rows[0]))]
            frames.append(pd.DataFrame(rows, columns=columns))
        return frames

    def concatenation(self):
        """Join each instance's ``ele0`` column with the variables frame.

        Returns:
            List with one entry per instance; each entry is a list of rows of
            the form ``[ele0, var1, var2, var3, var4, var5]``.
        """
        # transform_instance() rebuilds every DataFrame, so call it once here
        # rather than once per loop iteration.
        return [
            pd.concat([frame['ele0'], self.variables], axis=1).values.tolist()
            for frame in self.transform_instance()
        ]




In [14]:
# Build the helper; its constructor reads data.pickle and variables.pickle
# from the current working directory.
to_trantor = MlSkillsTwo()

In [34]:
# Show a single row (position 32) of the instance at index 6; each row is the
# instance's `ele0` value followed by var1..var5.
to_trantor.concatenation()[6][32:33]

[['Born', 90.0, 10.0, 45.0, 25.0, 15.0]]

In [None]:
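# NOTE: disabled draft, kept for reference. As written it would fail if
# re-enabled: the outer `range(...)` reads `dataframe` before the inner loop
# binds it, and transform_instance() is re-run on every iteration.
# category = []
# for ele in range(0, to_trantor.transform_instance()[dataframe].shape[1]):
#     for dataframe in range(0,len(to_trantor.transform_instance())):
#         category.append(pd.concat([to_trantor.transform_instance()[dataframe]['ele'\
#                 + str(ele)], to_trantor.variables], axis=1).values.tolist())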
