# In [3]:
from collections import Counter


class LexiconMapper:
    
    
    def remove_repeatings(self, word):
        """Collapse every run of identical adjacent characters to one.

        For example ``"coool"`` becomes ``"col"``. Only *consecutive*
        duplicates are dropped; a character that shows up again later in
        the word is kept.

        Args:
            word: The string to normalise.

        Returns:
            A new string with no two equal neighbouring characters.
        """
        collapsed = []
        for symbol in word:
            # Keep a symbol only when it differs from the one kept last.
            if not collapsed or collapsed[-1] != symbol:
                collapsed.append(symbol)
        return ''.join(collapsed)
    
    def map_tweets(self, tweets, dict_lex, value_index, convert_float):
        

        mapped_tweets=[]
        for sentence in tweets.text:
            mapped_sentence=[]
            for word in sentence.split():
                new_word = self.remove_repeatings(word)
                
                if word in dict_lex.keys():
                    if convert_float:
                        mapped_sentence.append(float(dict_lex[word][value_index]))
                    else:
                        mapped_sentence.append(dict_lex[word][value_index])
                        
                elif new_word in dict_lex.keys():
                    if convert_float:
                        mapped_sentence.append(float(dict_lex[new_word][value_index]))
                    else:
                        mapped_sentence.append(dict_lex[new_word][value_index])
                else:
                    mapped_sentence.append(-1)
            mapped_tweets.append(mapped_sentence)
        
        tweets['mapped'] = mapped_tweets
        return tweets
    
    
    def get_sample(self, dataframe, attribute, value, size, random_state=23):
        """Draw a reproducible random sample of the rows matching a value.

        Args:
            dataframe: Table to sample from; it must support boolean-mask
                indexing and ``.sample`` (presumably a pandas DataFrame).
            attribute: Either a column name (``str``) of ``dataframe`` or an
                already extracted column to compare against ``value``.
            value: Rows whose ``attribute`` equals this value are kept.
            size: Number of rows to draw.
            random_state: Seed forwarded to ``.sample``; defaults to the
                historical fixed seed ``23`` so results stay repeatable.

        Returns:
            Whatever ``.sample`` returns for the filtered rows.
        """
        # Comparing a bare column *name* with ``value`` yields a single bool,
        # not a row mask, so resolve names to the actual column first.
        column = dataframe[attribute] if isinstance(attribute, str) else attribute
        return dataframe[column == value].sample(size, random_state=random_state)
    
    
    def add_labels(self, tweets, index, low_bound, high_bound):
        """Attach a coarse ``high``/``medium``/``low``/``none`` label per row.

        The score is read from the column at *position* ``index`` and
        classified as follows:

        * ``score > high_bound``                -> ``"high"``
        * ``low_bound <= score <= high_bound``  -> ``"medium"``
        * ``0 < score < low_bound``             -> ``"low"``
        * anything else (e.g. ``0`` or ``-1``)  -> ``"none"``

        Args:
            tweets: Table whose rows are labelled (presumably a pandas
                DataFrame -- it must support ``.iloc`` and item assignment).
            index: Positional index of the score column.
            low_bound: Lower edge of the ``"medium"`` band (inclusive).
            high_bound: Upper edge of the ``"medium"`` band (inclusive).

        Returns:
            The same ``tweets`` object with the labels stored under the
            ``'values'`` key.
        """
        classes = []
        # ``iloc`` reads just the one column instead of converting the whole
        # table to an array first.
        for score in tweets.iloc[:, index]:
            if score > high_bound:
                classes.append("high")
            elif score >= low_bound:
                # Inclusive lower edge: a score exactly equal to low_bound
                # used to match no band and was labelled "none".
                classes.append("medium")
            elif score > 0:
                classes.append("low")
            else:
                classes.append("none")

        tweets['values'] = classes
        return tweets
    
    
    
    def filter_tweets(self, tweets, filter_value, percent):
        """Flag tweets whose share of unknown words is below ``percent``.

        A word counts as unknown when its mapped value equals ``-1``. A tweet
        is flagged ``True`` when ``unknown / total < percent`` and ``False``
        otherwise. Empty tweets contain no known word at all and are flagged
        ``False`` instead of triggering a division by zero.

        Args:
            tweets: Object exposing a ``mapped`` attribute (an iterable of
                per-tweet value lists) and supporting item assignment.
            filter_value: Not consulted -- the ``-1`` sentinel is hard-coded.
                Kept so existing callers keep working.
                NOTE(review): presumably meant to be the sentinel; confirm
                before wiring it in.
            percent: Exclusive upper limit for the unknown-word ratio.

        Returns:
            The same ``tweets`` object with the boolean flags stored under
            the ``'filtered'`` key.
        """
        threshold = float(percent)
        keep = []

        for tweet in tweets.mapped:
            total = len(tweet)
            if total == 0:
                # Nothing was mapped, so there is no ratio to compute.
                keep.append(False)
                continue

            # Compare as float: int() truncates towards zero, which would
            # turn genuine scores such as -1.5 into the -1 sentinel.
            missing = sum(1 for el in tweet if float(el) == -1)
            keep.append(missing / total < threshold)

        tweets['filtered'] = keep
        return tweets
                        
    
    def get_maximum(self, mapped_tweets):
        """Return the largest mapped value of every tweet.

        Args:
            mapped_tweets: Iterable of per-tweet value lists.

        Returns:
            A list with one maximum per tweet. An empty tweet yields ``-1``,
            the same sentinel ``map_tweets`` uses for unknown words, instead
            of raising ``ValueError``.
        """
        return [max(tweet, default=-1) for tweet in mapped_tweets]
            
        
    def get_average(self, mapped_tweets):
        """Return the mean of the known values of every tweet.

        Values equal to ``-1`` (the "not found" sentinel) are ignored. The
        mean is rounded to two decimals.

        Args:
            mapped_tweets: Iterable of per-tweet value lists.

        Returns:
            A list with one average per tweet. A tweet without any known
            value (empty, or consisting only of ``-1``) yields ``-1`` rather
            than raising ``ZeroDivisionError``.
        """
        averages = []
        for tweet in mapped_tweets:
            known = [num for num in tweet if num != -1]
            if known:
                averages.append(round(sum(known) / len(known), 2))
            else:
                # No usable score: propagate the sentinel.
                averages.append(-1)

        return averages
    

class Analyzer:
    
    def get_length_statistics(self, tweets):
        """Count how many tweets there are for every tweet length.

        Args:
            tweets: Object exposing a ``mapped`` attribute that iterates over
                sized items (one per tweet).

        Returns:
            A ``Counter`` mapping a length to the number of tweets having it.
        """
        return Counter(map(len, tweets.mapped))
    
    
    def labels_distribution(self, aggregate, low_bound, high_bound):
        """Count how many scores fall into the low, medium and high bands.

        Bands are mutually exclusive:

        * ``score < low_bound``                 -> low
        * ``score > high_bound``                -> high
        * everything else (bounds included)     -> medium

        Args:
            aggregate: Iterable of scores (or a mapping of score to count);
                it is passed to ``Counter``.
            low_bound: Scores strictly below this are "low".
            high_bound: Scores strictly above this are "high".

        Returns:
            A ``(low, med, high)`` tuple of counts.
        """
        low = med = high = 0
        for score, count in Counter(aggregate).items():
            if score < low_bound:
                low += count
            elif score > high_bound:
                # ``elif`` matters: with a plain ``if`` every low score also
                # fell through to the ``else`` and was counted as medium.
                high += count
            else:
                med += count

        return low, med, high
    
    
        