# FILE WITH THE CREATED FUNCTIONS

In [1]:
!pip install nltk
!pip install unidecode
!pip install wordcloud
!pip install nameparser



In [2]:
#IMPORTS
import re
import nltk

nltk.download('punkt')

import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import operator
import string
import unidecode
import nltk.stem as stemmers
from nltk.corpus import stopwords
import math

# Download stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
from wordcloud import WordCloud, ImageColorGenerator

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Usuario\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#Function to print json objects in a more visual way
def jprint(obj):
    """Pretty-print a JSON-serialisable object to standard output.

    Keys keep their original order, nested levels are indented by four
    spaces and non-ASCII characters are written as-is instead of escaped.
    """
    print(json.dumps(obj, ensure_ascii=False, indent=4, sort_keys=False))

In [4]:
#FUNCTIONS TO CLEAN THE TEXT
def remove_accents(text):
    """Transliterate each whitespace-separated word of ``text`` with unidecode.

    The words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed as a side effect. Returns "" for None or empty input.
    """
    # Nothing to transliterate
    if not text:
        return ""

    words = text.split()
    return ' '.join(map(unidecode.unidecode, words))

def remove_punctuation_marks(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Returns "" for None or empty input.
    """
    # Nothing to strip
    if not text:
        return ""

    # Deletion-only translation table: no character is remapped
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

def text_to_lower_case(text):
    """Return ``text`` in lower case, or "" when it is None or empty."""
    return text.lower() if text else ""

def remove_emojis(text):
    """Drop every non-ASCII character from ``text``.

    This removes emojis, but also any other character outside the ASCII
    range (accented letters included). Returns "" for None or empty input.
    """
    # Nothing to filter
    if not text:
        return ""

    # Characters that cannot be encoded as ASCII are silently skipped
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

def remove_multiple_whitespaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed too, not stripped.
    Returns "" for None or empty input.
    """
    # Nothing to collapse
    if not text:
        return ""

    # Raw string: '\s' in a normal literal is an invalid escape sequence
    return re.sub(r'\s+', ' ', text)

def remove_text_marks(text):
    """Strip web-page markup remnants from text extracted from the api.

    Block-level tags (<p>, </p>, <ul>, <li>) and line-break/tab characters
    are replaced by a space so that the words on either side are not glued
    together. Inline tags (<u>, <em>), entity fragments, "www", "https" and
    apostrophes are deleted. Every remaining character that is neither a
    word character nor whitespace is replaced by a space.

    Returns the stripped result, or "" for None or empty input. Internal
    runs of spaces are not collapsed here.
    """
    # Nothing to clean
    if not text:
        return ""

    # (fragment, replacement) pairs, applied in this order
    replacements = (
        ('<p style="text-align:justify">', " "),
        ("<p>", " "),
        ("</p>", " "),
        ("<ul>", " "),
        ("<u>", ""),
        ("<li>", " "),
        ("&nbsp", ""),
        ("&ndash", ""),
        ("&rsquo;", ""),
        ("<em>", ""),
        ("</em>", ""),
        # Line breaks and tabs separate words: keep a space in their place
        ("\n", " "),
        ("\r", " "),
        ("\t", " "),
        ("www", ""),
        ("https", ""),
        # replace characters like it's by its
        ("'", ""),
    )
    for fragment, replacement in replacements:
        text = text.replace(fragment, replacement)

    # replace *, ?, ... by spaces
    text = re.sub(r'[^\w\s]', ' ', text)

    return text.strip()

def split_text_and_numbers(text):
    """Return ``text`` unchanged.

    NOTE(review): despite the name, no splitting of text and numbers is
    performed here; the function is currently a pass-through step.
    """
    return text

def remove_alone_numbers(text):
    """Delete every digit character from ``text``.

    All digits are removed, including those embedded in words
    ("abc123" becomes "abc"). Returns "" for None or empty input.
    """
    return re.sub(r"\d", "", text) if text else ""

def clean_text(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The steps are applied in a fixed order: lower-casing, markup removal,
    punctuation removal, accent removal, non-ASCII removal, the
    text/number split step, digit removal and whitespace collapsing.
    """
    # Order matters: each step receives the output of the previous one
    pipeline = (
        text_to_lower_case,
        remove_text_marks,
        remove_punctuation_marks,
        remove_accents,
        remove_emojis,
        split_text_and_numbers,
        remove_alone_numbers,
        remove_multiple_whitespaces,
    )
    for step in pipeline:
        text = step(text)

    return text

In [5]:
def convert_to_nltk_text(data, text_field):
    """Build a single ``nltk.Text`` from every entry of ``data[text_field]``.

    Each entry is split on single spaces and the resulting tokens are
    concatenated in row order.
    """
    # Flatten the per-row token lists into one token sequence
    all_tokens = [
        token
        for entry in data[text_field].values
        for token in entry.split(" ")
    ]

    return nltk.Text(all_tokens)

In [6]:
def get_tokens(text):
    """Return the tokens of ``text``.

    An ``nltk.Text`` yields its ``tokens`` attribute; anything else is
    treated as a string and split on single spaces.
    """
    if isinstance(text, nltk.Text):
        return text.tokens

    return text.split(" ")

def remove_stopwords(text, language):
    """Remove the stop words of ``language`` from ``text``.

    Parameters:
        text: an ``nltk.Text`` or a space-separated string.
        language: language name passed to ``stopwords.words``.

    Returns an ``nltk.Text`` when given one, otherwise the remaining tokens
    joined with single spaces.
    """
    # A set gives constant-time membership checks instead of scanning the list per token
    stopword_set = set(stopwords.words(language))

    # Keep only the tokens that are not stop words
    kept_tokens = [word for word in get_tokens(text) if word not in stopword_set]

    # Return the same kind of object that was received
    if isinstance(text, nltk.Text):
        return nltk.Text(kept_tokens)

    return " ".join(kept_tokens)

def stem_text(text, language):
    """Stem every token of ``text`` with a Snowball stemmer for ``language``.

    Returns an ``nltk.Text`` when given one, otherwise the stems joined with
    single spaces.
    """
    snowball = stemmers.SnowballStemmer(language)

    # Stem the tokens one by one, preserving their order
    stemmed_tokens = list(map(snowball.stem, get_tokens(text)))

    # Return the same kind of object that was received
    if isinstance(text, nltk.Text):
        return nltk.Text(stemmed_tokens)

    return " ".join(stemmed_tokens)

def standardize_text(text, language):
    """Remove the stop words from ``text`` and then stem what is left."""
    without_stopwords = remove_stopwords(text, language)
    return stem_text(without_stopwords, language)

In [7]:
def plot_text_length_distribution(dataset: pd.DataFrame,
                                  text_field: str):
    """Print the vocabulary size of a text column and plot its length distribution.

    Parameters:
        dataset: DataFrame holding the texts.
        text_field: name of the column with the (space-separated) texts.

    The left panel is a bar chart of how many texts have each length (in
    tokens); the right panel is a box plot of the per-text lengths.
    Missing values are ignored.
    """
    # Tokenise every text once; rows that are not text become NaN and are dropped
    token_lists = dataset[text_field].str.split(" ").dropna()

    # The vocabulary is the set of distinct non-empty tokens, not of distinct texts
    vocabulary = {token for tokens in token_lists for token in tokens if token}
    print("\nThe vocabulary is composed by {0} words \n".format(len(vocabulary)))

    # Length (in tokens) of each text, and how often each length occurs
    text_lengths = token_lists.str.len()
    lengths = text_lengths.value_counts()

    # Build the figures
    plt.figure(figsize=(16, 5))
    plt.subplot(121)

    # Plot the distribution
    plt.bar(x=lengths.index, height=lengths.values)

    # Assign the title
    plt.title("Distribution of the text length")
    plt.xlabel("Text length")
    plt.ylabel("Number of initiatives")

    # Set the second plot
    plt.subplot(122)

    # The box plot summarises the per-text lengths, not the distinct length values
    plt.boxplot(x=text_lengths.values, showmeans=True)
    plt.title("Box plot of the text length")
    plt.ylabel("Text length")

    # Show
    plt.show()

In [8]:
def get_bagofwords(data, attribute, language):
    """Count word and stem frequencies over ``data[attribute]``.

    Parameters:
        data: mapping/DataFrame whose ``attribute`` entry is an iterable of texts.
        attribute: key/column holding the texts.
        language: language used for the stop-word list and the Snowball stemmer.

    Returns:
        (bag_of_words, stemmized_bag): two dicts mapping each cleaned,
        non-stop word (respectively its stem) to its number of occurrences.

    Entries that are not strings (None, NaN, ...) are skipped.
    """
    stopword_set = set(stopwords.words(language))

    # Built once: the stemmer does not depend on the word being processed
    stemmer = stemmers.SnowballStemmer(language)

    bag_of_words = {}
    stemmized_bag = {}
    for field in data[attribute]:
        # to avoid errors if field is None or a missing value
        if not isinstance(field, str):
            continue

        # Clean the whole field before tokenizing, so that markup containing
        # spaces is still seen as one piece by the cleaning functions
        words = [word for word in clean_text(field).split() if word not in stopword_set]

        # for each word (and its stem) the frequency in the corpus
        for word in words:
            bag_of_words[word] = bag_of_words.get(word, 0) + 1

            stem = stemmer.stem(word)
            stemmized_bag[stem] = stemmized_bag.get(stem, 0) + 1

    return bag_of_words, stemmized_bag

In [9]:
def plot_wordcloud(title, dic_):
    """Draw a word cloud from a word -> frequency mapping and show it.

    Parameters:
        title: title placed above the cloud.
        dic_: frequencies passed to ``WordCloud.generate_from_frequencies``.
    """
    figure, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 5))

    # White 1600x800 canvas filled from the given frequencies
    cloud = WordCloud(
        width=1600,
        height=800,
        background_color="white",
    ).generate_from_frequencies(dic_)

    axes.axis("off")
    axes.imshow(cloud, interpolation='bilinear')
    axes.set_title(title)

    # Leave some room at the top for the title
    plt.tight_layout()
    figure.subplots_adjust(top=0.8)
    plt.show()

In [10]:
#Function to create a barplot
def plot_barplot(bag_of_words, top_n=10):
    """Draw a horizontal bar plot of the most frequent words.

    Parameters:
        bag_of_words: dict mapping each word to its count.
        top_n: how many of the most frequent words to draw (default 10).

    An empty bag produces an empty, labelled chart instead of raising.
    """
    # Highest counts first; ties keep their original order
    ranked = sorted(bag_of_words.items(), key=operator.itemgetter(1), reverse=True)
    top_words = ranked[:top_n]

    # Built separately so that an empty selection yields two empty lists
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    plt.barh(words, counts)
    plt.xlabel("Count")
    plt.ylabel("Words")
    plt.title("Bar plot of most frequent words")

In [11]:
def tokenize(data):
    """Split ``data`` into lower-cased word tokens without English stop words.

    Tokens are the maximal runs of word characters found in ``data``.
    """
    # English stop words
    stop_words = stopwords.words("english")

    # Lower-case every match first, then filter out the stop words
    lowered_words = [match.lower() for match in re.findall(r'\w+', data)]
    return [word for word in lowered_words if word not in stop_words]

In [12]:
class TFIDF:
    """Per-document TF-IDF table built from a raw text.

    Attributes:
        data: the raw text given to the constructor.
        text: list of word tokens found in ``data``.
        text_df: DataFrame with one row per distinct word; holds the columns
            'word' and 'count', plus 'tf', 'idf' and 'tfidf' once the
            corresponding methods have been called.
    """

    def __init__(self, data):
        self.data = data
        self.text = self.tokenize(data)
        self.text_df = self.load_df()

    def tokenize(self, data):
        """Return the runs of word characters in ``data`` (no lower-casing, no stop-word removal)."""
        return re.findall(r'\w+', data)

    def load_df(self):
        """Turn the token list into a DataFrame of distinct words and their counts."""
        words = pd.DataFrame(self.text, columns=['word'])
        occurrences = words.groupby('word').size()
        return occurrences.reset_index(name='count')

    def tf(self):
        """Add the 'tf' column: word count divided by the number of tokens."""
        total_tokens = len(self.text)
        self.text_df['tf'] = self.text_df['count'] / total_tokens

    def idf(self, all_text, document_size):
        """Add the 'idf' column, computed per word with ``count_idf``."""
        def word_idf(word):
            return self.count_idf(word, all_text, document_size)

        self.text_df['idf'] = self.text_df['word'].apply(word_idf)

    def count_idf(self, word, all_text, document_size):
        """Return log(document_size / (n + 1)), n being the entries of ``all_text`` containing ``word``.

        Containment is checked with ``in``.
        NOTE(review): if the entries of ``all_text`` are raw strings this is a
        substring match; presumably they are token lists -- confirm with callers.
        """
        containing = sum(1 for text in all_text if word in text)
        return math.log(document_size / (containing + 1))

    def tf_idf(self):
        """Add the 'tfidf' column (tf * idf) and round the table to 3 decimals.

        Requires the 'tf' and 'idf' columns, i.e. ``tf()`` and ``idf()`` must
        have been called first.
        """
        self.text_df['tfidf'] = self.text_df['tf'] * self.text_df['idf']
        self.text_df = self.text_df.round(3)

In [13]:
def plot_countplot(data, variable, name, percentage):
    """Bar plot of the value counts of ``data[variable]`` with a label on each bar.

    Parameters:
        data: DataFrame holding the column to count.
        variable: column name; also used as x-axis label and in the title.
        name: text appended to "Count of " in the y-axis label and the title.
        percentage: when true, bars are labelled with their share of the
            total (in %); otherwise with their height.
    """
    sns.set(font_scale=1.4)

    # Computed once and reused for both the plot and the total
    counts = data[variable].value_counts()
    total = counts.sum()
    ax = counts.plot(kind="bar", figsize=(4, 4), rot=0, color=list('rgbkymc'))

    for patch in ax.patches:
        height = patch.get_height()
        # Built-in round() works for plain floats as well as numpy scalars
        if percentage:
            label = str(round(height / total * 100, 2)) + '%'
        else:
            label = str(round(height, 2))
        ax.text(patch.get_x(), height, label)

    ax.tick_params(axis='x', labelrotation=90)
    plt.xlabel(variable, labelpad=14)
    plt.ylabel("Count of " + name, labelpad=14)
    plt.title("Count of " + name + " " + variable, y=1.02)
    # The call parentheses were missing, so the figure was never shown explicitly
    plt.show()


In [14]:
def plot_countplot3(data, var1, var2, name, category):
    """Bar plot of ``var2`` against ``var1`` with the bar height written on each bar.

    Parameters:
        data: DataFrame to plot.
        var1: column used for the x axis.
        var2: column used for the bar heights.
        name: x-axis label; also used in the title.
        category: text appended to "Count of " in the y-axis label and the title.
    """
    sns.set(font_scale=1.4)
    ax = data.plot(var1, var2, kind='bar', figsize=(5, 5), color=list('rgbkymc'))

    for patch in ax.patches:
        height = patch.get_height()
        # Built-in round() works for plain floats as well as numpy scalars
        ax.text(patch.get_x(), height, str(round(height, 2)))

    ax.tick_params(axis='x', labelrotation=90)
    plt.xlabel(name, labelpad=14)
    plt.ylabel("Count of " + category, labelpad=14)
    plt.title("Count of " + category + " " + name, y=1.02)
    # The call parentheses were missing, so the figure was never shown explicitly
    plt.show()

In [15]:
#Defining a function that will be used to plot a variable distinguishing the values of another one.
def plot_by_variable(data, var1, var_hue):
    """Count plot of ``var1`` with bars split by the values of ``var_hue``.

    Returns the seaborn grid object produced by ``sns.catplot``.
    """
    grid = sns.catplot(
        data=data,
        x=var1,
        hue=var_hue,
        kind="count",
        palette="cubehelix",
        height=6,
        aspect=3,
    )

    # Tilt the tick labels, then apply the title template
    rotated = grid.set_xticklabels(rotation=45)
    rotated.set_titles("{col_name} {col_var}")
    return grid

In [16]:
def func(pct, allvals):
    """Format a pie-slice label: percentage on one line, absolute count below.

    Parameters:
        pct: share of the slice, in percent.
        allvals: all the values of the pie; their sum is the total.
    """
    total = np.sum(allvals)
    # Recover the absolute count from the percentage
    absolute = int(round(pct / 100. * total))
    return "{:.1f}%\n({:d})".format(pct, absolute)

def pie_chart(df, science, count):
    """Two-slice pie chart: ``count`` rows labelled ``science`` versus the rest of ``df``.

    The "Rest" slice has ``len(df) - count`` rows and is drawn slightly
    detached from the pie.
    """
    slice_sizes = [count, len(df) - count]
    slice_labels = (science, "Rest")

    figure, axes = plt.subplots()
    axes.pie(
        slice_sizes,
        labels=slice_labels,
        explode=(0, 0.1),
        autopct=lambda pct: func(pct, slice_sizes),
        shadow=True,
        startangle=90,
    )

    # Equal aspect ratio ensures that pie is drawn as a circle.
    axes.axis('equal')

    plt.show()
    

In [17]:
def plot_pie(data, variable, title):
    """Pie chart of the value counts of ``data[variable]``.

    Each slice is labelled with its value and annotated (through ``func``)
    with its percentage and absolute count.
    """
    v_counts = data[variable].value_counts()

    # A single 7x7 figure; no extra empty figure is opened beforehand
    plt.figure(figsize=(7, 7))
    plt.title(title)

    plt.pie(v_counts, labels=v_counts.index, autopct=lambda pct: func(pct, v_counts))

In [18]:
#with colors
def funcc(pct, allvals):
    """Format a pie-slice label on one line: percentage, then absolute count in brackets.

    Parameters:
        pct: share of the slice, in percent.
        allvals: all the values of the pie; their sum is the total.
    """
    total = np.sum(allvals)
    # Recover the absolute count from the percentage
    absolute = int(round(pct / 100. * total))
    return "{:.1f}% ({:d})".format(pct, absolute)
def plot_piee(data, variable, title):
    """Pie chart of the value counts of ``data[variable]`` with a fixed colour list.

    Each slice is labelled with its value and annotated (through ``funcc``)
    with its percentage and absolute count.
    """
    v_counts = data[variable].value_counts()

    # A single 7x7 figure; no extra empty figure is opened beforehand
    plt.figure(figsize=(7, 7))
    plt.title(title)

    mycolors = ["#7FFFD4", "#FFE4C4", "#5F9EA0", "#D2691E", "#8FBC8F"]
    plt.pie(v_counts, labels=v_counts.index, colors=mycolors, autopct=lambda pct: funcc(pct, v_counts))
    plt.show()

In [19]:
def get_people(text):
    """Extract runs of two or more consecutive proper nouns from ``text``.

    The text is tokenized and part-of-speech tagged with nltk; every run of
    at least two adjacent NNP-tagged tokens is returned as one
    space-separated string, in order of appearance. Such runs are treated
    as candidate person names.

    NOTE(review): ``nltk.pos_tag`` presumably needs tagger resources to be
    downloaded beforehand -- confirm they are available.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    # Chunk grammar: two or more NNP tokens in a row
    chunk_parser = nltk.RegexpParser(r' Chunk0: {<NNP>+<NNP>+}')
    parsed_tree = chunk_parser.parse(tagged_tokens)

    # Exact label comparison: a substring test would also accept labels such as "" or "C"
    chunks = parsed_tree.subtrees(filter=lambda t: t.label() == "Chunk0")

    # Each leaf is a (token, tag) pair; only the token is kept
    return [
        " ".join(str(leaf[0]) for leaf in chunk.leaves())
        for chunk in chunks
    ]