# Political candidate wordcloud - batch version

This notebook reads a CSV file listing candidate website URLs, scrapes the text from each page, and generates one wordcloud image per candidate from that text.

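The file "url_list.csv" should have URLs in the first column, a language code in the second column (either 'en', 'fr', or 'all'; case does not matter), and a file name for saving the image in the third column (without an extension; ".png" is added automatically). The first row of the file is read as a header row, so the first URL belongs on the second row. Rows that share the same file name and language are combined into a single wordcloud.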

You can upload the file using the file browser in the left panel.

In [1]:
# ENTER THE FILENAME OF THE CSV FILE THAT LISTS THE URLS
# Expected column order: url in the 1st column, language code in the 2nd,
# and the save name for the output image in the 3rd.
filename = "url_list.csv"

In [2]:
# load packages
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import requests
import string
import pandas as pd

In [3]:
from get_words import get_words

In [4]:
# Read the url list into a DataFrame.
# NOTE: read_csv is called with its defaults, so the first row of the file is
# used as the header row rather than as data.
data = pd.read_csv(filename)

In [5]:
# set columns to expected names (assigned by position: url, language, save name)
data.columns = ['url', 'language', 'save_name']
# normalise the language codes: trim stray whitespace (e.g. rows written as
# "url, en, name" would otherwise give " en") and set everything to lowercase
data['language'] = data['language'].str.strip().str.lower()

In [6]:
# get stopwords
# a list of french stopwords (one word per line)
url = "https://github.com/stopwords-iso/stopwords-fr/raw/master/stopwords-fr.txt"
# a timeout stops the notebook hanging forever if the server does not answer
stopwords_response = requests.get(url, timeout=30)
# fail loudly on an HTTP error instead of treating an error page as stopwords
stopwords_response.raise_for_status()
# splitlines() copes with both \n and \r\n endings; blank entries (such as the
# one produced by the trailing newline) are dropped
stopwords_fr = {
    line.strip()
    for line in stopwords_response.content.decode('utf-8').splitlines()
    if line.strip()
}
stopwords_en = set(STOPWORDS)

In [None]:
def get_stopwords(language, stopwords_en, stopwords_fr):
    """Return the stopword set to use for a candidate's language code.

    Parameters
    ----------
    language : str
        Language code from the url list ('en', 'fr', or 'all').
    stopwords_en : set
        English stopwords.
    stopwords_fr : set
        French stopwords.

    Returns
    -------
    set
        The English set for 'en', the French set for 'fr', and the union of
        both for 'all' (or any other code).
    """
    if language == 'en':
        return set(stopwords_en)
    if language == 'fr':
        return set(stopwords_fr)
    return set(stopwords_en) | set(stopwords_fr)


# one wordcloud per (save_name, language) pair; each group holds every url
# listed for that candidate
for (save_name, language), candidate in data.groupby(['save_name', 'language']):
    print(save_name)

    # combine the text scraped from each url for the same candidate (if applicable)
    page_texts = []
    for _index, row in candidate.iterrows():
        # get_words returns three values; only the first (the text) is needed here
        words, _wordlist, _full_list = get_words(row['url'])
        page_texts.append(words)
    all_words = " ".join(page_texts)

    # WordCloud cannot be generated from empty text; skip this candidate
    # instead of aborting the whole batch
    if not all_words.strip():
        print("no text found for %s - skipping" % save_name)
        continue

    stopwords = get_stopwords(language, stopwords_en, stopwords_fr)

    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(all_words)

    # plot the WordCloud image
    fig, ax = plt.subplots(figsize=(8, 8), facecolor=None)
    ax.imshow(wordcloud)
    ax.axis("off")
    fig.tight_layout(pad=0)

    # save before show(), then close the figure so a long batch does not
    # accumulate open figures
    fig.savefig("%s.png" % save_name, dpi=300)
    plt.show()
    plt.close(fig)