### Script to Extract Plugin Info from WordPress

In [1]:
from bs4 import BeautifulSoup
import re
import requests
import numpy as np
import pandas as pd
from collections import OrderedDict, defaultdict

In [2]:
## Plan:
## 1. Loop over listing pages 2-99
## 2. On each page, loop over all plugins
## 3. Extract each plugin's data into its own dictionary
## 4. Turn the list of dictionaries into a DataFrame

In [3]:
def get_star_count(plugin_soup):
    '''Return the star rating encoded in a plugin's markup.

    The markup is converted to text; every occurrence of
    ``dashicons-star-filled`` counts as one star and every occurrence of
    ``dashicons-star-half`` counts as half a star. The result is a float.
    '''
    markup = str(plugin_soup)
    filled_stars = markup.count('dashicons-star-filled')
    half_stars = markup.count('dashicons-star-half')
    return filled_stars + half_stars * 0.5

In [4]:
def get_tested_with_info(plugin_soup):
    '''Return the plugin's "tested with" text, or ``'n/a'`` when it is missing.

    Looks up the first ``<span class="tested-with">`` inside ``plugin_soup``.
    If there is none, the string ``'n/a'`` is returned. Otherwise the span's
    text is returned with its first 6 and last 3 characters removed.
    '''
    # Look the span up once and reuse the result for both the check and the text.
    tested_with = plugin_soup.find('span', {'class': 'tested-with'})
    if tested_with is None:
        return 'n/a'
    # The fixed slice trims the characters surrounding the label
    # (presumably layout whitespace -- verify against the live markup).
    return tested_with.get_text()[6:-3]
    

In [5]:
# Accumulator for the scraped records: one dictionary per plugin.
plugin_infos = list()

In [6]:
%time
for page in range(2, 100):
    result = requests.get(
        "https://de.wordpress.org/plugins/browse/popular/page/" + str(page) + "/")
    c = result.content
    soup = BeautifulSoup(c, 'html.parser')
    plugins = soup.find_all('article')

    for plugin in plugins:
        plugin_info = {}
        plugin_info['author'] = plugin.find(
            'span', {'class': 'plugin-author'}).get_text()[2:-2]
        plugin_info['plugin_name'] = plugin.find(
            'header', {'class': 'entry-header'}).get_text()[1:-1]
        plugin_info['rating_count'] = plugin.find(
            'div', {'class': 'plugin-rating'}).get_text()[1:-1].split(" ")[0]
        plugin_info['rating_stars'] = get_star_count(plugin)
        plugin_info['description'] = plugin.find(
            'div', {'class': 'entry-excerpt'}).get_text()[1:-1]
        plugin_info['active_installs'] = plugin.find(
            'span', {'class': 'active-installs'}).get_text()[5:-2]
        plugin_info['tested_with'] = get_tested_with_info(plugin)
        plugin_info['last_updated'] = plugin.find(
            'span', {'class': 'last-updated'}).get_text()[6:-2]
        plugin_infos.append(plugin_info)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.05 µs


In [7]:
# Build the table in one step from the list of per-plugin dictionaries.
plugins_data = pd.DataFrame(data=plugin_infos)

In [8]:
# Put the columns into a fixed order, starting with the plugin name.
column_order = [
    'plugin_name',
    'author',
    'description',
    'rating_count',
    'rating_stars',
    'active_installs',
    'tested_with',
    'last_updated',
]
plugins_data = plugins_data[column_order]

In [12]:
# Preview the first five rows of the table.
plugins_data.head(n=5)

Unnamed: 0,plugin_name,author,description,rating_count,rating_stars,active_installs,tested_with,last_updated
0,UpdraftPlus WordPress Backup Plugin,"UpdraftPlus.Com, DavidAnderson",Backup und Wiederherstellung leicht gemacht. V...,2.83,5.0,1+ Millionen aktive Installationen,Getestet mit 4.9.6,Zuletzt aktualisiert vor 4 Wochen
1,Regenerate Thumbnails,Alex Mills (Viper007Bond),Regenerate the thumbnails for one or more of y...,314.0,4.5,1+ Millionen aktive Installationen,Getestet mit 4.9.6,Zuletzt aktualisiert vor 4 Monaten
2,Page Builder by SiteOrigin,SiteOrigin,"Erstelle responsive Seitenlayouts mit Widgets,...",934.0,5.0,1+ Millionen aktive Installationen,Getestet mit 4.9.6,Zuletzt aktualisiert vor 2 Wochen
3,Advanced Custom Fields,Elliot Condon,"Erweitere WordPress um vielseitige, profession...",1.025,5.0,1+ Millionen aktive Installationen,Getestet mit 4.9.9,Zuletzt aktualisiert vor 3 Monaten
4,Google Analytics Dashboard for WP (GADWP),ExactMetrics team,Connects Google Analytics with your WordPress ...,359.0,4.5,1+ Millionen aktive Installationen,Getestet mit 4.9.6,Zuletzt aktualisiert vor 2 Wochen


In [9]:
# Write the table to an Excel workbook in the current working directory.
plugins_data.to_excel(excel_writer="Info_on_all_wordpress_plugins.xlsx")