Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
137 lines (128 sloc) 6.82 KB
<?xml version="1.0" encoding="utf-8" ?>
<Suite Title="On-Page Scrapers" Id="OnPageScrapers" Category="Scraping" SourceUrl="https://github.com/nielsbosma/SeoTools-for-Excel-Connectors/blob/master/_Onpage.xml" HelpUrl="http://seotoolsforexcel.com/connectors-onpage/" HelpText="Documentation">
<!--
  SocialMetaTags
  Fetches the page at the given URL and, for each XPath below, reads the
  "content" attribute of the matching <meta> element.

  Groups of fields: Open Graph (og:*), article:*, Facebook (fb:*),
  Twitter Cards (twitter:*) and schema.org itemprop values.

  Notes for maintainers:
  * "@@" inside Expr is presumably the escaped form of a literal "@",
    since a single "@" starts a template expression (see Fetch Url).
  * Twitter Card tags are documented by Twitter with the "name" attribute
    (meta name="twitter:card"), while many sites emit them with "property"
    like Open Graph tags. The twitter:* expressions accept either form.
  * The Ids are left exactly as they were, including the inconsistent
    Schema.orgName / Schema.orgDescription / SchemaOrgImage spelling,
    because they are presumably what existing users reference as output
    fields. TODO(review): confirm before renaming.
-->
<RestConnector Id="SocialMetaTags" Title="Social Meta Tags" AvailableInSpider="true">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@(Utils.UrlProperty(Model.Url).Absolute)"/>
  <Parse>
    <!-- Open Graph -->
    <XPath Id="OGTitle" Expr="//meta[@@property='og:title']" Attribute="content"/>
    <XPath Id="OGType" Expr="//meta[@@property='og:type']" Attribute="content"/>
    <XPath Id="OGUrl" Expr="//meta[@@property='og:url']" Attribute="content"/>
    <XPath Id="OGImage" Expr="//meta[@@property='og:image']" Attribute="content"/>
    <XPath Id="OGDescription" Expr="//meta[@@property='og:description']" Attribute="content"/>
    <XPath Id="OGSiteName" Expr="//meta[@@property='og:site_name']" Attribute="content"/>
    <XPath Id="OGPriceAmount" Expr="//meta[@@property='og:price:amount']" Attribute="content" Converter="Double"/>
    <XPath Id="OGPriceCurrency" Expr="//meta[@@property='og:price:currency']" Attribute="content"/>
    <!-- Open Graph article namespace -->
    <XPath Id="ArticlePublishedTime" Expr="//meta[@@property='article:published_time']" Attribute="content" Converter="DateTime"/>
    <XPath Id="ArticleModifiedTime" Expr="//meta[@@property='article:modified_time']" Attribute="content" Converter="DateTime"/>
    <XPath Id="ArticleSection" Expr="//meta[@@property='article:section']" Attribute="content"/>
    <XPath Id="ArticleTag" Expr="//meta[@@property='article:tag']" Attribute="content"/>
    <!-- Facebook -->
    <XPath Id="FBAdmins" Expr="//meta[@@property='fb:admins']" Attribute="content"/>
    <XPath Id="FBPageId" Expr="//meta[@@property='fb:page_id']" Attribute="content"/>
    <!-- Twitter Cards: match the documented name="..." form as well as property="..." -->
    <XPath Id="TwitterCard" Expr="//meta[@@name='twitter:card' or @@property='twitter:card']" Attribute="content"/>
    <XPath Id="TwitterSite" Expr="//meta[@@name='twitter:site' or @@property='twitter:site']" Attribute="content"/>
    <XPath Id="TwitterSiteId" Expr="//meta[@@name='twitter:site:id' or @@property='twitter:site:id']" Attribute="content"/>
    <XPath Id="TwitterTitle" Expr="//meta[@@name='twitter:title' or @@property='twitter:title']" Attribute="content"/>
    <XPath Id="TwitterDescription" Expr="//meta[@@name='twitter:description' or @@property='twitter:description']" Attribute="content"/>
    <XPath Id="TwitterCreator" Expr="//meta[@@name='twitter:creator' or @@property='twitter:creator']" Attribute="content"/>
    <XPath Id="TwitterCreatorId" Expr="//meta[@@name='twitter:creator:id' or @@property='twitter:creator:id']" Attribute="content"/>
    <XPath Id="TwitterImage" Expr="//meta[@@name='twitter:image' or @@property='twitter:image']" Attribute="content"/>
    <XPath Id="TwitterImageWidth" Expr="//meta[@@name='twitter:image:width' or @@property='twitter:image:width']" Attribute="content" Converter="Int"/>
    <XPath Id="TwitterImageHeight" Expr="//meta[@@name='twitter:image:height' or @@property='twitter:image:height']" Attribute="content" Converter="Int"/>
    <!-- schema.org microdata expressed as meta itemprop -->
    <XPath Id="Schema.orgName" Expr="//meta[@@itemprop='name']" Attribute="content"/>
    <XPath Id="Schema.orgDescription" Expr="//meta[@@itemprop='description']" Attribute="content"/>
    <XPath Id="SchemaOrgImage" Expr="//meta[@@itemprop='image']" Attribute="content"/>
  </Parse>
</RestConnector>
<!--
  TwitterAccount
  Fetches the page at the given URL and extracts Twitter account names from
  links of the form //twitter.com/NAME (optionally with a "www." host
  prefix and/or the legacy "#!/" path prefix).

  How the outer expression works:
  * Group 1 is "twitter.com/NAME" and group 2 (the one returned) is NAME.
  * The negative lookahead right after "twitter.com/" skips the
    share / intent/tweet / signup endpoints, which are not accounts.
  * The trailing negative lookahead rejects a match when the same link
    occurs again later in the document, so each account is reported once
    (its last occurrence wins).

  Fixes compared with the previous expression:
  * The dot in the host name is escaped, so it no longer matches any
    character.
  * "www.twitter.com" links are recognised. The "www." part sits outside
    group 1 so that www and non-www links to the same account are still
    treated as duplicates.
  * The name must end at a non-word character, both in the match itself
    and in the duplicate check. Previously an account was silently dropped
    when a later link pointed at a longer name starting with the same
    characters (for example "foo" followed later by "foobar").

  The nested Regex (".*", group 0) is kept unchanged; it presumably passes
  each captured value through as the result. TODO(review): confirm.
-->
<RestConnector Id="TwitterAccount" Title="Twitter Accounts" AvailableInSpider="true" HelpText="Find twitter.com/accounts on a webpage.">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@Model.Url"/>
  <Parse>
    <Regex Expr="(?i)//(?:www\.)?(twitter\.com/(?!share\?|intent\/tweet|share|signup)(?:#!/)?(\w+))(?!\w)(?![\s\S]*//(?:www\.)?\1(?!\w))" Group="2" DefaultValue="">
      <Regex Expr=".*" Group="0"/>
    </Regex>
  </Parse>
</RestConnector>
<!--
  InstagramAccount
  Fetches the page at the given URL and extracts Instagram account names
  from links of the form //instagram.com/NAME or //www.instagram.com/NAME.

  How the outer expression works:
  * Group 1 is "instagram.com/NAME" and group 2 (the one returned) is NAME.
  * The negative lookahead right after "instagram.com/" skips paths that
    are content rather than profiles: p/ (posts, as before) plus reel/,
    reels/, explore/, stories/ and tv/.
  * The trailing negative lookahead rejects a match when the same link
    occurs again later in the document, so each account is reported once.

  Fixes compared with the previous expression:
  * "www.instagram.com" links are recognised; this is the form Instagram
    itself uses, and it was never matched before. The "www." part sits
    outside group 1 so www and non-www links still de-duplicate.
  * Matching is case-insensitive, consistent with the Twitter connector.
  * The dot in the host name is escaped.
  * Names may contain periods between word characters (for example
    "john.doe"); previously such a name was cut at the first period.
  * The name must end at a real boundary, both in the match itself and in
    the duplicate check, so "foo" is no longer dropped when "foobar" or
    "foo.bar" is linked later on the page.

  The nested Regex (".*", group 0) is kept unchanged; it presumably passes
  each captured value through as the result. TODO(review): confirm.
-->
<RestConnector Id="InstagramAccount" Title="Instagram Accounts" AvailableInSpider="true" HelpText="Find instagram.com/accounts on a webpage.">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@Model.Url"/>
  <Parse>
    <Regex Expr="(?i)//(?:www\.)?(instagram\.com/(?!(?:p|reels?|explore|stories|tv)\/)(?:#!/)?(\w+(?:\.\w+)*))(?!\.?\w)(?![\s\S]*//(?:www\.)?\1(?!\.?\w))" Group="2" DefaultValue="">
      <Regex Expr=".*" Group="0"/>
    </Regex>
  </Parse>
</RestConnector>
<!--
  LinkedInAccount
  Fetches the page at the given URL and extracts LinkedIn profile slugs
  from links containing linkedin.com/in/SLUG.

  How the outer expression works:
  * Group 1 is "linkedin.com/in/SLUG" and group 2 (the one returned) is
    SLUG.
  * The trailing negative lookahead rejects a match when the same link
    occurs again later in the document, so each profile is reported once.

  Fixes compared with the previous expression:
  * Slugs may contain hyphens and percent-encoded characters. Previously
    "john-doe-123" was reported as "john".
  * Matching is case-insensitive, consistent with the Twitter connector.
  * The dot in the host name is escaped.
  * The slug must end at a real boundary, both in the match itself and in
    the duplicate check, so a slug is no longer dropped when a longer slug
    starting with the same characters is linked later on the page.

  The nested Regex (".*", group 0) is kept unchanged; it presumably passes
  each captured value through as the result. TODO(review): confirm.
-->
<RestConnector Id="LinkedInAccount" Title="LinkedIn Accounts" AvailableInSpider="true" HelpText="Find linkedin.com/in/accounts on a webpage.">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@Model.Url"/>
  <Parse>
    <Regex Expr="(?i)(linkedin\.com\/in\/(?:#!/)?([\w%-]+))(?![\w%-])(?![\s\S]*\1(?![\w%-]))" Group="2" DefaultValue="">
      <Regex Expr=".*" Group="0"/>
    </Regex>
  </Parse>
</RestConnector>
<!--
  Email
  Fetches the page at the given URL and extracts e-mail addresses from the
  response with a regular expression.

  How the outer expression works:
  * "@@" is presumably the escaped form of a literal "@", since a single
    "@" starts a template expression (see Fetch Url).
  * Group 1 (the one returned) is the whole address.
  * The trailing negative lookahead rejects a match when the same address
    text occurs again later in the document, so each address is reported
    once.

  Fix compared with the previous expression:
  * The top-level domain may now be 2 to 63 letters long (63 is the DNS
    label limit). The former upper bound of 6 cut addresses on longer
    TLDs short, for example "name@studio.photography" was reported as
    "name@studio.photog".

  The request interval is explicitly set to zero for requests to the same
  host (RandomFrom = RandomTo = 0).

  The nested Regex (".*", group 0) is kept unchanged; it presumably passes
  each captured value through as the result. TODO(review): confirm.
-->
<RestConnector Id="Email" Title="Email" AvailableInSpider="true">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true" Debug.DefaultValue="https://tessin.se/kontakt/"/>
  </Parameters>
  <Fetch Url="@(Utils.UrlProperty(Model.Url).Absolute)">
    <HttpSettings>
      <IntervalBetweenRequests RandomFrom="0" RandomTo="0" IfSame="Host"/>
    </HttpSettings>
  </Fetch>
  <Parse>
    <Regex Expr="(([A-Z0-9._+-]+@@[A-Z0-9.-]+\.[A-Z]{2,63}))(?![\s\S]*\1)" Group="1" IgnoreCase="true" DefaultValue="">
      <Regex Expr=".*" Group="0"/>
    </Regex>
  </Parse>
</RestConnector>
<!--
  Emails.CSV
  Fetches the page at the given URL and returns every distinct e-mail
  address found in the response body as one comma-separated string
  (output field "All").

  The extraction runs in an embedded C# template block: it applies a
  case-insensitive regular expression to Model.FetchedResult.Body and
  joins the matched values with ",". The trailing negative lookahead in
  the pattern rejects a match when the same address text occurs again
  later in the body, so each address appears once.

  Fix compared with the previous pattern:
  * The top-level domain may now be 2 to 63 letters long (63 is the DNS
    label limit). The former upper bound of 6 cut addresses on longer
    TLDs short.

  The CDATA content is deliberately kept flush-left so that the template
  text is unchanged apart from the pattern and the added code comments.
-->
<RestConnector Id="Emails.CSV" Title="Emails CSV" AvailableInSpider="true" HelpText="Returns all emails separated by commas">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true" Debug.DefaultValue="https://tessin.se/kontakt/"/>
  </Parameters>
  <Fetch Url="@(Utils.UrlProperty(Model.Url).Absolute)">
  </Fetch>
  <Parse>
    <Compute Id="All" Title="All (comma-separated)" DefaultValue="">
      <Compute.Expr>
<![CDATA[
@{
// Address pattern; the lookahead keeps only the last occurrence of each address.
string search = @"(([A-Z0-9._+-]+@[A-Z0-9.-]+\.[A-Z]{2,63}))(?![\s\S]*\1)";
MatchCollection matches = Regex.Matches(Model.FetchedResult.Body, search, RegexOptions.IgnoreCase);
// Groups[0] is the whole match, i.e. the full address.
string output = string.Join(",", matches.Cast<Match>().Select(m => m.Groups[0].Value));
}
@output
]]>
</Compute.Expr>
    </Compute>
  </Parse>
</RestConnector>
<!--
  GoogleAnalyticsId
  Fetches the page at the given URL and searches the response for an
  identifier of the form UA-dddd-d: the literal "UA-", 4 to 10 digits, a
  hyphen, then 1 to 4 digits, delimited by word boundaries. Matching is
  case-insensitive and an empty string is the default when nothing is
  found.

  NOTE(review): only the "UA-" identifier format is recognised; other
  Google tag identifier formats are not matched by this expression.
-->
<RestConnector Id="GoogleAnalyticsId"
               Title="Google Analytics Id"
               AvailableInSpider="true">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@(Utils.UrlProperty(Model.Url).Absolute)"/>
  <Parse>
    <Regex Expr="(\bUA-\d{4,10}-\d{1,4}\b)"
           IgnoreCase="true"
           DefaultValue=""/>
  </Parse>
</RestConnector>
<!--
  GoogleAdSenseId
  Fetches the page at the given URL and extracts the AdSense publisher id
  (group 1 of whichever expression matches).

  The expressions sit inside a Try element; presumably they are attempted
  in order and the first one that yields a value wins. TODO(review):
  confirm against the connector documentation.

  Patterns, in order:
  1. data-ad-client="..." attribute on an ad unit element.
  2. google_ad_client = "..." assignment, or google_ad_client: "..." inside
     an object literal passed to adsbygoogle.push.
  3. client=ca-pub-... query parameter on the adsbygoogle.js script URL.

  Changes compared with the previous version:
  * Fetch Url uses the explicit @( ... ) expression form, consistent with
    the other connectors in this suite that call Utils.UrlProperty.
  * Patterns 1 and 2 accept single as well as double quotes, and pattern 2
    accepts ":" as well as "=". Every input that matched before still
    matches with the same captured value.
  * Pattern 3 is new and is only reached when the first two find nothing.
-->
<RestConnector Id="GoogleAdSenseId" Title="Google AdSense Id" AvailableInSpider="true">
  <Parameters>
    <Text Id="Url" Title="URL" Required="true"/>
  </Parameters>
  <Fetch Url="@(Utils.UrlProperty(Model.Url).Absolute)"/>
  <Parse>
    <Try>
      <Regex Expr="data-ad-client\s*=\s*[&quot;']([^&quot;']*)[&quot;']" IgnoreCase="true"/>
      <Regex Expr="google_ad_client\s*[:=]\s*[&quot;']([^&quot;']*)[&quot;']" IgnoreCase="true"/>
      <Regex Expr="adsbygoogle\.js\?client=(ca-pub-\d+)" IgnoreCase="true"/>
    </Try>
  </Parse>
</RestConnector>
</Suite>
You can’t perform that action at this time.