## Getting Top Results from Google Search
#### Installing libraries


In [1]:
# Installation scraping, cleaning and text analysis only

import re
import pandas as pd
import numpy as np
import trafilatura
import pprint



In [2]:
# Installation tensorflow + transformers + pipelines
# You need this to summarize the SERP and to run question-answering on the extracted corpus of text 

from transformers import pipeline

### Shooting the query

Here are the parameters that we can use:

* **query** : query string that we want to search for.
* **tld** : tld stands for top level domain which means we want to search our * result on google.com or google.in or some other domain.
* **lang** : lang stands for language.
* **num** : Number of results we want.
* **start** : First result to retrieve.
* **stop** : Last result to retrieve. Use None to keep searching forever.
* **pause** : Lapse to wait between HTTP requests. Lapse too short may cause  Google to block your IP. Keeping significant lapse will make your program slow but its safe and better option.
Return : Generator (iterator) that yields found URLs. 
If the stop parameter is None the iterator will loop forever.

Here is the documentation: https://python-googlesearch.readthedocs.io/en/latest/

In [3]:
uQuery_1 = "medicare gov" # here is where everything begins: we choose two queries that we like to compare
uQuery_2 = "medicare supplement plans"

uNum = 10

def getResults(uQuery, uTLD, uNum, uStart, uStop):
  try: 
      from googlesearch import search 
  except ImportError:  
      print("No module named 'google' found") 
  
  # What are we searching for 
  query = uQuery
  
  # Prepare the data frame to store urls
  d = []

  for j in search(query, tld=uTLD, num=uNum, start=uStart, stop=uStop, pause=2): 
      d.append(j)
      print(j)
  return d

results_1 = getResults(uQuery_1, "com", uNum, 1,uNum)
results_2 = getResults(uQuery_2, "com", uNum, 1,uNum)

https://www.mymedicare.gov/
https://www.mymedicare.gov/registration.aspx
https://www.ehealthmedicare.com/medicare-administration-articles/medicare-gov/
https://www.cms.gov/Medicare/Medicare
https://www.cms.gov/Medicare/Medicare-General-Information/MedicareGenInfo
https://www.ssa.gov/medicare/
https://www.ehealthinsurance.com/medicare/resources/medicare-gov
https://www.usa.gov/medicare
https://twitter.com/medicaregov?lang=en
https://www.medicareadvantage.com/resources/medicare-gov
https://www.medicare.gov/supplements-other-insurance/how-to-compare-medigap-policies
https://www.medicare.gov/supplements-other-insurance
https://www.aarpmedicareplans.com/medicare-education/medicare-supplement-plans.html
https://www.ehealthinsurance.com/medicare/supplement-all/medicare-supplement-plans
https://www.ehealthinsurance.com/medicare/supplement-all/compare-medicare-supplement-plans
https://www.humana.com/medicare/medicare-supplement-plans
https://www.dfs.ny.gov/consumers/health_insurance/supplement_

In [4]:
pd.set_option('display.max_colwidth', -1) # make sure output is not truncated (cols width)
pd.set_option("display.max_rows", 100) # make sure output is not truncated (rows)

def readResults(urls, query):
    # Prepare the data frame to store results
    x = []
    position = 0 # position on the serp

    # Loop items in results
    for page in urls:
       position += 1
       downloaded = trafilatura.fetch_url(page)
       if downloaded is not None: # assuming the download was successful
        result = trafilatura.extract(downloaded, include_tables=False, include_formatting=False, include_comments=False) 
        x.append((page, result, query, position))
    return x

d = readResults(results_1, uQuery_1) # get results from there 1st query
e = readResults(results_2, uQuery_2) # get results from there 2nd query

df_1 = pd.DataFrame(d, columns=('url', 'result', 'query', 'position')) # store data in a data frame
df_2 = pd.DataFrame(e, columns=('url', 'result', 'query', 'position')) # store data in a data frame

df_final = pd.concat([df_1, df_2])
print("total number of articles (before filtering) ",len(df_final))

# Remove rows where result is empty 
df_final['result'].replace(' ', np.nan, inplace=True)
df_final = df_final.dropna(subset=['result'])

# Remove rows where article are less than 200 characters in lenght
df_final = df_final[df_final['result'].apply(lambda x: len(str(x))>200)]


# Reindex df
df_final.index = range(len(df_final.index))

# Set the file name
uQuery = uQuery_1 + "_" + uQuery_2
cleanQuery = re.sub('\W+','', uQuery)
file_name = cleanQuery + ".csv"

# Store data to CSV
df_final.to_csv(file_name, encoding='utf-8', index=True)
print("total number of articles saved on",file_name, len(df_final))



  pd.set_option('display.max_colwidth', -1) # make sure output is not truncated (cols width)


total number of articles (before filtering)  19
total number of articles saved on medicaregov_medicaresupplementplans.csv 17


In [15]:
df_final.to_csv(file_name, encoding='utf-8', index=True)
print(df_final,file_name, len(df_final))

               medicare gov freq  medicare supplement plans freq    query
term                                                                     
suffix         1                  0                               0.45989
jr             1                  0                               0.45989
sr             1                  0                               0.45989
ii             1                  0                               0.45989
iii            1                  0                               0.45989
...           ..                 ..                                   ...
coverage are   0                  1                               0.50000
also eligible  0                  1                               0.50000
under this     0                  1                               0.50000
this state     0                  1                               0.50000
state option   0                  1                               0.50000

[9741 rows x 3 columns] medicaregov_m

## Analyze terms from the corpus of results 

Beautiful visualization of how language differs among search results. [Scattertext](https://github.com/JasonKessler/scattertext) is a tool for finding distinguishing terms in small-to-medium-sized corpora like the one we're using here.

Scattertext presents terms/concepts in an interactive, HTML scatter plot. Points corresponding to terms are selectively labeled so that they don't overlap with other labels or points.

Here is the documentation: https://github.com/JasonKessler/scattertext

In [5]:
# Getting additional hourse power - adding more libraries
!pip install scattertext

%matplotlib inline
import scattertext as st
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import io
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container { width:98% !important; }</style>"))

nlp = spacy.load('en') # make sure you have the right language here 





### Terms that characterize the SERP 

Corpus characteristicness is the difference in dense term ranks between the words in all web pages and a general English-language frequency list.

In [6]:
df_final['index'] = df_final.index
df_final.groupby('query').apply(lambda x: x.result.apply(lambda x: len(x.split())).sum())
df_final['parsed'] = df_final.result.apply(nlp) # run NER using spaCy

# Turn it into a Scattertext corpus
corpus = (st.CorpusFromParsedDocuments(df_final, 
                                       category_col='query', 
                                       parsed_col='parsed')
          .build()
          .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)) # remove stop words in English  

In [7]:
# Terms that appear more frequently in the result corpus that are not common in the English language

list(corpus.get_scaled_f_scores_vs_background().index[:15])

['undoundo',
 'null',
 'emblemhealth',
 'vermaverified',
 'retweetedadministrator',
 'threadthanks',
 'tweets',
 'twitter',
 'medigap',
 'unitedhealthcare',
 'generalverified',
 'hhsverified',
 'silversneakers',
 'retweetedoig',
 'applicantsfirsteligible']

### Most frequent terms 

1.   List item
2.   List item




In [8]:
df_final = corpus.get_term_freq_df()
df_final['query'] = corpus.get_scaled_f_scores(uQuery_1)
df_final.sort_values('query', ascending=False).iloc[:15]

Unnamed: 0_level_0,medicare gov freq,medicare supplement plans freq,query
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
use this,24,0,1.0
twitter,24,0,1.0
information on,24,0,1.0
will use,20,0,0.999999
this to,20,0,0.999999
twitter will,20,0,0.999999
make your,20,0,0.999999
your timeline,20,0,0.999999
timeline,20,0,0.999999
timeline better,20,0,0.999999


In [14]:
print(df_final)

               medicare gov freq  medicare supplement plans freq    query
term                                                                     
suffix         1                  0                               0.45989
jr             1                  0                               0.45989
sr             1                  0                               0.45989
ii             1                  0                               0.45989
iii            1                  0                               0.45989
...           ..                 ..                                   ...
coverage are   0                  1                               0.50000
also eligible  0                  1                               0.50000
under this     0                  1                               0.50000
this state     0                  1                               0.50000
state option   0                  1                               0.50000

[9741 rows x 3 columns]


### Most frequent terms related to Medicare Supplement Insurance

---




In [9]:
df_final = corpus.get_term_freq_df()
df_final['query'] = corpus.get_scaled_f_scores(uQuery_2)
df_final.sort_values('query', ascending=False).iloc[:15]

Unnamed: 0_level_0,medicare gov freq,medicare supplement plans freq,query
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
rider,0,40,1.0
state start,0,37,1.0
state end,0,37,1.0
f,0,32,1.0
pays,0,27,0.999986
supplement plans,0,24,0.999893
company,0,24,0.999893
choose,0,24,0.999893
supplement plan,0,23,0.9998
unitedhealthcare,0,22,0.999636


In [10]:
html = produce_scattertext_explorer(corpus,
                                    category='medicare gov',
                                    category_name='medicare gov',
                                    not_category_name='medicare supplement plans',
                                    width_in_pixels=950,
                                    minimum_term_frequency=2,
                                    term_significance = st.LogOddsRatioUninformativeDirichletPrior())
                                    #transform=st.Scalers.scale)
open("SERP-Visualization.html", 'wb').write(html.encode('utf-8'))
HTML(html)

### Difference between results 1,2,3


Run the cell below if you are interested in comparing what is different from results with position 1,2 and 3 with all the other results. 

This is helpful to understand what terms you need to rank higher for that intent.

In [11]:
df_1['top_result'] = ['Yes' if x <= 3 else 'No' for x in df_1['position']] # add top_result = True when position <=3 

# Remove rows where result is empty 
df_1['result'].replace(' ', np.nan, inplace=True)
df_1 = df_1.dropna(subset=['result'])

df_1['index'] = df_1.index

df_1.groupby('top_result').apply(lambda x: x.result.apply(lambda x: len(x.split())).sum())
df_1['parsed'] = df_1.result.apply(nlp)

# Turn it into a Scattertext corpus
corpus_1 = (st.CorpusFromParsedDocuments(df_1, 
                                       category_col='top_result', 
                                       parsed_col='parsed')
          .build()
          .remove_terms(ENGLISH_STOP_WORDS, ignore_absences=True)) # remove stop words in English 


In [12]:
html_1 = produce_scattertext_explorer(corpus_1,
                                    category='Yes',
                                    category_name='Yes',
                                    not_category_name='No',
                                    width_in_pixels=900,
                                    minimum_term_frequency=3,
                                    term_significance = st.LogOddsRatioUninformativeDirichletPrior())
open("SERP-Visualization_top3.html", 'wb').write(html.encode('utf-8'))
HTML(html_1)

## Summarise Results with BART

### Content summarization

*   List item
*   List item



In [16]:
# Remove rows where article are less than 300 characters in lenght
df_1 = df_1[df_1['result'].apply(lambda x: len(str(x))>300)]


# getting text ready by merging all pages together (no index)
full_body = df_1[['result']].agg(''.join, axis=1).to_string(index=False).strip()

print(full_body) 

with open('output.txt', 'w') as text_file:
    text_file.write(full_body)



SUFFIX None Jr Sr I II III IV V VI VII\nI don't have an email address\nZIP CODE OR CITY\nPART A COVERAGE START DATE\nPART B COVERAGE START DATE\nPart A Start Month Month January February March April May June July August September October November December\nPart A Start Year Year 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021                                                                                                                    

In [None]:
# documentation for summarizer: https://huggingface.co/transformers/main_classes/pipelines.html#summarizationpipeline
summarizer = pipeline('summarization')

# use t5 instead
#summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")


In [None]:
import summarize

summarize.summarize_text(full_body)

In [None]:
from textblob import TextBlob

text = TextBlob(full_body)

In [None]:
text.tags

In [None]:
text.sentiment

In [None]:
text.ngrams(n=3)