In [1]:
import pandas as pd
import bs4 as bs
import requests
import os
from collections import Counter
from googlesearch import search
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import en_core_web_sm
from heapq import nlargest
from functools import wraps
import errno
import signal
from time import time
from math import floor

In [2]:
class TimeoutError(Exception):
    pass

def timeout(seconds=10, error_message=os.strerror(errno.ETIME)):
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutError(error_message)

        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result

        return wraps(func)(wrapper)

    return decorator

In [3]:
def get_search_term():
    inp = input('Enter search: ').split()
    inp = [x.lower() for x in inp]
    search_term = ''
    for i in range(0,len(inp)):
        if i != 0: search_term += f'+"{inp[i]}"'
        else: search_term = f'"{inp[i]}"'
    return search_term

In [4]:
def perform_google_search():
    query = get_search_term()
    links = list()
    for link in search(query, tld='com', lang='en', num=50, start=0, stop=50, pause=10.0):
        links.append(link)
    return links

In [5]:
links = perform_google_search()

Enter search: iphone 12


In [6]:
name = floor(time())

In [7]:
@timeout(10)
def get_info():
    info = [['Title', 'Text', 'Link']]
    for i in range(0,len(links)):
        if i ==13:
            pass
        else:
            try:
                print(f'Getting Link No. {i}')
                resp = requests.get(links[i])
                soup = bs.BeautifulSoup(resp.text, 'lxml')
                title = soup.select('h1')[0].text.strip()
                full_text = ' '
                for para in soup.findAll('p'):
                    full_text += '. '+para.getText()
                if len(title) < 200:
                    info.append([title,full_text,links[i]])
                else:
                    pass
            except Exception as e:
                print(f'Could not get {i} because {e}')

    return info

In [8]:
def get_df():
    info = get_info()
    old_df = pd.DataFrame(info)
    df = pd.DataFrame(old_df.values[1:], columns=old_df.iloc[0])
    return df

In [9]:
@timeout(10)
def make_csv():
    df = get_df()
    if not os.path.exists('Reserach'):
        os.makedirs('Research', exist_ok=True)
        df.to_csv(f'Research/{name}.csv', encoding = 'utf-8-sig')
    else:
        df.to_csv(f'Research/{name}.csv', encoding = 'utf-8-sig')

In [10]:
make_csv()

Getting Link No. 0
Getting Link No. 1
Getting Link No. 2
Getting Link No. 3
Getting Link No. 4
Getting Link No. 5
Getting Link No. 6
Getting Link No. 7
Getting Link No. 8
Getting Link No. 9
Getting Link No. 10
Getting Link No. 11
Getting Link No. 12
Getting Link No. 14
Getting Link No. 15
Getting Link No. 16
Getting Link No. 17
Getting Link No. 18
Getting Link No. 19
Getting Link No. 20
Getting Link No. 21
Getting Link No. 22
Getting Link No. 23
Getting Link No. 24
Getting Link No. 25
Getting Link No. 26
Could not get 26 because list index out of range
Getting Link No. 27
Getting Link No. 28
Could not get 28 because STREAM ioctl timeout
Getting Link No. 29
Getting Link No. 30
Getting Link No. 31
Getting Link No. 32
Getting Link No. 33
Getting Link No. 34
Getting Link No. 35
Getting Link No. 36
Getting Link No. 37
Getting Link No. 38
Getting Link No. 39
Getting Link No. 40
Getting Link No. 41
Getting Link No. 42
Getting Link No. 43
Getting Link No. 44
Getting Link No. 45
Getting Link No

In [11]:
df = pd.read_csv(f'Research/{name}.csv')

In [12]:
df.rename(columns={"Unnamed: 0": "Index"}, inplace=True)

In [13]:
df.set_index(df['Index'], inplace=True)

In [14]:
df.drop(['Index'], 1, inplace=True)

In [15]:
def get_summaries(df):
    from string import punctuation
    stop_words = list(STOP_WORDS)
    nlp = en_core_web_sm.load()
    punctuation += '\n\t\xa0 \n '
    summaries = list()
    
    for i in range(0, len(df)):
        print(f'Generating summary for Text No. {i}')
        try:
            text = df['Text'][i] 
            doc = nlp(text)
            tokens = [token.text for token in doc]
            word_freq = dict()
            for word in doc:
                if word.text.lower() not in stop_words:
                    if word.text.lower() not in punctuation:
                        if word.text not in word_freq.keys():
                            word_freq[word.text] = 1
                        else:
                            word_freq[word.text] += 1
            max_freq = max(word_freq.values())

            for word in word_freq.keys():
                word_freq[word] = word_freq[word]/max_freq

            sentence_tokens = [sent for sent in doc.sents]
            sentence_scores = dict()

            for sent in sentence_tokens:
                for word in sent:
                    if word.text.lower() in word_freq.keys():
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_freq[word.text.lower()]
                        else:
                            sentence_scores[sent] += word_freq[word.text.lower()]

            select_length = int(len(sentence_tokens)*0.3)
            summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
            final_summary = [word.text for word in summary]
            summary = ' '.join(final_summary)
            summaries.append(summary)
        except:
            print(f'Error with Text No. {i}')
            summaries.append('Error')
    return summaries

In [16]:
summaries = get_summaries(df)

Generating summary for Text No. 0
Generating summary for Text No. 1
Generating summary for Text No. 2
Generating summary for Text No. 3
Generating summary for Text No. 4
Generating summary for Text No. 5
Generating summary for Text No. 6
Generating summary for Text No. 7
Generating summary for Text No. 8
Generating summary for Text No. 9
Generating summary for Text No. 10
Generating summary for Text No. 11
Generating summary for Text No. 12
Generating summary for Text No. 13
Generating summary for Text No. 14
Error with Text No. 14
Generating summary for Text No. 15
Generating summary for Text No. 16
Generating summary for Text No. 17
Generating summary for Text No. 18
Generating summary for Text No. 19
Generating summary for Text No. 20
Generating summary for Text No. 21
Generating summary for Text No. 22
Generating summary for Text No. 23
Generating summary for Text No. 24
Generating summary for Text No. 25
Generating summary for Text No. 26
Generating summary for Text No. 27
Generat

In [17]:
df['Summary'] = summaries

In [18]:
df.to_csv(f'Research/{name}_With_Summary.csv', encoding = 'utf-8-sig')

In [19]:
print('Process Finished. File is ready.')

Process Finished. File is ready.
