In [277]:
# %load header.py
import numpy as np                # numeric calculations and handling missing values
import pandas as pd               # pushing data around
# import matplotlib.pyplot as plt   # making nice pictures
# import seaborn as sns             # making nicer pictures more easily
#import missingno as msno          # nice visualization of missing values
import sys
import os                         # accessing files in bulk
# import re                         # using regular expressions
# import statsmodels.api as sm      # statistics...
# import fbprophet as prophet       # forecasting
# from pylab import rcParams

pd.set_option('display.max_columns', 500)    # seeing all columns
# sns.set(style="darkgrid")                    # nice grids

In [353]:
import requests
from bs4 import BeautifulSoup
import random
from time import sleep
import csv

In [365]:
class related_search_hub:
    """Level-by-level crawler for the "related searches" box of Twitter search pages.

    Starting from one hashtag or search term, the crawler records the related
    searches suggested for it (level 1), then looks up each of those suggestions
    (level 2), and so on until the depth budget is used up.

    Attributes:
        initial_search: The search term the crawl starts from.
        initial_query: URL path derived from ``initial_search``
            (``/hashtag/<tag>`` for terms starting with ``#``, otherwise
            ``/search?q=<term>``).
        standard_wait: Seconds slept after every lookup issued by ``go_deeper``.
        depth_to_go: Number of levels that still have to be crawled.
        level: Level the next lookup will be recorded under (starts at 1).
        error_list: Query paths for which no related searches could be loaded.
        results_df: One ``'result'`` row (display names) and one ``'query'`` row
            (URL paths) per successful lookup; empty until a lookup succeeds.
    """

    # Column layout of ``results_df``; at most five related searches are kept per lookup.
    RESULT_COLUMNS = ['type', 'level', 'search', 'result_1', 'result_2', 'result_3', 'result_4', 'result_5']
    # Seconds to wait for a response before the lookup is recorded as a loading error.
    REQUEST_TIMEOUT = 10
    # Number of trailing characters cut from every related-search href.
    # NOTE(review): presumably this removes a trailing tracking parameter — confirm against live markup.
    LINK_SUFFIX_LENGTH = 9

    def __init__(self, start_search, depth_to_go=1, wait_time=random.uniform(.25,.75)):
        """Set up a crawl.

        Args:
            start_search: Seed search term; a leading ``#`` marks a hashtag.
            depth_to_go: Number of levels to crawl, the seed lookup included.
            wait_time: Seconds to sleep after each lookup in ``go_deeper``.
                The default is drawn once, when the class body is executed, so
                every instance relying on the default shares that same delay.
        """
        self.initial_search = start_search
        self.standard_wait = wait_time
        # startswith() instead of [0] so that an empty seed does not raise IndexError.
        if start_search.startswith('#'):
            self.initial_query = '/hashtag/' + start_search[1:]
        else:
            self.initial_query = '/search?q=' + start_search
        self.depth_to_go = depth_to_go
        self.level = 1
        self.error_list = []
        # Created up front so that a failed seed lookup leaves an empty frame
        # behind instead of a missing attribute.
        self.results_df = pd.DataFrame(columns=self.RESULT_COLUMNS)

    def get_results(self, query=False, search=False):
        """Look up one search and store its related searches in ``results_df``.

        Args:
            query: URL path to request; falls back to ``initial_query`` when falsy.
            search: Display name of the search; falls back to ``initial_search``
                when falsy.

        A lookup without related searches only prints an error line; nothing is
        stored for it.
        """
        query = query if query else self.initial_query
        search = search if search else self.initial_search
        print("Scraping https://twitter.com" + query + ".")
        results, new_queries = self.get_related_searches(query)
        if not results:
            print(f"""Level {self.level} search for {search} yielded an error.""")
            return
        print(f"""Level {self.level} search for {search} yielded the related searches {results}.""")
        # Every column holds a (display name, URL path) pair, which becomes the
        # 'result' row and the 'query' row; the scalar level is broadcast to both.
        # zip() stops at the shorter input, so surplus results are dropped and
        # unused result_N columns are simply absent.
        values = [('result', 'query'), self.level, (search, query)] + list(zip(results, new_queries))
        new_rows = pd.DataFrame(dict(zip(self.RESULT_COLUMNS, values)))
        if self.level == 1 or self.results_df.empty:
            self.results_df = new_rows
        else:
            self.results_df = pd.concat([self.results_df, new_rows], ignore_index=True, sort=False)

    def go_deeper(self):
        """Crawl one more level: look up every not-yet-searched result of the current level.

        Increments ``level``, decrements ``depth_to_go`` and sleeps
        ``standard_wait`` seconds after each lookup.
        """
        on_current_level = self.results_df['level'] == self.level
        result_matrix = self.results_df.loc[on_current_level & (self.results_df['type'] == 'result'), 'result_1':]
        query_matrix = self.results_df.loc[on_current_level & (self.results_df['type'] == 'query'), 'result_1':]
        self.level += 1
        self.depth_to_go -= 1
        # Everything that already has its own row, plus everything looked up
        # during this pass (successful or not), is skipped.
        seen = set(self.results_df.loc[self.results_df['type'] == 'result', 'search'])
        print('----- going deeper -----')
        # The n-th 'result' row and the n-th 'query' row describe the same lookup.
        for (_, names), (_, links) in zip(result_matrix.iterrows(), query_matrix.iterrows()):
            # Iterate values pairwise instead of using integer keys on the
            # label-indexed rows; missing cells are skipped column by column so
            # names and links cannot get out of step.
            for name, link in zip(names, links):
                if pd.isna(name) or pd.isna(link) or name in seen:
                    continue
                self.get_results(query=link, search=name)
                seen.add(name)
                sleep(self.standard_wait)
        print(f'----- finished level {self.level} -----')

    def _record_loading_error(self, query):
        """Report a failed lookup, remember its query path and return the empty result."""
        print(f'----- error loading http://www.twitter.com{query} -----')
        self.error_list.append(query)
        return ([], [])

    def get_related_searches(self, query, waittime=random.uniform(.25,.75)):
        """Fetch a search page and extract its related searches.

        Args:
            query: URL path appended to ``http://www.twitter.com``.
            waittime: Unused; kept so existing callers keep working.

        Returns:
            ``(names, links)``: display names and URL paths of the related
            searches, or ``([], [])`` when the page could not be loaded or
            contains none (the query is then appended to ``error_list``).
        """
        try:
            r = requests.get('http://www.twitter.com' + query, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            # A network failure is treated like a page without related searches
            # so that one bad request does not abort the whole crawl.
            return self._record_loading_error(query)
        buckets = BeautifulSoup(r.text, 'html.parser').find_all('ul', class_="AdaptiveRelatedSearches-items")
        anchors = buckets[0].find_all('a', class_='js-nav') if buckets else []
        if not anchors:
            return self._record_loading_error(query)
        names = [x.decode_contents().strip().replace('<strong>','').replace('</strong>','') for x in anchors]
        links = [x.attrs['href'][:-self.LINK_SUFFIX_LENGTH] for x in anchors]
        return (names, links)

    def results_as_csv(self, only_results=False):
        """Write ``results_df`` to a CSV file in the working directory.

        Args:
            only_results: When true, write only the ``'result'`` rows (columns
                from ``level`` onwards) to ``results_only_<search>_depth<level>.csv``;
                otherwise write the full frame to ``results_<search>_depth<level>.csv``.
        """
        search_string = self.initial_search.replace(" ", "_")
        if not only_results:
            print('----- saving full csv -----')
            self.results_df.to_csv(f'results_{search_string}_depth{self.level}.csv', index=False)
        else:
            print('----- saving results only csv -----')
            only_names = self.results_df.loc[self.results_df['type'] == 'result', 'level':]
            only_names.to_csv(f'results_only_{search_string}_depth{self.level}.csv', index=False)

    def export_gephi_csv(self):
        """Write the ``'result'`` rows (search followed by its related searches) to
        ``gephi_<search>_depth<level>.csv``: semicolon-separated, every field
        quoted, no header and no index.
        """
        search_string = self.initial_search.replace(" ", "_")
        self.results_df.loc[self.results_df['type'] == 'result', 'search':].to_csv(
            f'gephi_{search_string}_depth{self.level}.csv',
            index=False,
            header=False,
            sep=";",
            quoting=csv.QUOTE_ALL
            )

    def full_descent(self, with_output=False, only_results_output=False, with_gephi=False):
        """Run the whole crawl: the seed lookup plus ``depth_to_go - 1`` deeper levels.

        Args:
            with_output: Call ``results_as_csv`` when the crawl is finished.
            only_results_output: Passed to ``results_as_csv`` as ``only_results``.
            with_gephi: Call ``export_gephi_csv`` when the crawl is finished.
        """
        self.get_results()
        self.depth_to_go -= 1
        while self.depth_to_go > 0:
            self.go_deeper()
        if self.error_list:
            print('----- there were loading errors -----')
            for x in self.error_list:
                # Query paths already start with '/', so no extra separator is added.
                print(f'----- error while loading https://www.twitter.com{x}  ----- ')
        if with_output:
            self.results_as_csv(only_results=only_results_output)
        if with_gephi:
            self.export_gephi_csv()
        print('----- alle done -----')

In [366]:
# Seed the crawler with the hashtag '#rezoeffekt'; depth_to_go=2 covers the seed
# search itself plus one level of related searches.
test_search = related_search_hub('#rezoeffekt', depth_to_go=2)

In [367]:
# Run the crawl. With the default arguments no CSV file is written; the results
# stay in test_search.results_df.
test_search.full_descent()

Scraping https://twitter.com/hashtag/rezoeffekt.
Level 1 search for #rezoeffekt yielded the related searches ['#annegeht', '#artikel5', '#volkspartei', '#r2g', '#niemehrcducsu'].
----- going deeper -----
Scraping https://twitter.com/hashtag/annegeht.
Level 2 search for #annegeht yielded the related searches ['asymmetrische wahlkampfführung', '#akkrücktritt', '#artikel5', '#niemehrcdu', 'akkgate'].
Scraping https://twitter.com/hashtag/artikel5.
Level 2 search for #artikel5 yielded the related searches ['#annegeht', '@cdu', '#habeck', '#r2g', '@akk'].
Scraping https://twitter.com/hashtag/volkspartei.
Level 2 search for #volkspartei yielded the related searches ['#r2g', 'medien', '#niemehrcducsu', 'regierungsarbeit', '#niemalsafd'].
Scraping https://twitter.com/hashtag/r2g.
Level 2 search for #r2g yielded the related searches ['#niemehrcducsu', '#artikel5', '#annegeht', '#niemalsafd', 'annegate'].
Scraping https://twitter.com/hashtag/niemehrcducsu.
Level 2 search for #niemehrcducsu yielde

In [351]:
# Save only the 'result' rows (display names, no URL paths) as a CSV file.
# NOTE(review): execution count In [351] is lower than that of the surrounding
# cells, so this cell was apparently run out of order — re-run it after the crawl.
test_search.results_as_csv(only_results=True)

----- saving results only csv -----


In [368]:
# Export each search with its related searches as a semicolon-separated,
# fully quoted, header-less CSV file.
test_search.export_gephi_csv()