# Extract Information From Personal Google Data Export

In [37]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

%matplotlib inline

# HTML file from the Google data export that the pipeline below reads.
# Only the "My Activity" directory is supported
path = '../Datasets/Google-20-09/My Activity/Search/MyActivity.html'

# Base name (no extension) and format handed to save_file(); the save_file
# methods in this notebook handle 'csv' and 'xlsx'.
OUTPUT_FILE = 'google_data'
OUTPUT_TYPE = 'csv'

## Convert Google Data HTML To CSV

In [44]:
class ProcessGoogleData:
    """Convert a Google "My Activity" HTML export into a pandas DataFrame.

    Intended call order: ``read_file`` -> ``parse`` -> ``create_table`` ->
    ``save_file``.

    Attributes:
        html: Raw HTML text loaded by ``read_file``.
        cells: Elements matched by the ``.content-cell`` selector in ``parse``.
        df: DataFrame built by ``create_table``.
        file_name: Name of the file most recently written by ``save_file``.
    """

    # Errors raised when a cell does not have the expected activity layout
    # (missing parent/contents, too few children, unparseable date, no href).
    _ROW_ERRORS = (AttributeError, IndexError, TypeError, ValueError)

    def __init__(self):
        self.html = None
        self.cells = None
        self.df = None
        self.file_name = None

    def read_file(self, path):
        """Read the file at ``path`` into ``self.html``.

        Args:
            path: Location of the exported HTML file.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        print('Reading file: ', end='')
        # Explicit encoding so the result does not depend on the platform
        # default. NOTE(review): assumes the export is UTF-8 -- confirm.
        with open(path, 'r', encoding='utf-8') as f:
            self.html = f.read()
        print('[DONE]')

    def parse(self):
        """Parse ``self.html`` and store every ``.content-cell`` element."""
        print('Parsing HTML file: ', end='')
        soup = BeautifulSoup(self.html, 'lxml')
        self.cells = soup.select('.content-cell')
        print('[DONE]')

    def _process_row(self, row):
        """Extract one activity record from a content cell.

        Args:
            row: Element whose first ``<a>`` tag describes the activity.

        Returns:
            A dict with the keys ``Type``, ``Query``, ``Date``, ``URL`` and
            ``Location``, or an empty dict when the cell has no link or does
            not match the expected layout.
        """
        a = row.find('a')
        if a is None:
            # Cell without any link: nothing to record.
            return {}

        try:
            search_query = str(a.text)
            search_type = str(a.parent.contents[0]).strip()
            search_date = pd.to_datetime(a.parent.contents[3])
            href = a.get('href')

            # Handle different string format for Visited pages: the target
            # follows '?q='. Links without that marker are kept whole instead
            # of discarding the row.
            if search_type == 'Visited':
                _, marker, target = href.partition('?q=')
                site_url = target if marker else href
            else:
                site_url = href
        except self._ROW_ERRORS:
            # Not an activity cell; the caller leaves it out of the table.
            return {}

        return {
            'Type': search_type,
            'Query': search_query,
            'Date': search_date,
            'URL': site_url,
            'Location': self._get_location(a),
        }

    def _get_location(self, url):
        """Build a Google Maps URL from the link that follows ``url``.

        Args:
            url: The activity's ``<a>`` element.

        Returns:
            A ``https://www.google.com/maps/@<coords>,17z`` URL, or ``None``
            when the next link is missing or is not a maps link carrying a
            ``center=`` parameter.
        """
        # NOTE(review): find_next looks at the next <a> in the document, which
        # may belong to a later cell -- confirm against the export layout.
        next_link = url.find_next('a')
        if next_link is None:
            return None

        href_next = next_link.get('href')
        if not href_next or 'google.com/maps/@?api=1' not in href_next:
            return None

        _, found, tail = href_next.partition('center=')
        if not found:
            return None

        coordinates = tail.split('&zoom')[0]
        return 'https://www.google.com/maps/@' + coordinates + ',17z'

    def create_table(self):
        """Turn every parsed cell into a row of ``self.df``.

        Rows are indexed by the cell's position in ``self.cells``; cells that
        ``_process_row`` could not interpret are left out.
        """
        rows = {}

        for i, elem in enumerate(self.cells):
            row = self._process_row(elem)
            if row:
                rows[i] = row

            # Files are large, update status every 5000 elements
            if i % 5000 == 0:
                print(f'Successfully processed HTML element: {i}')

        self.df = pd.DataFrame.from_dict(data=rows, orient='index')
        print('Finished processing rows')

    def save_file(self, file_name='google_data', file_type='csv'):
        """Write ``self.df`` to ``<file_name>.<file_type>``.

        Args:
            file_name: Output name without extension.
            file_type: ``'csv'`` or ``'xlsx'`` (case-insensitive).

        Raises:
            ValueError: If ``file_type`` is not supported. Nothing is written
                and no success message is printed in that case.
        """
        extension = file_type.lower()
        out_path = file_name + '.' + extension

        if extension == 'csv':
            self.df.to_csv(out_path)
        elif extension == 'xlsx':
            self.df.to_excel(out_path)
        else:
            raise ValueError(
                f'Unsupported file type: {file_type!r} (expected "csv" or "xlsx")'
            )

        self.file_name = out_path
        print(f'Saved {self.df.shape[0]} rows to: "{out_path}"')
    
    
if __name__ == '__main__':
    # Run the conversion end to end: load the HTML export, select its
    # content cells, and build the DataFrame stored on gd.df.
    gd = ProcessGoogleData()
    gd.read_file(path)
    gd.parse()
    gd.create_table()
    
    # Write the table to disk; the format comes from OUTPUT_TYPE.
    gd.save_file(file_name=OUTPUT_FILE, file_type=OUTPUT_TYPE)

Reading file: [DONE]
Parsing HTML file: [DONE]
Successfully processed HTML element: 0
Successfully processed HTML element: 5000
Successfully processed HTML element: 10000
Successfully processed HTML element: 15000
Successfully processed HTML element: 20000
Successfully processed HTML element: 25000
Successfully processed HTML element: 30000
Successfully processed HTML element: 35000
Successfully processed HTML element: 40000
Successfully processed HTML element: 45000
Successfully processed HTML element: 50000
Successfully processed HTML element: 55000
Successfully processed HTML element: 60000
Successfully processed HTML element: 65000
Successfully processed HTML element: 70000
Successfully processed HTML element: 75000
Successfully processed HTML element: 80000
Successfully processed HTML element: 85000
Successfully processed HTML element: 90000
Successfully processed HTML element: 95000
Successfully processed HTML element: 100000
Successfully processed HTML element: 105000
Successful

## Build New Features From Dataframe

In [45]:
from geopy.geocoders import Nominatim
# Module-level geocoder shared by GenerateFeatures.get_address_from_coords,
# which calls gl.reverse() to turn coordinates into an address.
gl = Nominatim(user_agent="Google Data App v1.0", timeout=10)

class GenerateFeatures:
    """Derive extra columns (day, mobile flag, site name, city) for the table.

    Attributes:
        known_locations: Cache of addresses already looked up, keyed by the
            coordinates rounded to two decimals.
        df: DataFrame most recently processed by ``transform``.
    """

    def __init__(self):
        self.known_locations = {}
        self.df = None

    def fit(self, df=None, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, df):
        """Create the new feature transformations.

        Adds the columns ``Day``, ``Mobile``, ``Site`` and ``City`` to ``df``
        and converts ``Date`` to UTC datetimes. ``df`` is modified in place
        and also stored on ``self.df``.

        Args:
            df: DataFrame with ``Date``, ``URL`` and ``Location`` columns.

        Returns:
            This ``GenerateFeatures`` instance (not the DataFrame).
        """
        df['Date'] = pd.to_datetime(df['Date'], utc=True)
        df['Day'] = df['Date'].dt.date
        # Evaluate each URL on its own; handing the whole DataFrame to
        # on_mobile only tests column names and always yields False.
        df['Mobile'] = df['URL'].apply(self.on_mobile)
        df['Site'] = df['URL'].apply(self.extract_site_name)

        # Converting coordinates to addresses is very slow.
        print('Converting GPS coordinates to address. Please wait.')
        df['City'] = df['Location'].apply(self.get_address_from_coords)
        self.df = df
        return self

    def extract_site_name(self, row):
        """Return a title-cased site name taken from a URL.

        Args:
            row: URL string.

        Returns:
            The first host label after ``//m.``, ``www.`` or ``//`` (checked
            in that order), title-cased; the whole cleaned string title-cased
            when none of those markers is present; ``None`` for non-string
            input.
        """
        if not isinstance(row, str):
            return None

        # Drop the AMP prefix but keep the '//' so the host can still be
        # located below.
        # NOTE(review): '.m.' is removed wherever it occurs, including in the
        # path -- kept as-is.
        cleaned = row.replace('//amp.', '//').replace('.m.', '')

        for marker in ('//m.', 'www.', '//'):
            parts = cleaned.split(marker)
            if len(parts) > 1:
                return parts[1].split('/')[0].split('.')[0].title()
        return cleaned.title()

    def on_mobile(self, row):
        """Report whether a URL carries one of the known mobile/AMP prefixes.

        Args:
            row: URL string.

        Returns:
            ``True`` if a mobile or AMP marker is found, otherwise ``False``
            (also for non-string input).
        """
        if not isinstance(row, str):
            return False

        url_mob_formats = ('www.m.', 'https://m.', 'http://m.', '//amp.')
        return any(marker in row for marker in url_mob_formats)

    def get_address_from_coords(self, row):
        """Given a latitude & longitude --> return an approximate address.

        Args:
            row: URL containing ``@<first>,<second>``, or a falsy or
                non-string value when no location is known.

        Returns:
            The address reported by the geocoder, or ``None`` when ``row``
            holds no location or the geocoder finds nothing.
        """
        if not isinstance(row, str) or not row:
            return None

        # Split the row into its two coordinates from the URL.
        # NOTE(review): assumed to be latitude then longitude, the order the
        # geocoder query below uses -- confirm.
        latitude, longitude = row.split('@')[1].split(',')[0:2]
        cache_key = (
            str(round(float(latitude), 2)) + ' ' + str(round(float(longitude), 2))
        )

        # Skip the API call if the key exists in the dictionary (more server
        # friendly).
        if cache_key in self.known_locations:
            return self.known_locations[cache_key]

        match = gl.reverse(f"{latitude}, {longitude}")
        # The geocoder may return nothing for a point; cache that outcome too.
        city = match.address if match is not None else None
        self.known_locations[cache_key] = city
        return city

    def save_file(self, file_name='google_data', file_type='csv'):
        """Write ``self.df`` to ``<file_name>.<file_type>``.

        Args:
            file_name: Output name without extension.
            file_type: ``'csv'`` or ``'xlsx'`` (case-insensitive).

        Raises:
            ValueError: If ``file_type`` is not supported. Nothing is written
                and no success message is printed in that case.
        """
        extension = file_type.lower()
        out_path = file_name + '.' + extension

        if extension == 'csv':
            self.df.to_csv(out_path)
        elif extension == 'xlsx':
            self.df.to_excel(out_path)
        else:
            raise ValueError(
                f'Unsupported file type: {file_type!r} (expected "csv" or "xlsx")'
            )

        print(f'Saved transformed dataset to: "{out_path}"')
        
# Add the derived columns to the table built above. transform() modifies
# gd.df in place and returns the GenerateFeatures instance, so df_new_feats
# is that object rather than a DataFrame.
df_new_feats = GenerateFeatures().transform(gd.df)
# Same OUTPUT_FILE/OUTPUT_TYPE as the earlier save, so this writes to the
# same file name.
df_new_feats.save_file(file_name=OUTPUT_FILE, file_type=OUTPUT_TYPE)

Saved transformed dataset to: "google_data.csv"


### Cluster Top Search Queries

In [46]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

class ModelData:
    """Clean search queries and summarise them with a topic model.

    Attributes:
        df: DataFrame passed to ``fit``; gains a ``Query Clean`` column.
        model_name: Unused placeholder set to ``None``.
        my_punc: String whose individual characters are stripped from queries.
        my_stopwords: Set of English stopwords removed from queries.
        stemmer: Callable mapping a word to its stem.
    """

    def __init__(self):
        self.df = None
        self.model_name = None
        self.my_punc = '[!"$·%&\'(”…#)—*+,-./:;<=>?“[\\]^_`{|}~•–@®]'
        self.my_stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer(ignore_stopwords=False).stem

    def remove_urls(self, query):
        """Return ``None`` for queries that look like URLs, else the query."""
        if 'http' in query or 'www.' in query:
            return None
        return query

    def remove_punct(self, query, custom_punc=None):
        """Strip punctuation characters from every word of ``query``.

        Args:
            query: Text to clean.
            custom_punc: Optional replacement for ``self.my_punc``. When
                given it is stored on the instance and therefore also applies
                to later calls.

        Returns:
            The whitespace-split words, each without the punctuation
            characters, joined by single spaces.
        """
        if custom_punc:
            self.my_punc = custom_punc

        punctuation = self.my_punc
        cleaned_words = [
            ''.join(char for char in word if char not in punctuation)
            for word in query.split()
        ]
        return ' '.join(cleaned_words)

    def remove_stopwords(self, query):
        """Remove the common stopwords from queries.

        Note: also removes negations (e.g. no, not) since they're stopwords.
        """
        tokens = [
            word for word in word_tokenize(query)
            if word not in self.my_stopwords
        ]
        return " ".join(tokens)

    def stem_query(self, query):
        """Return ``query`` with every word replaced by its stem."""
        return ' '.join(self.stemmer(word) for word in query.split())

    @staticmethod
    def _append_bigrams(query):
        """Return ``query`` followed by ``a_b`` tokens for adjacent words."""
        words = query.split()
        pairs = [first + '_' + second for first, second in zip(words, words[1:])]
        return ' '.join(words + pairs)

    def clean_string(self, query, bigrams=False):
        """Performs the actual query cleaning.

        Args:
            query: Raw query text.
            bigrams: When true, append ``word1_word2`` tokens for each pair
                of adjacent cleaned words.

        Returns:
            The lower-cased, punctuation-free, stopword-free, stemmed query
            as one space-separated string, or ``None`` when ``query`` is not
            a string (e.g. a missing value) or looks like a URL.
        """
        if not isinstance(query, str):
            return None

        # Basic cleanup
        query = self.remove_urls(query.lower())
        if not query:
            # Not usable for modeling; there is nothing to build bigrams from.
            return None

        query = self.remove_punct(query)
        query = self.remove_stopwords(query)
        query = self.stem_query(query)

        # Combine neighbouring words so they also count as one token.
        if bigrams:
            query = self._append_bigrams(query)
        return query

    def fit(self, df):
        """Clean every entry of ``df['Query']`` into ``df['Query Clean']``.

        ``df`` is modified in place and stored on ``self.df``.

        Returns:
            This ``ModelData`` instance.
        """
        self.df = df
        self.df['Query Clean'] = df['Query'].apply(self.clean_string)
        return self

    def drop_null_queries(self):
        """Return the rows of ``self.df`` whose cleaned query is not null."""
        return self.df[self.df['Query Clean'].notna()]

    def _vectorize(self):
        """Convert each query string into a matrix of token counts.

        Returns:
            A tuple ``(tf, tf_feature_names)``: the dense count matrix and the
            token that each of its columns stands for.
        """
        df = self.drop_null_queries()
        # Transforms text into vector. Raw string so the regex escapes are
        # not treated as (invalid) string escapes.
        vectorizer = CountVectorizer(
            max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+'
        )
        print(f'Modeling on {len(df)} valid queries')
        # Apply transformation
        tf = vectorizer.fit_transform(df['Query Clean']).toarray()

        # tf_feature_names gets the meaning of each column word. Newer
        # scikit-learn releases only provide get_feature_names_out().
        if hasattr(vectorizer, 'get_feature_names_out'):
            tf_feature_names = list(vectorizer.get_feature_names_out())
        else:
            tf_feature_names = vectorizer.get_feature_names()
        return tf, tf_feature_names

    def _display_topics(self, model, feature_names, no_top_words):
        """Tabulate the top words and weights of every topic.

        Code from: https://bit.ly/3lUIWJZ

        Args:
            model: Fitted model exposing ``components_`` (one row per topic).
            feature_names: Token for each column of ``components_``.
            no_top_words: Number of highest-weighted words kept per topic.

        Returns:
            DataFrame with a ``Topic <i> words`` and a ``Topic <i> weights``
            column for every topic.
        """
        topic_dict = {}
        for topic_idx, topic in enumerate(model.components_):
            # Column indices of the largest weights, biggest first.
            top_indices = topic.argsort()[:-no_top_words - 1:-1]
            topic_dict["Topic %d words" % (topic_idx)] = [
                '{}'.format(feature_names[i]) for i in top_indices
            ]
            topic_dict["Topic %d weights" % (topic_idx)] = [
                '{:.1f}'.format(topic[i]) for i in top_indices
            ]
        return pd.DataFrame(topic_dict)

    @staticmethod
    def _build_nmf(num_topics):
        """Create the NMF estimator on both old and new scikit-learn."""
        try:
            # Newer scikit-learn replaced ``alpha`` with ``alpha_W``/``alpha_H``.
            # NOTE(review): the regularisation is scaled differently there,
            # so 0.1 may need retuning -- confirm against the NMF docs.
            return NMF(n_components=num_topics, alpha_W=.1, l1_ratio=.5)
        except TypeError:
            return NMF(n_components=num_topics, alpha=.1, l1_ratio=.5)

    def generate_topics(self, model_name=None, num_topics=None, num_top_words=None):
        """Fit a topic model on the cleaned queries and tabulate its topics.

        Args:
            model_name: ``'LDA'`` or ``'NMF'``.
            num_topics: Number of topics; defaults to 10 when falsy.
            num_top_words: Words listed per topic; defaults to 5 when falsy.

        Returns:
            The DataFrame produced by ``_display_topics``, or ``None`` (after
            printing a message) when ``model_name`` is not supported.
        """
        # Reject unknown models before doing the expensive vectorization.
        if model_name not in ('LDA', 'NMF'):
            print('Unsupported model type')
            return None

        num_topics = num_topics or 10
        num_top_words = num_top_words or 5

        tf, tf_feature_names = self._vectorize()

        if model_name == 'LDA':
            # LDA not great for short-text topic modeling. GSDMM modified LDA is better.
            topic_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
        else:
            topic_model = self._build_nmf(num_topics)

        topic_model.fit(tf)
        return self._display_topics(topic_model, tf_feature_names, num_top_words)
        

In [47]:
# Clean every search query; fit() adds a 'Query Clean' column to gd.df and
# returns the ModelData instance (hence the object repr shown as output).
model = ModelData()
model.fit(gd.df)

<__main__.ModelData at 0x7ff5b0aebf70>

In [48]:
# Fit a 10-topic LDA model and tabulate the 5 highest-weighted words per topic.
topics_lda = model.generate_topics(model_name='LDA', num_topics=10, num_top_words=5)

Modeling on 30223 valid queries


In [None]:
# Show the LDA topic table as the cell's output.
topics_lda

In [17]:
# Fit a 10-topic NMF model and tabulate the 5 highest-weighted words per topic.
topics_nmf = model.generate_topics(model_name='NMF', num_topics=10, num_top_words=5)

Modeling on 1354 queries


In [None]:
# Show the NMF topic table as the cell's output.
topics_nmf