# Extract Information From Personal Google Data Export

In [17]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

%matplotlib inline

path = 'Datasets/Google-20-09-20/My Activity/Search/MyActivity.html'

OUTPUT_FILE = 'google_data'
OUTPUT_TYPE = 'csv'

## Convert Google Data HTML To CSV

In [4]:
class ProcessGoogleData:
    ''' Handles Google Data '''

    def __init__(self):
        self.html = None
        self.cells = None
        self.df = None
        self.file_name = None
 
    def read_file(self, path):
        print('Reading file: ', end='')
        with open(path, 'r') as f:
            self.html = f.read()    
        print('[DONE]')
            
    def parse(self):
        """ Input HTML data and parse through cells. """
        print('Parsing HTML file: ', end='')
        soup = BeautifulSoup(self.html, 'lxml')
        self.cells = soup.select('.content-cell')
        print('[DONE]')
    
    def _process_row(self, row):
        a = row.find('a')             
        row_dict = {}
        
        try:
            search_query = str(a.text)
            search_type = str(a.parent.contents[0]).strip()
            location = self._get_location(a)

            # Handle different string format for Visited pages
            if search_type == 'Visited':
                site_url = a.get('href').split('?q=')[1]
            else:
                site_url = a.get('href')

            search_date = pd.to_datetime(a.parent.contents[3])

            # Build the dictionary
            row_dict = {
                'Type':search_type,
                'Query': search_query,
                'Date': search_date,
                'URL': site_url,
                'Location': location
            }
        except Exception as e:
            pass
        
        return row_dict
    
    def _get_location(self, url):
        """ Given a list of cells, find the location URL. """
        href_next = url.find_next('a').get('href')
        
        if 'google.com/maps/@?api=1' in href_next:
            coordinates = href_next.split('center=')[1].split('&zoom')[0]
            full_maps_url = 'https://www.google.com/maps/@' + coordinates + ',17z' 
            return full_maps_url
        return None
    
    def create_table(self):
        """ Takes a BS4 element filter, and appends each Google data cell into a dataframe row. """        
        meta_dict = {}
        
        cell_list = self.cells
        
        for i, elem in enumerate(cell_list):
#             if i == 10:
#                 break
            row = self._process_row(elem)
            meta_dict[i] = row

            # Files are large, update status every 5000 elements
            if (i % 5000 == 0):
                print(f'Successfully processed HTML element: {i}')

        df = pd.DataFrame.from_dict(data=meta_dict, orient='index')
        print('Finished processing rows')
        self.df = df
        
    def save_file(self, file_name='google_data', file_type='csv'):
        file = file_name + '.' + file_type.lower()
        self.file_name = file
        
        if file_type.lower() == 'csv':
            self.df.to_csv(file)
        if file_type.lower() == "xlsx":
            self.df.to_excel(file)
        
        print(f'Saved {df.shape[0]} rows to: "{file}"')
    
    
if __name__ == '__main__':
    gd = ProcessGoogleData()
    gd.read_file(path)
    gd.parse()
    gd.create_table()
    
    # Save to CSV
    gd.save_file(file_name=OUTPUT_FILE, file_type=OUTPUT_TYPE)

Reading file: [DONE]
Parsing HTML file: [DONE]
Successfully processed row: 0
Successfully processed row: 5000
Successfully processed row: 10000
Successfully processed row: 15000
Successfully processed row: 20000
Successfully processed row: 25000
Successfully processed row: 30000
Successfully processed row: 35000
Successfully processed row: 40000
Successfully processed row: 45000
Successfully processed row: 50000
Successfully processed row: 55000
Successfully processed row: 60000
Successfully processed row: 65000
Successfully processed row: 70000
Successfully processed row: 75000
Successfully processed row: 80000
Successfully processed row: 85000
Successfully processed row: 90000
Successfully processed row: 95000
Successfully processed row: 100000
Successfully processed row: 105000
Successfully processed row: 110000
Successfully processed row: 115000
Finished processing rows
Saved to:  google_data.csv


## Build New Features From Dataframe

In [14]:
from geopy.geocoders import Nominatim
gl = Nominatim(user_agent="Google Data App v1.0", timeout=10)

class GenerateFeatures:
    def __init__(self):
        self.known_locations = {}
        self.df = None
        
    def fit(self, df=None, y=None):
        return self

    def transform(self, df):
        df['Date'] = pd.to_datetime(df['Date'], utc=True)
        df['Day'] = df['Date'].dt.date
        df['Mobile'] = self.on_mobile(df)
        df['Site'] = df['URL'].apply(self.extract_site_name)
    
        # Converting coordiantes to addresses is very slow.
        df['City'] = df['Location'].apply(lambda x: self.get_address_from_coords(x))
        self.df = df
        return df
    
    def extract_site_name(self, row):
    
        row = row.replace('//amp.', '').replace('.m.', '')

        url_www = row.split('www.')
        url_https = row.split('//')
        url_mob = row.split('//m.')

        if len(url_mob) > 1:
            cleaned_row = url_mob[1].split('/')[0].split('.')[0]
            return cleaned_row.title()
        elif len(url_www) > 1:
            cleaned_row = url_www[1].split('/')[0].split('.')[0]
            return cleaned_row.title()
        elif len(url_https) > 1:
            cleaned_row = url_https[1].split('/')[0].split('.')[0]
            return cleaned_row.title()
        return row.title()

    def on_mobile(self, row):
        url_mob_formats = ['www.m.', 'https//m.', 'http://m.', '//amp.']

        for elem in url_mob_formats:
            if elem in row:
                return True
        return False
    
    def get_address_from_coords(self, row):
        """ Given a latitude & longitude --> return an approximate address. """
        if row:
            # Split the row into latitude and longitude from the URL.
            long, lat = row.split('@')[1].split(',')[0:2]
            coordinates = str(round(float(long), 2)) + ' ' + str(round(float(lat), 2))
            try:
                # Skip API call is key exists in dictionary (more server friendly)
                return self.known_locations[coordinates]
            except KeyError:
                city = gl.reverse(f"{long}, {lat}").address
                self.known_locations[coordinates] = city
                return city
        else:
            return None
    
    def save_file(self, file_name='google_data', file_type='csv'):
        file = file_name + '.' + file_type.lower()
        
        if file_type.lower() == 'csv':
            self.df.to_csv(file)
        if file_type.lower() == "xlsx":
            self.df.to_excel(file)
        
        print(f'Saved transformed dataset to: "{file}"')
        
df_new_feats = GenerateFeatures().transform(gd.df)
df_new_feats.save_file(file_name=OUTPUT_FILE, file_type=OUTPUT_TYPE)

Unnamed: 0,Type,Query,Date,URL,Location,Day,Mobile,Site,City
0,Searched for,ayurveda dataset,2020-09-19 05:26:40+00:00,https://www.google.com/search?q=ayurveda+dataset,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-19,False,Google,"East River, New York County, New York, United ..."
3,Visited,How Data Mining is useful in Ayurveda - Journa...,2020-09-19 05:17:48+00:00,http://www.ayurvedjournal.com/JAHM_201623_01.p...,,2020-09-19,False,Ayurvedjournal,
6,Visited,Graph Algorithms: Practical Examples in Apache...,2020-09-09 03:11:32+00:00,https://www.goodreads.com/book/show/42832585-g...,,2020-09-09,False,Goodreads,
9,Searched for,Graph Algorithms: Practical Examples in Apache...,2020-09-09 03:11:23+00:00,https://www.google.com/search?q=Graph+Algorith...,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-09,False,Google,"East River, New York County, New York, United ..."
12,Searched for,graph analytics data science book,2020-09-09 03:10:53+00:00,https://www.google.com/search?q=graph+analytic...,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-09,False,Google,"East River, New York County, New York, United ..."
...,...,...,...,...,...,...,...,...,...
116052,Visited,http://www.stack.com/2013/11/11/at-home-should...,2015-08-17 01:21:25+00:00,http://www.stack.com/2013/11/11/at-home-should...,,2015-08-17,False,Stack,
116055,Visited,http://ashotofadrenaline.net/20-hardest-should...,2015-08-17 01:15:51+00:00,http://ashotofadrenaline.net/20-hardest-should...,,2015-08-17,False,Ashotofadrenaline,
116058,Searched for,no weight shoulder exercises,2015-08-17 01:15:40+00:00,https://www.google.com/search?q=no+weight+shou...,,2015-08-17,False,Google,
116061,Visited,http://www.mensfitness.com/training/workout-ro...,2015-08-17 01:11:06+00:00,http://www.mensfitness.com/training/workout-ro...,,2015-08-17,False,Mensfitness,


### Cluster Top Search Queries

In [123]:
# LDA known not to be perfect for short-text topic modeling, but requires additionally library for GSDMM implementation of modified LDA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

class ModelData:
    def __init__(self):
        self.df = None
        self.model_name = None
        self.my_punc = '[!"$·%&\'(”…#)—*+,-./:;<=>?“[\\]^_`{|}~•–@®]'
        self.my_stopwords = set(stopwords.words('english'))
        self.stemmer = PorterStemmer(ignore_stopwords=False).stem
        
    def remove_urls(self, query):
        """ Queries that are Google Search URLs should be filtered out. """
        if 'http' in query or 'www.' in query:
            return None
        else:
            return query

    def remove_punct(self, query, custom_punc=None):
        """ Punctuation, especially contractions will harm data quality. Filter it out and return a list of token. """
        if custom_punc:
            self.my_punc = custom_punc

        tokens = []
        
        # Split query into words, check if each word contains punctuation. If so, remove and add to tokens list. Otherwise just append as-is.
        for word in query.split():
            word_clean = ''
            for char in word:
                if char not in self.my_punc:
                    word_clean += char
            tokens.append(word_clean)
        return ' '.join(tokens)        

    def remove_stopwords(self, query):
        """ Remove the common stopwords from queries. Note: also removes negations (e.g. no, un, not) since they're stopwords. """
        tokens = word_tokenize(query)
        tokens = [word for word in tokens
                        if word not in self.my_stopwords] 
        return " ".join(tokens)

    def stem_query(self, query):
        """ Returns the stem of each word in the query. """
        tokens = []
        for word in query.split():
            tokens.append(self.stemmer(word))
        return ' '.join(tokens)

    def clean_string(self, query, bigrams=False):
        """ Performs the actual query cleaning. """
        tokens = None
        # Basic cleanup
        query = query.lower()
        query = self.remove_urls(query)
        
        # Check if query is still valid. Process for modeling if so.
        if query:
            query = self.remove_punct(query)
            query = self.remove_stopwords(query)
            query = self.stem_query(query)
            tokens = query
        
        # Combine two elements to see in bigrams.
        if bigrams:
            tokens = tokens + [tokens[i] + '_' + tokens[i+1]
                                    for i in range(len(tokens)-1)]
        return tokens        
    
    def fit(self, df):
        """ Transforms the entire Series. """
        self.df = df
        self.df['Query Clean'] = df['Query'].apply(self.clean_string)
        return self
    
    def drop_null_queries(self):
        return self.df[self.df['Query Clean'].notna()]
    
    def _vectorize(self):
        """ Convert each query string into a matrix of token counts. """
        df = self.drop_null_queries()
        # Transforms text into vector
        vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')
        print(f'Modeling on {len(df)} queries')
        # Apply transformation
        tf = vectorizer.fit_transform(df['Query Clean']).toarray()

        # tf_feature_names gets the meaning of each column word.
        tf_feature_names = vectorizer.get_feature_names()
        return tf, tf_feature_names
    
    def _display_topics(self, model, feature_names, no_top_words):
        """ Format the results of vectorization for display in table. Code from: https://bit.ly/3lUIWJZ """
        topic_dict = {}
        for topic_idx, topic in enumerate(model.components_):
            topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                            for i in topic.argsort()[:-no_top_words - 1:-1]]
            topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                            for i in topic.argsort()[:-no_top_words - 1:-1]]
        return pd.DataFrame(topic_dict)
    
    def generate_topics(self, model_name=None, num_topics=None, num_top_words=None):
        """ Takes a model type, # of topics, # of words per topic, and returns the topics in a table. """
        if not num_topics:
            num_topics = 10
        if not num_top_words:
            num_top_words = 5
            
        tf, tf_feature_names = self._vectorize()
        
        # Assume either LDA or NMF parameter is passed.
        if model_name == 'LDA':
            lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
            lda.fit(tf)
            return self._display_topics(lda, tf_feature_names, num_top_words)
        if model_name == 'NMF':
            nmf = NMF(n_components=num_topics, random_state=0, alpha=.1, l1_ratio=.5)
            nmf.fit(tf)
            return self._display_topics(nmf, tf_feature_names, num_top_words)
        else:
            print('Unsupported model type')
        

In [125]:
model = ModelData()
model.fit(gd.df)

<__main__.ModelData at 0x7fe8c6f71d00>

In [129]:
topics_lda = model.generate_topics(model_name='LDA', num_topics=10, num_top_words=5)

Modeling on 30223 queries


In [None]:
topics_lda

In [127]:
topics_nmf = model.generate_topics(model_name='NMF', num_topics=10, num_top_words=5)

Modeling on 30223 queries




In [128]:
topics_nmf

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,python,8.9,ben,5.5,mac,8.5,best,8.2,data,7.0,stack,6.1,colleg,6.0,linux,5.5,reddit,7.6,download,7.1
1,get,1.0,greenfield,5.4,torrent,0.8,2020,0.9,scienc,3.6,overflow,5.1,health,2.1,kali,4.3,io,1.3,torrent,1.8
2,string,0.8,fit,2.0,x,0.8,onlin,0.7,scientist,0.7,use,1.3,claremont,1.7,forum,1.8,jailbreak,0.7,mp3,1.7
3,list,0.7,workout,0.4,os,0.8,cours,0.6,learn,0.4,get,1.1,pitzer,1.1,instal,1.1,vs,0.4,io,1.3
4,file,0.6,healthi,0.3,app,0.7,app,0.6,machin,0.4,file,0.8,pomona,0.9,raspberri,0.7,coupon,0.4,free,1.2


### Now Obselete

In [75]:
# LDA known not to be perfect for short-text topic modeling, but requires additionally library for GSDMM implementation of modified LDA
from nltk.tokenize import word_tokenize
import nltk
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '[!"$·%&\'(”…#)—*+,-./:;<=>?“[\\]^_`{|}~•–@®]'

def remove_urls(query):
    """ Queries that are Google Search URLs should be filtered out. """
    if 'http' in query or 'www.' in query:
        return None
    else:
        return query

def remove_punct(query, custom_punc=None):
    """ Punctuation, especially contractions will harm data quality. Filter it out and return a list of token. """
    if custom_punc:
        my_punctuation = custom_punc
    else:
        my_punctuation = '[!"$·%&\'(”…#)—*+,-./:;<=>?“[\\]^_`{|}~•–@®]'

    tokens = []
    
    # Split query into words, check if each word contains punctuation. If so, remove and add to tokens list. Otherwise just append as-is.
    for word in query.split():
        word_clean = ''
        for char in word:
            if char not in my_punctuation:
                word_clean += char
        tokens.append(word_clean)
    
    return ' '.join(tokens)        
    
def remove_stopwords(query):
    """ Remove the common stopwords from queries. Note: also removes negations (e.g. no, un, not) since they're stopwords. """
    tokens = word_tokenize(query)
    tokens = [word for word in tokens
                    if word not in my_stopwords] 
    return " ".join(tokens)
    
def stem_query(query):
    tokens = []
    for word in query:
        tokens.append(word_rooter(word))
    return " ".join(tokens)
    
def clean_search(query, bigrams=False):
    # Basic cleanup
    query = query.lower()
    query = remove_urls(query)
    
    # Check if it's still valid. Process for modeling if so.
    if query:
        query = remove_punct(query)
        query = remove_stopwords(query)
        tokens = stem_query(query)
    
    # Combine two elements to see in bigrams.
    if bigrams:
        tokens = tokens + [tokens[i] + '_' + tokens[i+1]
                                for i in range(len(tokens)-1)]
    
    return query

df = gd.df
df['Clean Query'] = df['Query'].apply(clean_search)

In [349]:
# Thanks to https://ourcodingclub.github.io/tutorials/topic-modelling-python/#who_what

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(search_query_df['Clean Query']).toarray()

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names()
# tf_feature_names

### Create Topic Table

In [71]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

### LDA

In [72]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
model.fit(tf)


LatentDirichletAllocation(random_state=0)

In [135]:
no_top_words = 5

print('My Top Google Searches (LDA):')
display_topics(model, tf_feature_names, no_top_words)

My Top Google Searches (LDA):


AttributeError: 'ModelData' object has no attribute 'components_'

### NMF

In [51]:
from sklearn.decomposition import NMF

nmf = NMF(n_components=number_of_topics, random_state=0, alpha=.1, l1_ratio=.5)

nmf.fit(tf)



NMF(alpha=0.1, l1_ratio=0.5, n_components=10, random_state=0)

In [138]:
print('My Top Google Searches (NMF):')
display_topics(nmf, tf_feature_names, no_top_words)

My Top Google Searches (NMF):


IndexError: list index out of range

In [146]:
gd.df

Unnamed: 0,Type,Query,Date,URL,Location,Day,Mobile,City,Site,Clean Query,Query Clean
0,Searched for,ayurveda dataset,2020-09-19 05:26:40+00:00,https://www.google.com/search?q=ayurveda+dataset,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-19,False,"East River, New York County, New York, United ...",Google,ayurveda dataset,ayurveda dataset
3,Visited,How Data Mining is useful in Ayurveda - Journa...,2020-09-19 05:17:48+00:00,http://www.ayurvedjournal.com/JAHM_201623_01.p...,,2020-09-19,False,,Ayurvedjournal,data mining useful ayurveda journal ayurvedic,how data mining is useful in ayurveda journal...
6,Visited,Graph Algorithms: Practical Examples in Apache...,2020-09-09 03:11:32+00:00,https://www.goodreads.com/book/show/42832585-g...,,2020-09-09,False,,Goodreads,graph algorithms practical examples apache spa...,graph algorithms practical examples in apache ...
9,Searched for,Graph Algorithms: Practical Examples in Apache...,2020-09-09 03:11:23+00:00,https://www.google.com/search?q=Graph+Algorith...,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-09,False,"East River, New York County, New York, United ...",Google,graph algorithms practical examples apache spa...,graph algorithms practical examples in apache ...
12,Searched for,graph analytics data science book,2020-09-09 03:10:53+00:00,https://www.google.com/search?q=graph+analytic...,"https://www.google.com/maps/@40.723780,-73.971...",2020-09-09,False,"East River, New York County, New York, United ...",Google,graph analytics data science book,graph analytics data science book
...,...,...,...,...,...,...,...,...,...,...,...
116052,Visited,http://www.stack.com/2013/11/11/at-home-should...,2015-08-17 01:21:25+00:00,http://www.stack.com/2013/11/11/at-home-should...,,2015-08-17,False,,Stack,,
116055,Visited,http://ashotofadrenaline.net/20-hardest-should...,2015-08-17 01:15:51+00:00,http://ashotofadrenaline.net/20-hardest-should...,,2015-08-17,False,,Ashotofadrenaline,,
116058,Searched for,no weight shoulder exercises,2015-08-17 01:15:40+00:00,https://www.google.com/search?q=no+weight+shou...,,2015-08-17,False,,Google,weight shoulder exercises,no weight shoulder exercises
116061,Visited,http://www.mensfitness.com/training/workout-ro...,2015-08-17 01:11:06+00:00,http://www.mensfitness.com/training/workout-ro...,,2015-08-17,False,,Mensfitness,,
