In [14]:
import errno
import json
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urlsplit, parse_qsl

import pandas as pd
from dateutil.relativedelta import relativedelta
from serpapi import GoogleSearch

from config import API_KEY_SER, coop_list, coop_ids

### Overview: 
In this project, I gathered the Google Reviews of an electricity co-op (Co-op A) using SerpApi, processed the data (in a separate notebook), and analyzed it. In this notebook, I will get the data from SerpApi, process the data, and store it in a data folder.

## 1. Pull Data Method

In [15]:
def pull_data(data_id):
    """Download every Google Maps review for one place through SerpApi.

    Pages through the ``google_maps_reviews`` engine (newest first) until the
    API reports an error or no further page is advertised.

    Parameters
    ----------
    data_id : str
        Place id taken from the ``data=`` query parameter of the Google Maps
        place URL.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``page``, ``name``, ``link``,
        ``thumbnail``, ``rating``, ``date``, ``snippet``, ``images`` and
        ``local_guide``. The columns are present even when no review was
        returned, so the frame can be written to CSV and read back.
    """
    params = {
      "api_key": API_KEY_SER,                   # your api key
      "engine": "google_maps_reviews",                    # serpapi search engine
      "hl": "en",                                         # language of the search
      "sort_by":"newestFirst",
      "data_id": data_id  # place id data located inside Google Maps Place URL: located inside `data=` query parameter.
    }
    search = GoogleSearch(params)

    # Fixed schema so an empty result still produces a frame with headers.
    columns = [
        "page", "name", "link", "thumbnail", "rating",
        "date", "snippet", "images", "local_guide",
    ]
    reviews = []

    page_num = 0
    while True:
        page_num += 1
        results = search.get_dict()

        print(f"Extracting reviews from {page_num} page.")

        if "error" in results:
            print(results["error"])
            break

        for result in results.get("reviews") or []:  # no reviews -> nothing to add
            # A review without a "user" object must not abort the whole pull.
            user = result.get("user") or {}
            reviews.append({
                "page": page_num,
                "name": user.get("name"),
                "link": user.get("link"),
                "thumbnail": user.get("thumbnail"),
                "rating": result.get("rating"),
                "date": result.get("date"),
                "snippet": result.get("snippet"),
                "images": result.get("images"),
                "local_guide": user.get("local_guide"),
            })

        pagination = results.get("serpapi_pagination") or {}
        next_url = pagination.get("next")
        if not (next_url and pagination.get("next_page_token")):
            break

        # Copy the query parameters of the "next" URL into the search so the
        # following get_dict() call fetches the next page.
        search.params_dict.update(dict(parse_qsl(urlsplit(next_url).query)))

    return pd.DataFrame(reviews, columns=columns)

## 2. Estimate Date methods

In [16]:
def find_estimated_date(date, reference_date=None):
    """Convert a relative date string into an estimated absolute datetime.

    Accepts text containing ``"<count> <unit>"`` where ``count`` is a number
    or the article ``a``/``an`` (meaning 1) and ``unit`` is one of second,
    minute, hour, day, week, month or year, singular or plural. Examples:
    ``"a day ago"``, ``"3 weeks ago"``, ``"an hour ago"``.

    Parameters
    ----------
    date : str
        Relative date text.
    reference_date : datetime.datetime, optional
        The moment the relative text is measured from. Defaults to
        ``datetime.today()``. Pass the time the data was pulled when the
        text is processed later than it was downloaded.

    Returns
    -------
    datetime.datetime
        ``reference_date`` moved back by the parsed amount.

    Raises
    ------
    TypeError
        If ``date`` is not a string (for example a missing value).
    ValueError
        If no ``"<count> <unit>"`` pattern is found in ``date``.
    """
    if not isinstance(date, str):
        raise TypeError(
            "relative date must be a string, got {!r}".format(date)
        )

    match = re.search(
        r"\b(an?|\d+)\s+(second|minute|hour|day|week|month|year)s?\b",
        date,
        re.IGNORECASE,
    )
    if match is None:
        raise ValueError("unrecognized relative date: {!r}".format(date))

    if reference_date is None:
        reference_date = datetime.today()

    count_text = match.group(1).lower()
    unit = match.group(2).lower()
    count = 1 if count_text in ("a", "an") else int(count_text)

    # Fixed-length units are handled by timedelta; months and years vary in
    # length and need calendar-aware arithmetic from relativedelta.
    if unit in ("second", "minute", "hour", "day", "week"):
        return reference_date - timedelta(**{unit + "s": count})
    return reference_date - relativedelta(**{unit + "s": count})

def find_year(date):
    """Return the calendar year of the date estimated from a relative date string."""
    return find_estimated_date(date).year

def apply_find_year(row):
    """Row adapter: estimate the review year from the row's ``date`` field."""
    relative_date = row['date']
    return find_year(relative_date)

def find_month(date):
    """Return the month number of the date estimated from a relative date string."""
    return find_estimated_date(date).month

def apply_find_month(row):
    """Row adapter: estimate the review month from the row's ``date`` field."""
    relative_date = row['date']
    return find_month(relative_date)

## 3. Clean data method

In [17]:
# Adds a Year column (a Month column can be added the same way) and drops the link, thumbnail, images, and local_guide columns
def clean_data(data):
    """Build the processed review table from a raw review frame.

    Estimates a ``Year`` for every review from its relative ``date`` text,
    renames ``name``/``rating``/``snippet`` to ``Customer Name``/``Rating``/
    ``Review``, and keeps only those four columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw reviews; must contain the columns ``name``, ``rating``,
        ``snippet`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Customer Name``, ``Year``, ``Rating``, ``Review``.
    """
    # assign() returns a new frame, so the caller's data gains no Year column.
    with_year = data.assign(Year=data["date"].map(find_year))
    renamed = with_year.rename(
        columns={"name": "Customer Name", "rating": "Rating", "snippet": "Review"}
    )
    return renamed[["Customer Name", "Year", "Rating", "Review"]].copy()

## 4. Store data method

In [18]:
def check_exists(path):
    """Tell whether ``path`` exists on the filesystem (file or directory)."""
    path_is_present = os.path.exists(path)
    return path_is_present

def makedir_exist_ok(path):
    """Create directory ``path`` (and any missing parents) if needed.

    Does nothing when the directory already exists or when ``path`` is
    empty. An empty path is what ``os.path.dirname`` yields for a bare
    filename, and it denotes the current directory.

    Parameters
    ----------
    path : str
        Directory to create.

    Raises
    ------
    OSError
        If the directory cannot be created, including when ``path`` exists
        but is not a directory.
    """
    if not path:
        # os.makedirs("") raises FileNotFoundError; nothing needs creating.
        return
    os.makedirs(path, exist_ok=True)
    return

#Save data
def save(input, path):
    """Write a DataFrame to ``path`` as CSV without the index column.

    The parent directory of ``path`` is created first when it is missing.

    Parameters
    ----------
    input : pandas.DataFrame
        Frame to write (any object with a compatible ``to_csv`` works).
    path : str
        Destination file path.
    """
    dirname = os.path.dirname(path)
    # A bare filename has no directory part; there is nothing to create then.
    if dirname:
        makedir_exist_ok(dirname)
    input.to_csv(path, index=False)
    return

## 5. Main

In [22]:
def main(new_pull=False, new_process=True, root='data'):
    """Pull and/or process the reviews of every co-op in ``coop_list``.

    For each co-op the raw reviews live in ``<root>/raw/<coop>_raw.csv`` and
    the processed reviews in ``<root>/processed/<coop>_processed.csv``.

    Parameters
    ----------
    new_pull : bool, default False
        When true, download the reviews again and overwrite the raw CSV.
    new_process : bool, default True
        When true, read the raw CSV, clean it and write the processed CSV.
    root : str, default 'data'
        Folder that holds the ``raw`` and ``processed`` sub-folders.
    """
    raw_folder = os.path.join(root, 'raw')
    processed_folder = os.path.join(root, 'processed')
    for coop in coop_list:
        data_id = coop_ids[coop]
        raw_data_path = os.path.join(raw_folder, '{}_raw.csv'.format(coop))
        processed_data_path = os.path.join(processed_folder, '{}_processed.csv'.format(coop))

        if new_pull:
            # Store the raw data so later runs can process without pulling.
            save(pull_data(data_id), raw_data_path)

        if new_process:
            # Without a raw file there is nothing to process; skip this co-op
            # instead of aborting the remaining ones.
            if not check_exists(raw_data_path):
                print("{} raw data not found at {}; skipped (run with new_pull=True first)".format(coop, raw_data_path))
                continue
            # Read the raw data back from disk, clean it, store the result.
            raw_data_read = pd.read_csv(raw_data_path)
            processed_data = clean_data(raw_data_read)
            save(processed_data, processed_data_path)
        print("{} Data Processing Completed".format(coop))
    return

In [None]:
# Entry point: run the pipeline only when this code is the main module, not when it is imported.
if __name__ == "__main__":
    main()