In [12]:
import logging
import os
import pandas as pd
from urllib.parse import urlparse
from typing import Dict
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest



In [29]:
def get_ga4_data(property_id: str, days_ago: int = 30):
    client = BetaAnalyticsDataClient()
    
    try:
        # Перший запит - основні метрики
        request1 = RunReportRequest(
            property=f"properties/{property_id}",
            date_ranges=[DateRange(start_date=f"{days_ago}daysAgo", end_date="today")],
            dimensions=[
                Dimension(name=d) for d in [
                    "sessionCampaignId", "sessionCampaignName", "sessionSource",
                    "hostName", "country", "pagePathPlusQueryString", 
                    "deviceCategory", "date", "userId"
                ]
            ],
            metrics=[
                Metric(name=m) for m in ["sessions", "publisherAdImpressions", "publisherAdClicks", "totalRevenue"]
            ]
        )
        
        # Другий запит - додаткові дані про сторінки
        request2 = RunReportRequest(
            property=f"properties/{property_id}",
            date_ranges=[DateRange(start_date=f"{days_ago}daysAgo", end_date="today")],
            dimensions=[
                Dimension(name=d) for d in [
                    "date", "hostName", "pagePath", 
                    "landingPage", "pageReferrer", "userId"
                ]
            ],
            metrics=[
                Metric(name=m) for m in ["screenPageViews", "bounceRate"]
            ]
        )
        
        response1 = client.run_report(request1)
        response2 = client.run_report(request2)
        
        # Обробка першого response
        data1 = []
        for row in response1.rows:
            d = row.dimension_values
            m = row.metric_values
            
            blocks = None
            if 'utm_blocks=' in d[5].value:
                try: blocks = d[5].value.split('utm_blocks=')[1].split('&')[0]
                except: pass
                    
            data1.append({
                'campaign_id': d[0].value,
                'campaign_name': d[1].value,
                'network': d[2].value,
                'domain': d[3].value,
                'geo': d[4].value,
                'blocks': blocks,
                'device_type': d[6].value,
                'date': pd.to_datetime(d[7].value),
                'user_id': d[8].value,
                'sessions': int(float(m[0].value)),
                'impressions': int(float(m[1].value)),
                'clicks': int(float(m[2].value)),
                'revenue': float(m[3].value)
            })
        
        # Обробка другого response
        data2 = []
        for row in response2.rows:
            d = row.dimension_values
            m = row.metric_values
            
            data2.append({
                'date': pd.to_datetime(d[0].value),
                'domain': d[1].value,
                'page_path': d[2].value,
                'landing_page': d[3].value,
                'referrer': d[4].value,
                'user_id': d[5].value,
                'page_views': int(float(m[0].value)),
                'bounce_rate': float(m[1].value)
            })
        
        # Створюємо DataFrame'и
        df1 = pd.DataFrame(data1)
        df2 = pd.DataFrame(data2)
        
        # Об'єднуємо за date, domain та user_id
        merged_df = pd.merge(df1, df2, on=['date', 'domain', 'user_id'], how='outer')
        
        # Групуємо за user_id
        grouped_df = merged_df.groupby('user_id').agg({
            'sessions': 'sum',
            'impressions': 'sum',
            'clicks': 'sum',
            'revenue': 'sum',
            'page_views': 'sum',
            'bounce_rate': 'mean',
            'campaign_id': 'first',
            'campaign_name': 'first',
            'network': 'first',
            'domain': 'first',
            'geo': 'first',
            'blocks': 'first',
            'device_type': 'first',
            'date': 'min',
            'page_path': 'first',
            'landing_page': 'first',
            'referrer': 'first'
        }).reset_index()
        
        return grouped_df
        
    except Exception as e:
        print(f"Error accessing GA4: {str(e)}")
        raise

In [30]:
sites_data = {
    "online-dating-review.net": 450191495,
    "avodate.com": 350536871,
    "datempire.com": 358067421,
    "feelflame.com": 358106858,
    "latidate.com": 358050088,
    "myspecialdates.com": 322504563,
    "okamour.com": 350538354,
    "sakuradate.com": 358590047,
    "sofiadate.com": 322587243,
    "loveforheart.com": 322569296
}

for domain, ga_id in sites_data.items():
    try:
        df = get_ga4_data(str(ga_id), days_ago=30)
        df.to_csv(f"ga4_data_{domain.split('.')[0]}.csv", index=False)
        print(f"Processed: {domain}")
    except Exception as e:
        print(f"Error with {domain}: {e}")

Error accessing GA4: 400 Did you mean itemId? Field userId is not a valid dimension. For a list of valid dimensions and metrics, see https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema 
Error with online-dating-review.net: 400 Did you mean itemId? Field userId is not a valid dimension. For a list of valid dimensions and metrics, see https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema 
Error accessing GA4: 400 Did you mean itemId? Field userId is not a valid dimension. For a list of valid dimensions and metrics, see https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema 
Error with avodate.com: 400 Did you mean itemId? Field userId is not a valid dimension. For a list of valid dimensions and metrics, see https://developers.google.com/analytics/devguides/reporting/data/v1/api-schema 
Error accessing GA4: 400 Did you mean itemId? Field userId is not a valid dimension. For a list of valid dimensions and metrics, 

In [21]:
credentials_path = "/Users/mac/Downloads/Redshift integration-bdb44d4849b7.json"
test_site = {"online-dating-review.net": 450191495}

extractor = GA4DataExtractor(credentials_path)
extractor.process_sites(test_site)

2024-11-19 16:10:42,279 - INFO - Processing online-dating-review.net
2024-11-19 16:10:44,848 - ERROR - Error accessing GA4: 'date'
2024-11-19 16:10:44,849 - ERROR - Error processing online-dating-review.net: 'date'
