<a href="https://colab.research.google.com/github/mtrx-fin/Paper24/blob/main/customer_list_for_meta_audience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#1. Загружаем df_bg из BigQuery

In [None]:
!pip install --quiet google-cloud-bigquery pandas_gbq

from google.colab import auth
auth.authenticate_user()

from google.cloud import bigquery

project_id = "mtrx-analytics-bigquery-4e26"   # <-- твой project_id

client = bigquery.Client(project=project_id)

query = """
SELECT
  id AS user_id,
  FORMAT_TIMESTAMP('%y-%m-%d', createdAt),
  lastActivityAt,
  firstName,
  lastName,
  city as bgCity,
  country as bgCountry,
  email as bgEmail,
  emailConfirmed,
  statistic
FROM `mtrx-analytics-bigquery-4e26.analytics.users`
WHERE role = 'Customer'
"""

df_bq = client.query(query).to_dataframe()

df_bq.info()
print(len(df_bq))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83363 entries, 0 to 83362
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   user_id         83363 non-null  object             
 1   f0_             83363 non-null  object             
 2   lastActivityAt  83363 non-null  datetime64[us, UTC]
 3   firstName       47048 non-null  object             
 4   lastName        44724 non-null  object             
 5   bgCity          83363 non-null  object             
 6   bgCountry       83363 non-null  object             
 7   bgEmail         83363 non-null  object             
 8   emailConfirmed  83363 non-null  object             
 9   statistic       83363 non-null  object             
dtypes: datetime64[us, UTC](1), object(9)
memory usage: 6.4+ MB
83363


#2. Загружаем df_seg из Google Sheets

In [None]:
!pip install --quiet gspread gspread_dataframe

from google.colab import auth
auth.authenticate_user()  # окно авторизации Google

import gspread
from gspread_dataframe import get_as_dataframe
from google.auth import default

# Получаем креды из окружения Colab
creds, _ = default()
gc = gspread.authorize(creds)

# Открываем таблицу по ID
sh = gc.open_by_key("1VsCFEXV4-RqolOkf7eAQTe78QmgyO5XgG33zXKFtZic")

# Берём нужный лист (вкладка "abcd segmentation")
ws = sh.worksheet("abcd segmentation")   # если имя отличается – подправь

# Грузим в pandas
df_seg = get_as_dataframe(
    ws,
    evaluate_formulas=True,
    header=0    # Важно: первая строка = заголовки
)

df_seg.info(), len(df_seg)
print(df_seg.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83364 entries, 0 to 83363
Data columns (total 37 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Customer ID                    83364 non-null  object 
 1   Category                       83364 non-null  object 
 2   Email                          83364 non-null  object 
 3   Country                        83364 non-null  object 
 4   First Name                     83364 non-null  object 
 5   Last Name                      83360 non-null  object 
 6   Orders qty                     83364 non-null  int64  
 7   Tips qty                       83364 non-null  int64  
 8   Pages qty                      83364 non-null  float64
 9   Total Value                    83364 non-null  float64
 10  Duration (Days)                83364 non-null  float64
 11  Avg Order Value                83364 non-null  float64
 12  Avg Work Volume (Pages)        83364 non-null 

In [None]:
print(df_seg.columns.tolist())
print(df_bq.columns.tolist())


['Customer ID', 'Category', 'Email', 'Country', 'First Name', 'Last Name', 'Orders qty', 'Tips qty', 'Pages qty', 'Total Value', 'Duration (Days)', 'Avg Order Value', 'Avg Work Volume (Pages)', 'Tip Rate', 'Days Since Last Order', 'Recency Category', 'Self Approved Orders qty', 'Order Self Approve Rate', 'Canceled Orders qty', 'Order Canceled Rate', 'Revision Qty', 'Revision Rate', 'Revision v2 Qty', 'Revision Rate v2', 'AOV Per Category', 'Avg Orders qty Per Category', 'Customers qty Per Category', 'Avg User Revenue Per Category', 'LTV', 'Recency value', 'Frequency value', 'Monetary value', 'Recency score', 'Frequency score', 'Monetary score', 'Total Score', 'RFM Category']
['user_id', 'f0_', 'lastActivityAt', 'firstName', 'lastName', 'bgCity', 'bgCountry', 'bgEmail', 'emailConfirmed', 'statistic']


Готовим df_seg под merge

In [None]:
import pandas as pd

# переименуем нужные поля
df_seg_renamed = df_seg.rename(columns={
    'Customer ID': 'user_id',
    'Email': 'segEmail',
    'Country': 'segCountry',
    'First Name': 'segFirstName',
    'Last Name': 'segLastName',
    'Total Value': 'value',
    'Total Score': 'rfmScore'
})

# оставим только нужные колонки
df_seg_small = df_seg_renamed[
    ['user_id', 'segEmail', 'segCountry', 'segFirstName', 'segLastName', 'value', 'rfmScore']
]

In [None]:
# на всякий случай приводим ID к строке в обоих фреймах
df_bq['user_id'] = df_bq['user_id'].astype(str)
df_seg_small['user_id'] = df_seg_small['user_id'].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_seg_small['user_id'] = df_seg_small['user_id'].astype(str)


Делаем INNER JOIN по user_id и собираем финальную таблицу

In [None]:
df_merged = df_bq.merge(
    df_seg_small,
    on='user_id',
    how='inner'   # только пересечение по user_id
)

df_merged.info(), len(df_merged)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83363 entries, 0 to 83362
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   user_id         83363 non-null  object             
 1   f0_             83363 non-null  object             
 2   lastActivityAt  83363 non-null  datetime64[us, UTC]
 3   firstName       47048 non-null  object             
 4   lastName        44724 non-null  object             
 5   bgCity          83363 non-null  object             
 6   bgCountry       83363 non-null  object             
 7   bgEmail         83363 non-null  object             
 8   emailConfirmed  83363 non-null  object             
 9   statistic       83363 non-null  object             
 10  segEmail        83363 non-null  object             
 11  segCountry      83363 non-null  object             
 12  segFirstName    83363 non-null  object             
 13  segLastName     83359 non-null 

(None, 83363)

Очистка колонок segFirstName, segLastName, segCountry от "Unknown"

In [None]:
import numpy as np

cols_to_clean = ['segFirstName', 'segLastName', 'segCountry']
df_merged[cols_to_clean] = df_merged[cols_to_clean].replace("Unknown", np.nan)
df_merged[cols_to_clean].isna().sum()


Unnamed: 0,0
segFirstName,36325
segLastName,41054
segCountry,1629


In [None]:
import numpy as np
import pandas as pd

# 1. Находим все объектные (строковые) колонки
str_cols = df_merged.select_dtypes(include='object').columns

# 2. Приводим None → NaN, чтобы всё было единообразно
df_merged[str_cols] = df_merged[str_cols].replace({None: np.nan})

# 3. Заменяем все NaN в строковых колонках на пустую строку
df_merged[str_cols] = df_merged[str_cols].fillna("")

# Проверяем, остались ли NaN в строковых колонках
df_merged[str_cols].isna().sum().sum()

df_merged.info(), len(df_merged)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83363 entries, 0 to 83362
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   user_id         83363 non-null  object             
 1   f0_             83363 non-null  object             
 2   lastActivityAt  83363 non-null  datetime64[us, UTC]
 3   firstName       83363 non-null  object             
 4   lastName        83363 non-null  object             
 5   bgCity          83363 non-null  object             
 6   bgCountry       83363 non-null  object             
 7   bgEmail         83363 non-null  object             
 8   emailConfirmed  83363 non-null  object             
 9   statistic       83363 non-null  object             
 10  segEmail        83363 non-null  object             
 11  segCountry      83363 non-null  object             
 12  segFirstName    83363 non-null  object             
 13  segLastName     83363 non-null 

(None, 83363)

In [None]:
# Чистим колонку rfmScore от "Not applicable"
df_merged['rfmScore'] = (
    df_merged['rfmScore']
        .replace("Not applicable", 0)   # строку → 0
        .fillna(0)                      # NaN → 0
)
df_merged['rfmScore'] = pd.to_numeric(df_merged['rfmScore'], errors='coerce').fillna(0)
df_merged['rfmScore'].value_counts().head(10)


  .replace("Not applicable", 0)   # строку → 0


Unnamed: 0_level_0,count
rfmScore,Unnamed: 1_level_1
0,80066
5,406
4,388
3,378
12,356
6,343
9,337
7,308
8,298
10,261


округление + форматирование (строго 2 знака)

In [None]:
df_merged['value'] = df_merged['value'].astype(float).round(2).map(lambda x: f"{x:.2f}")
df_merged['value'].sample(10)


Unnamed: 0,value
63961,0.0
15889,0.0
74582,39.68
47903,0.0
8819,313.75
53292,0.0
82433,0.0
9100,0.0
55432,0.0
56231,0.0


In [None]:
# колонки, которые должны быть строками
name_cols = ['firstName', 'lastName', 'segFirstName', 'segLastName', 'segCountry', 'bgCountry']

# шаг 1: приводим None → NaN
df_merged[name_cols] = df_merged[name_cols].replace({None: np.nan})

# шаг 2: заменяем NaN → пустая строка
df_merged[name_cols] = df_merged[name_cols].fillna("")

# убедимся что всё ок:
df_merged[name_cols].head()


Unnamed: 0,firstName,lastName,segFirstName,segLastName,segCountry,bgCountry
0,,,,,Ukraine,Ukraine
1,,,,,Ukraine,Ukraine
2,,,,,Ukraine,Ukraine
3,,,,,Ukraine,Ukraine
4,,,,,United States,United States


Подсчёт количества совпадений 1-в-1

In [None]:
matches_country = (df_merged['bgCountry'] == df_merged['segCountry']).sum()
matches_email = (df_merged['bgEmail'].str.lower() == df_merged['segEmail'].str.lower()).sum()
matches_fn = (df_merged['firstName'].str.lower() == df_merged['segFirstName'].str.lower()).sum()
matches_ln = (df_merged['lastName'].str.lower() == df_merged['segLastName'].str.lower()).sum()
print("Совпадений по стране:", matches_country)
print("Совпадений по email:", matches_email)
print("Совпадений по first name:", matches_fn)
print("Совпадений по last name:", matches_ln)

Совпадений по стране: 83363
Совпадений по email: 83363
Совпадений по first name: 83323
Совпадений по last name: 83062


Проверяем, где именно возникают несовпадения

In [None]:
# Строки, где имена НЕ совпали
fn_errors = df_merged[
    df_merged['firstName'].str.lower() != df_merged['segFirstName'].str.lower()
]

ln_errors = df_merged[
    df_merged['lastName'].str.lower() != df_merged['segLastName'].str.lower()
]

len(fn_errors), len(ln_errors)

(40, 301)

Смотрим сколько несовпадений вызваны NaN

In [None]:
fn_nan = fn_errors[
    fn_errors['firstName'].isna() | fn_errors['segFirstName'].isna()
]

len(fn_nan), len(fn_errors)


(0, 40)

In [None]:
ln_nan = ln_errors[
    ln_errors['lastName'].isna() | ln_errors['segLastName'].isna()
]

len(ln_nan), len(ln_errors)


(0, 301)

In [None]:
# Словарь: название из таблицы -> ISO-3166-1 alpha-2
country_to_iso = {
    "": "",
    "Albania": "AL",
    "Algeria": "DZ",
    "American Samoa": "AS",
    "Andorra": "AD",
    "Angola": "AO",
    "Anguilla": "AI",
    "Antigua and Barbuda": "AG",
    "Argentina": "AR",
    "Armenia": "AM",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bahamas": "BS",
    "Bahrain": "BH",
    "Barbados": "BB",
    "Belarus": "BY",
    "Belgium": "BE",
    "Belize": "BZ",
    "Bhutan": "BT",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Botswana": "BW",
    "Brazil": "BR",
    "British Virgin Islands": "VG",
    "Brunei": "BN",
    "Bulgaria": "BG",
    "Burundi": "BI",
    "Cabo Verde": "CV",
    "Cambodia": "KH",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chad": "TD",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curacao": "CW",
    "Cyprus": "CY",
    "Czechia": "CZ",
    "Denmark": "DK",
    "Djibouti": "DJ",
    "Dominica": "DM",
    "Dominican Republic": "DO",
    "Ecuador": "EC",
    "Egypt": "EG",
    "El Salvador": "SV",
    "Estonia": "EE",
    "Eswatini": "SZ",
    "Faroe Islands": "FO",
    "Fiji": "FJ",
    "Finland": "FI",
    "France": "FR",
    "French Polynesia": "PF",
    "Gabon": "GA",
    "Gambia": "GM",
    "Georgia": "GE",
    "Germany": "DE",
    "Ghana": "GH",
    "Gibraltar": "GI",
    "Greece": "GR",
    "Greenland": "GL",
    "Grenada": "GD",
    "Guadeloupe": "GP",
    "Guam": "GU",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Guyana": "GY",
    "Haiti": "HT",
    "Honduras": "HN",
    "Hong Kong": "HK",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Iraq": "IQ",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",  # Côte d’Ivoire
    "Jamaica": "JM",
    "Japan": "JP",
    "Jordan": "JO",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kiribati": "KI",
    "Kosovo": "XK",  # неофициальный, но широко используемый код
    "Kuwait": "KW",
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lebanon": "LB",
    "Lesotho": "LS",
    "Liberia": "LR",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Macao": "MO",
    "Madagascar": "MG",
    "Malawi": "MW",
    "Malaysia": "MY",
    "Maldives": "MV",
    "Malta": "MT",
    "Mauritania": "MR",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Monaco": "MC",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Myanmar": "MM",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Macedonia": "MK",
    "Northern Mariana Islands": "MP",
    "Norway": "NO",
    "Oman": "OM",
    "Pakistan": "PK",
    "Palestinian Territory": "PS",
    "Panama": "PA",
    "Papua New Guinea": "PG",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Puerto Rico": "PR",
    "Qatar": "QA",
    "Republic of the Congo": "CG",
    "Reunion": "RE",
    "Romania": "RO",
    "Russia": "RU",
    "Rwanda": "RW",
    "Saint Kitts and Nevis": "KN",
    "Saint Lucia": "LC",
    "Saint Vincent and the Grenadines": "VC",
    "Samoa": "WS",
    "Saudi Arabia": "SA",
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Sierra Leone": "SL",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Solomon Islands": "SB",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tajikistan": "TJ",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "The Netherlands": "NL",
    "Timor Leste": "TL",
    "Togo": "TG",
    "Tonga": "TO",
    "Tunisia": "TN",
    "Turkey": "TR",
    "Turks and Caicos Islands": "TC",
    "U.S. Virgin Islands": "VI",
    "Uganda": "UG",
    "Ukraine": "UA",
    "United Arab Emirates": "AE",
    "United Kingdom": "GB",
    "United States": "US",
    "Uruguay": "UY",
    "Uzbekistan": "UZ",
    "Vanuatu": "VU",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Zambia": "ZM",
    "Zimbabwe": "ZW",
}

# Применяем словарь к обоим датафреймам перед экспортом в Meta
upload_df_order_value['country'] = (
    upload_df_order_value['country']
    .map(country_to_iso)
    .fillna("")        # на случай, если вдруг попадётся значение вне словаря
)

upload_df_score['country'] = (
    upload_df_score['country']
    .map(country_to_iso)
    .fillna("")
)

print(upload_df_order_value['country'].unique()[:50])
print(upload_df_score['country'].unique()[:50])


['UA' 'US' '' 'NG' 'GH' 'IN' 'JP' 'CA' 'ES' 'EG' 'PL' 'LK' 'GB' 'ID' 'DE'
 'PK' 'RO' 'VN' 'KZ' 'SE' 'AE' 'KE' 'JM' 'PE' 'PT' 'LC' 'MA' 'NO' 'IT'
 'ZA' 'AU' 'MU' 'PH' 'TW' 'FR' 'CO' 'QA' 'MX' 'GU' 'PR' 'IE' 'DK' 'NP'
 'BE' 'BH' 'IS' 'IQ' 'CY' 'NL' 'SK']
['UA' 'US' '' 'NG' 'GH' 'IN' 'JP' 'CA' 'ES' 'EG' 'PL' 'LK' 'GB' 'ID' 'DE'
 'PK' 'RO' 'VN' 'KZ' 'SE' 'AE' 'KE' 'JM' 'PE' 'PT' 'LC' 'MA' 'NO' 'IT'
 'ZA' 'AU' 'MU' 'PH' 'TW' 'FR' 'CO' 'QA' 'MX' 'GU' 'PR' 'IE' 'DK' 'NP'
 'BE' 'BH' 'IS' 'IQ' 'CY' 'NL' 'SK']


In [None]:
import numpy as np
import pandas as pd

# 1. Собираем fn и ln по правилу:
#    fn: firstName, если не пустой, иначе segFirstName
#    ln: lastName,  если не пустой, иначе segLastName

df_merged['fn'] = np.where(
    df_merged['firstName'].astype(str).str.strip() != "",
    df_merged['firstName'].astype(str).str.strip(),
    df_merged['segFirstName'].astype(str).str.strip()
)

df_merged['ln'] = np.where(
    df_merged['lastName'].astype(str).str.strip() != "",
    df_merged['lastName'].astype(str).str.strip(),
    df_merged['segLastName'].astype(str).str.strip()
)

# 2. Базовый набор колонок, общий для обоих датафреймов
base_cols = {
    'email':   df_merged['bgEmail'].astype(str).str.strip(),
    'fn':      df_merged['fn'],
    'ln':      df_merged['ln'],
    'st':      df_merged['bgCity'].astype(str).str.strip(),
    'country': df_merged['bgCountry'].astype(str).str.strip()
}

# 3. upload_df_order_value: value = value
upload_df_order_value = pd.DataFrame(base_cols)
upload_df_order_value['value'] = df_merged['value']

# 4. upload_df_score: value = rfmScore
upload_df_score = pd.DataFrame(base_cols)
upload_df_score['value'] = df_merged['rfmScore']

# 5. (рекомендация) выкинуть строки без email — Meta их всё равно не использует
upload_df_order_value = upload_df_order_value[upload_df_order_value['email'] != ""].reset_index(drop=True)
upload_df_score       = upload_df_score[upload_df_score['email']   != ""].reset_index(drop=True)

# Быстрая проверка
print("upload_df_order_value:", upload_df_order_value.shape)
print("upload_df_score:", upload_df_score.shape)

upload_df_order_value.head(), upload_df_score.head()


upload_df_order_value: (83363, 6)
upload_df_score: (83363, 6)


(                        email fn ln       st        country value
 0        solonska.a@gmail.com          Rivne        Ukraine  0.00
 1            olya9k@gmail.com          Rivne        Ukraine  0.00
 2  danielsoloshenko@gmail.com           Kyiv        Ukraine  0.00
 3     cadiji4652@brandoza.com          Rivne        Ukraine  0.00
 4      maxym.kobzar@gmail.com        Trenton  United States  0.00,
                         email fn ln       st        country  value
 0        solonska.a@gmail.com          Rivne        Ukraine      0
 1            olya9k@gmail.com          Rivne        Ukraine      0
 2  danielsoloshenko@gmail.com           Kyiv        Ukraine      0
 3     cadiji4652@brandoza.com          Rivne        Ukraine      0
 4      maxym.kobzar@gmail.com        Trenton  United States      0)

In [None]:
# Уникальные значения стран из BigQuery
uniq_bq = df_merged['bgCountry'].dropna().unique()

# Уникальные значения стран из сегментации
uniq_seg = df_merged['segCountry'].dropna().unique()

# Объединённый набор всех вариантов
uniq_all_countries = sorted(set(list(uniq_bq) + list(uniq_seg)))

uniq_all_countries
for c in uniq_all_countries:
    print(repr(c))


''
'Albania'
'Algeria'
'American Samoa'
'Andorra'
'Angola'
'Anguilla'
'Antigua and Barbuda'
'Argentina'
'Armenia'
'Australia'
'Austria'
'Azerbaijan'
'Bahamas'
'Bahrain'
'Barbados'
'Belarus'
'Belgium'
'Belize'
'Bhutan'
'Bolivia'
'Bosnia and Herzegovina'
'Botswana'
'Brazil'
'British Virgin Islands'
'Brunei'
'Bulgaria'
'Burundi'
'Cabo Verde'
'Cambodia'
'Cameroon'
'Canada'
'Chad'
'Chile'
'China'
'Colombia'
'Costa Rica'
'Croatia'
'Curacao'
'Cyprus'
'Czechia'
'Denmark'
'Djibouti'
'Dominica'
'Dominican Republic'
'Ecuador'
'Egypt'
'El Salvador'
'Estonia'
'Eswatini'
'Faroe Islands'
'Fiji'
'Finland'
'France'
'French Polynesia'
'Gabon'
'Gambia'
'Georgia'
'Germany'
'Ghana'
'Gibraltar'
'Greece'
'Greenland'
'Grenada'
'Guadeloupe'
'Guam'
'Guatemala'
'Guinea'
'Guyana'
'Haiti'
'Honduras'
'Hong Kong'
'Hungary'
'Iceland'
'India'
'Indonesia'
'Iran'
'Iraq'
'Ireland'
'Israel'
'Italy'
'Ivory Coast'
'Jamaica'
'Japan'
'Jordan'
'Kazakhstan'
'Kenya'
'Kiribati'
'Kosovo'
'Kuwait'
'Kyrgyzstan'
'Latvia'
'Lebanon'
'L

Выгрузка в CSV

In [None]:
import numpy as np
import pandas as pd
from google.colab import files

# 0. Словарь ISO кодов
country_to_iso = {
    "": "",
    "Albania": "AL",
    "Algeria": "DZ",
    "American Samoa": "AS",
    "Andorra": "AD",
    "Angola": "AO",
    "Anguilla": "AI",
    "Antigua and Barbuda": "AG",
    "Argentina": "AR",
    "Armenia": "AM",
    "Australia": "AU",
    "Austria": "AT",
    "Azerbaijan": "AZ",
    "Bahamas": "BS",
    "Bahrain": "BH",
    "Barbados": "BB",
    "Belarus": "BY",
    "Belgium": "BE",
    "Belize": "BZ",
    "Bhutan": "BT",
    "Bolivia": "BO",
    "Bosnia and Herzegovina": "BA",
    "Botswana": "BW",
    "Brazil": "BR",
    "British Virgin Islands": "VG",
    "Brunei": "BN",
    "Bulgaria": "BG",
    "Burundi": "BI",
    "Cabo Verde": "CV",
    "Cambodia": "KH",
    "Cameroon": "CM",
    "Canada": "CA",
    "Chad": "TD",
    "Chile": "CL",
    "China": "CN",
    "Colombia": "CO",
    "Costa Rica": "CR",
    "Croatia": "HR",
    "Curacao": "CW",
    "Cyprus": "CY",
    "Czechia": "CZ",
    "Denmark": "DK",
    "Djibouti": "DJ",
    "Dominica": "DM",
    "Dominican Republic": "DO",
    "Ecuador": "EC",
    "Egypt": "EG",
    "El Salvador": "SV",
    "Estonia": "EE",
    "Eswatini": "SZ",
    "Faroe Islands": "FO",
    "Fiji": "FJ",
    "Finland": "FI",
    "France": "FR",
    "French Polynesia": "PF",
    "Gabon": "GA",
    "Gambia": "GM",
    "Georgia": "GE",
    "Germany": "DE",
    "Ghana": "GH",
    "Gibraltar": "GI",
    "Greece": "GR",
    "Greenland": "GL",
    "Grenada": "GD",
    "Guadeloupe": "GP",
    "Guam": "GU",
    "Guatemala": "GT",
    "Guinea": "GN",
    "Guyana": "GY",
    "Haiti": "HT",
    "Honduras": "HN",
    "Hong Kong": "HK",
    "Hungary": "HU",
    "Iceland": "IS",
    "India": "IN",
    "Indonesia": "ID",
    "Iran": "IR",
    "Iraq": "IQ",
    "Ireland": "IE",
    "Israel": "IL",
    "Italy": "IT",
    "Ivory Coast": "CI",
    "Jamaica": "JM",
    "Japan": "JP",
    "Jordan": "JO",
    "Kazakhstan": "KZ",
    "Kenya": "KE",
    "Kiribati": "KI",
    "Kosovo": "XK",
    "Kuwait": "KW",
    "Kyrgyzstan": "KG",
    "Latvia": "LV",
    "Lebanon": "LB",
    "Lesotho": "LS",
    "Liberia": "LR",
    "Libya": "LY",
    "Lithuania": "LT",
    "Luxembourg": "LU",
    "Macao": "MO",
    "Madagascar": "MG",
    "Malawi": "MW",
    "Malaysia": "MY",
    "Maldives": "MV",
    "Malta": "MT",
    "Mauritania": "MR",
    "Mauritius": "MU",
    "Mexico": "MX",
    "Moldova": "MD",
    "Monaco": "MC",
    "Mongolia": "MN",
    "Montenegro": "ME",
    "Morocco": "MA",
    "Mozambique": "MZ",
    "Myanmar": "MM",
    "Namibia": "NA",
    "Nepal": "NP",
    "Netherlands": "NL",
    "New Zealand": "NZ",
    "Nicaragua": "NI",
    "Nigeria": "NG",
    "North Macedonia": "MK",
    "Northern Mariana Islands": "MP",
    "Norway": "NO",
    "Oman": "OM",
    "Pakistan": "PK",
    "Palestinian Territory": "PS",
    "Panama": "PA",
    "Papua New Guinea": "PG",
    "Peru": "PE",
    "Philippines": "PH",
    "Poland": "PL",
    "Portugal": "PT",
    "Puerto Rico": "PR",
    "Qatar": "QA",
    "Republic of the Congo": "CG",
    "Reunion": "RE",
    "Romania": "RO",
    "Russia": "RU",
    "Rwanda": "RW",
    "Saint Kitts and Nevis": "KN",
    "Saint Lucia": "LC",
    "Saint Vincent and the Grenadines": "VC",
    "Samoa": "WS",
    "Saudi Arabia": "SA",
    "Senegal": "SN",
    "Serbia": "RS",
    "Seychelles": "SC",
    "Sierra Leone": "SL",
    "Singapore": "SG",
    "Sint Maarten": "SX",
    "Slovakia": "SK",
    "Slovenia": "SI",
    "Solomon Islands": "SB",
    "Somalia": "SO",
    "South Africa": "ZA",
    "South Korea": "KR",
    "Spain": "ES",
    "Sri Lanka": "LK",
    "Suriname": "SR",
    "Sweden": "SE",
    "Switzerland": "CH",
    "Taiwan": "TW",
    "Tajikistan": "TJ",
    "Tanzania": "TZ",
    "Thailand": "TH",
    "The Netherlands": "NL",
    "Timor Leste": "TL",
    "Togo": "TG",
    "Tonga": "TO",
    "Tunisia": "TN",
    "Turkey": "TR",
    "Turks and Caicos Islands": "TC",
    "U.S. Virgin Islands": "VI",
    "Uganda": "UG",
    "Ukraine": "UA",
    "United Arab Emirates": "AE",
    "United Kingdom": "GB",
    "United States": "US",
    "Uruguay": "UY",
    "Uzbekistan": "UZ",
    "Vanuatu": "VU",
    "Venezuela": "VE",
    "Vietnam": "VN",
    "Zambia": "ZM",
    "Zimbabwe": "ZW",
}

# 1. fn / ln: берём из bq, если не пустые, иначе из сегментации
df_merged['fn'] = np.where(
    df_merged['firstName'].astype(str).str.strip() != "",
    df_merged['firstName'].astype(str).str.strip(),
    df_merged['segFirstName'].astype(str).str.strip()
)

df_merged['ln'] = np.where(
    df_merged['lastName'].astype(str).str.strip() != "",
    df_merged['lastName'].astype(str).str.strip(),
    df_merged['segLastName'].astype(str).str.strip()
)

# 2. Нормализуем value и rfmScore
df_merged['value'] = pd.to_numeric(df_merged['value'], errors='coerce').fillna(0).round(2)
df_merged['rfmScore'] = pd.to_numeric(df_merged['rfmScore'], errors='coerce').fillna(0).round(2)

# 3. Базовые поля
base_cols = {
    'email':   df_merged['bgEmail'].astype(str).str.strip(),
    'fn':      df_merged['fn'],
    'ln':      df_merged['ln'],
    'st':      df_merged['bgCity'].astype(str).str.strip(),
    'country': df_merged['bgCountry'].astype(str).str.strip(),
}

# 4. Датафреймы под Meta
upload_df_order_value = pd.DataFrame(base_cols)
upload_df_order_value['value'] = df_merged['value']

upload_df_score = pd.DataFrame(base_cols)
upload_df_score['value'] = df_merged['rfmScore']

# 5. Конвертация country -> ISO-2
for df in (upload_df_order_value, upload_df_score):
    df['country'] = df['country'].map(country_to_iso).fillna("")

# 6. Убираем записи без email
upload_df_order_value = upload_df_order_value[upload_df_order_value['email'] != ""].reset_index(drop=True)
upload_df_score       = upload_df_score[upload_df_score['email']   != ""].reset_index(drop=True)

print(upload_df_order_value.head())
print(upload_df_order_value['country'].unique()[:20])

# 7. Переименовываем колонки в формат, понятный Meta
rename_map = {
    'email': 'EMAIL',
    'fn': 'FN',
    'ln': 'LN',
    'st': 'CT',          # city
    'country': 'COUNTRY',
    'value': 'VALUE'
}

order_value_export = upload_df_order_value.rename(columns=rename_map)
score_export       = upload_df_score.rename(columns=rename_map)

# 8. Сохраняем в CSV
order_file = "meta_order_value.csv"
score_file = "meta_rfm_score.csv"

order_value_export.to_csv(order_file, index=False)
score_export.to_csv(score_file, index=False)

print("order_value_export shape:", order_value_export.shape)
print("score_export shape:", score_export.shape)

# 9. Скачиваем файлы в Colab
files.download(order_file)
files.download(score_file)


                        email fn ln       st country  value
0        solonska.a@gmail.com          Rivne      UA    0.0
1            olya9k@gmail.com          Rivne      UA    0.0
2  danielsoloshenko@gmail.com           Kyiv      UA    0.0
3     cadiji4652@brandoza.com          Rivne      UA    0.0
4      maxym.kobzar@gmail.com        Trenton      US    0.0
['UA' 'US' '' 'NG' 'GH' 'IN' 'JP' 'CA' 'ES' 'EG' 'PL' 'LK' 'GB' 'ID' 'DE'
 'PK' 'RO' 'VN' 'KZ' 'SE']
order_value_export shape: (83363, 6)
score_export shape: (83363, 6)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
df_merged.to_csv("merged_customers.csv", index=False)

from google.colab import files
files.download("merged_customers.csv")
