pd.set_option("display.max_rows", None) # 행이 많아도 자르지 않음

## 데이터 상태에 따른 올바른 연결 고리:
- 현재 : city_map['city_std']: "Maharashtra" (주 이름)
- 연결 1: "Maharashtra" $\rightarrow$ state_to_std_map 적용 $\rightarrow$ "mumbai" (대표 관측 도시)
    - => ***city_map***
- 연결 2: "mumbai" $\rightarrow$ station_latlon 병합 $\rightarrow$ 19.0760, 72.8777 (위경도)
    - => ***weather_df***


In [313]:
import pandas as pd

# -------------------------------------------------------------------
df = pd.read_csv("../blinkit-dataset/blinkit_master_data.csv")

df["area"].value_counts().reset_index()

Unnamed: 0,area,count
0,Orai,44
1,Deoghar,40
2,Gandhinagar,37
3,Nandyal,36
4,Ratlam,35
...,...,...
311,Cuttack,4
312,Rourkela,4
313,Raichur,3
314,Bokaro,2


In [314]:
df.columns

Index(['order_id', 'customer_id_x', 'order_date', 'promised_delivery_time',
       'actual_delivery_time', 'delivery_status_x', 'order_total',
       'payment_method', 'delivery_partner_id_x', 'store_id', 'product_id',
       'quantity', 'unit_price', 'product_name', 'category', 'brand', 'price',
       'mrp', 'margin_percentage', 'shelf_life_days', 'min_stock_level',
       'max_stock_level', 'delivery_partner_id_y', 'promised_time',
       'actual_time', 'delivery_time_minutes', 'distance_km',
       'delivery_status_y', 'reasons_if_delayed', 'feedback_id',
       'customer_id_y', 'rating', 'feedback_text', 'feedback_category',
       'sentiment', 'feedback_date', 'customer_name', 'email', 'phone',
       'address', 'area', 'pincode', 'registration_date', 'customer_segment',
       'total_orders', 'avg_order_value'],
      dtype='object')

***현재 city 값들부터 전체 점검***

In [316]:
df["area"].value_counts().reset_index().rename(
    columns={"area": "raw_city"}
)


Unnamed: 0,raw_city,count
0,Orai,44
1,Deoghar,40
2,Gandhinagar,37
3,Nandyal,36
4,Ratlam,35
...,...,...
311,Cuttack,4
312,Rourkela,4
313,Raichur,3
314,Bokaro,2


***기본 정규화 작업(소문자, 공백, 특수 문자 제거)***

In [320]:
df["city_norm"] = (
    df["area"]
    .str.lower()
    .str.strip()
    .str.replace(r"[^\w\s]", "", regex=True)
)

df["city_norm"]

0        allahabad
1         thrissur
2          vellore
3             gaya
4          asansol
           ...    
4995       udaipur
4996       mathura
4997    jamshedpur
4998       chennai
4999      jamnagar
Name: city_norm, Length: 5000, dtype: object

***city 표준 매핑 딕셔너리 만들기(광역권 기준)***

In [None]:
metro_to_cities = {
    "Andhra Pradesh": [
        "vijayawada", "ongole", "adoni", "narasaraopet", "hindupur", "chittoor", 
        "rajahmundry", "nandyal", "kakinada", "guntur", "gudivada", "visakhapatnam", 
        "vijayanagaram", "eluru", "nellore", "amaravati", "tadipatri", "kavali", 
        "tenali", "bhimavaram", "kadapa", "proddatur", "anantapur", "tadepalligudem", 
        "kurnool", "machilipatnam", "madanapalle", "guntakal", "anantapuram", 
        "srikakulam", "tirupati", "dharmavaram"
    ],
    "Bihar": [
        "gaya", "dehri", "muzaffarpur", "begusarai", "siwan", "darbhanga", 
        "bihar sharif", "patna", "bettiah", "arrah", "danapur", "hajipur", 
        "saharsa", "katihar", "chapra", "kishanganj", "bhagalpur", "purnia", 
        "buxar", "jehanabad", "motihari", "sasaram", "munger", "jamalpur"
    ],
    "Uttar Pradesh": [
        "allahabad", "farrukhabad", "saharanpur", "etawah", "mathura", "mirzapur", 
        "rampur", "orai", "bulandshahr", "shahjahanpur", "ghaziabad", "aligarh", 
        "agra", "jhansi", "bareilly", "fatehpur", "raebareli", "meerut", 
        "firozabad", "mau", "hapur", "muzaffarnagar", "lucknow", "kanpur", 
        "amroha", "sambhal", "jaunpur", "varanasi", "bahraich", "unnao", 
        "moradabad", "khora", "loni", "ballia", "noida"
    ],
    "Maharashtra": [
        "jalna", "akola", "ahmednagar", "panvel", "ichalkaranji", "kolhapur", 
        "dhule", "thane", "chandrapur", "latur", "satara", "nanded", 
        "mirabhayandar", "jalgaon", "pimprichinchwad", "navi mumbai", "amravati", 
        "mumbai", "kalyandombivli", "ulhasnagar", "sanglimiraj kupwad", "bhiwandi", 
        "parbhani", "nagpur", "pune", "nashik", "ambarnath", "malegaon", 
        "bhusawal", "solapur", "vasaivirar", "aurangabad"
    ],
    "Karnataka": [
        "belgaum", "davanagere", "gulbarga", "bijapur", "udupi", "bangalore", 
        "hublidharwad", "bellary", "shimoga", "bidar", "mysore", "tumkur", 
        "raichur", "hospet", "mangalore"
    ],
    "Tamil Nadu": [
        "vellore", "madurai", "hosur", "avadi", "pudukkottai", "kumbakonam", 
        "tiruppur", "tiruvottiyur", "dindigul", "thanjavur", "thoothukudi", 
        "erode", "nagercoil", "ambattur", "pallavaram", "salem", "tirunelveli", 
        "karaikudi", "tiruchirappalli", "coimbatore", "chennai"
    ],
    "West Bengal": [
        "asansol", "south dumdum", "kamarhati", "serampore", "north dumdum", 
        "maheshtala", "baranagar", "howrah", "madhyamgram", "bally", "siliguri", 
        "haldia", "panihati", "naihati", "bidhannagar", "bhatpara", 
        "rajpur sonarpur", "kolkata", "barasat", "malda", "bardhaman", "kulti", 
        "chinsurah", "kharagpur", "raiganj", "uluberia", "berhampore"
    ],
    "Gujarat": [
        "anand", "gandhidham", "morbi", "junagadh", "jamnagar", "bhavnagar", 
        "surat", "mehsana", "vadodara", "ahmedabad", "surendranagar dudhrej", 
        "rajkot", "gandhinagar", "nadiad"
    ],
    "Madhya Pradesh": [
        "khandwa", "rewa", "ratlam", "burhanpur", "jabalpur", "shivpuri", 
        "bhopal", "bhind", "gwalior", "dewas", "singrauli", "morena", "sagar", 
        "indore", "guna", "ujjain", "katni", "satna"
    ],
    "Rajasthan": [
        "pali", "bikaner", "udaipur", "sri ganganagar", "sikar", "jaipur", 
        "alwar", "ajmer", "bhilwara", "jodhpur", "bharatpur", "kota"
    ],
    "Telangana": [
        "nizamabad", "khammam", "karimnagar", "ramagundam", "mahbubnagar", 
        "warangal", "hyderabad", "secunderabad", "suryapet", "miryalaguda"
    ],
    "Odisha": [
        "gopalpur", "berhampur", "bhubaneswar", "sambalpur", "cuttack", 
        "rourkela", "raurkela industrial township"
    ],
    "Jharkhand": [
        "medininagar", "bokaro", "deoghar", "ranchi", "ramgarh", "hazaribagh", 
        "jamshedpur", "giridih", "mango", "dhanbad", "phusro"
    ],
    "Punjab": [
        "phagwara", "bathinda", "jalandhar", "ludhiana", "patiala", "amritsar"
    ],
    "Haryana": [
        "sirsa", "rohtak", "gurgaon", "panipat", "bhiwani", "karnal", "ambala", 
        "sonipat", "panchkula", "yamunanagar", "faridabad"
    ],
    "Chhattisgarh": [
        "durg", "bilaspur", "korba", "bhilai", "raipur"
    ],
    "Kerala": [
        "thrissur", "alappuzha", "kottayam", "kollam", "kochi", "kozhikode", 
        "thiruvananthapuram"
    ],
    "Assam": [
        "dibrugarh", "tinsukia", "guwahati", "tezpur", "nagaon", "silchar", 
        "bongaigaon", "jorhat"
    ],
    "Delhi": [
        "delhi", "new delhi", "karawal nagar", "nangloi jat", 
        "bhalswa jahangir pur", "sultan pur majra", "kirari suleman nagar"
    ],
    "Other States": [
        "pondicherry", "ozhukarai", "chandigarh", "agartala", "aizawl", 
        "imphal", "gangtok", "shimla", "jammu", "srinagar", "haridwar", "dehradun"
    ]
}

city_map = {
    city: metro
    for metro, cities in metro_to_cities.items()
    for city in cities
}


# 맵핑 안 된 도시들은 그대로 두고, 맵핑된 광역권 이름 표시(예, delhi, chennai 등)
df["city_std"] = df["city_norm"].replace(city_map)

In [322]:
df.columns

Index(['order_id', 'customer_id_x', 'order_date', 'promised_delivery_time',
       'actual_delivery_time', 'delivery_status_x', 'order_total',
       'payment_method', 'delivery_partner_id_x', 'store_id', 'product_id',
       'quantity', 'unit_price', 'product_name', 'category', 'brand', 'price',
       'mrp', 'margin_percentage', 'shelf_life_days', 'min_stock_level',
       'max_stock_level', 'delivery_partner_id_y', 'promised_time',
       'actual_time', 'delivery_time_minutes', 'distance_km',
       'delivery_status_y', 'reasons_if_delayed', 'feedback_id',
       'customer_id_y', 'rating', 'feedback_text', 'feedback_category',
       'sentiment', 'feedback_date', 'customer_name', 'email', 'phone',
       'address', 'area', 'pincode', 'registration_date', 'customer_segment',
       'total_orders', 'avg_order_value', 'city_norm', 'city_std'],
      dtype='object')

---

***위도/경도 맵핑하기***

In [323]:
# 주(State)별 소도시 리스트 정의
state_to_cities = {
    "Andhra Pradesh": 
    ["vijayawada", "ongole", "adoni", "narasaraopet", "hindupur", "chittoor", "rajahmundry", "nandyal", "kakinada", "guntur", "gudivada", "visakhapatnam", "vijayanagaram", "eluru", "nellore", "amaravati", "tadipatri", "kavali", "tenali", "bhimavaram", "kadapa", "proddatur", "anantapur", "tadepalligudem", "kurnool", "machilipatnam", "madanapalle", "guntakal", "anantapuram", "srikakulam", "tirupati", "dharmavaram"],
    
    "Bihar": 
    ["gaya", "dehri", "muzaffarpur", "begusarai", "siwan", "darbhanga", "bihar sharif", "patna", "bettiah", "arrah", "danapur", "hajipur", "saharsa", "katihar", "chapra", "kishanganj", "bhagalpur", "purnia", "buxar", "jehanabad", "motihari", "sasaram", "munger", "jamalpur"],

    "Uttar Pradesh": 
    ["allahabad", "farrukhabad", "saharanpur", "etawah", "mathura", "mirzapur", "rampur", "orai", "bulandshahr", "shahjahanpur", "ghaziabad", "aligarh", "agra", "jhansi", "bareilly", "fatehpur", "raebareli", "meerut", "firozabad", "mau", "hapur", "muzaffarnagar", "lucknow", "kanpur", "amroha", "sambhal", "jaunpur", "varanasi", "bahraich", "unnao", "moradabad", "khora", "loni", "ballia", "noida"],

    "Maharashtra": 
    ["jalna", "akola", "ahmednagar", "panvel", "ichalkaranji", "kolhapur", "dhule", "thane", "chandrapur", "latur", "satara", "nanded", "mirabhayandar", "jalgaon", "pimprichinchwad", "navi mumbai", "amravati", "mumbai", "kalyandombivli", "ulhasnagar", "sanglimiraj kupwad", "bhiwandi", "parbhani", "nagpur", "pune", "nashik", "ambarnath", "malegaon", "bhusawal", "solapur", "vasaivirar", "aurangabad"],

    "Karnataka": 
    ["belgaum", "davanagere", "gulbarga", "bijapur", "udupi", "bangalore", "hublidharwad", "bellary", "shimoga", "bidar", "mysore", "tumkur", "raichur", "hospet", "mangalore"],

    "Tamil Nadu": 
    ["vellore", "madurai", "hosur", "avadi", "pudukkottai", "kumbakonam", "tiruppur", "tiruvottiyur", "dindigul", "thanjavur", "thoothukudi", "erode", "nagercoil", "ambattur", "pallavaram", "salem", "tirunelveli", "karaikudi", "tiruchirappalli", "coimbatore", "chennai"],

    "West Bengal": 
    ["asansol", "south dumdum", "kamarhati", "serampore", "north dumdum", "maheshtala", "baranagar", "howrah", "madhyamgram", "bally", "siliguri", "haldia", "panihati", "naihati", "bidhannagar", "bhatpara", "rajpur sonarpur", "kolkata", "barasat", "malda", "bardhaman", "kulti", "chinsurah", "kharagpur", "raiganj", "uluberia", "berhampore"],

    "Gujarat": 
    ["anand", "gandhidham", "morbi", "junagadh", "jamnagar", "bhavnagar", "surat", "mehsana", "vadodara", "ahmedabad", "surendranagar dudhrej", "rajkot", "gandhinagar", "nadiad"],

    "Madhya Pradesh": 
    ["khandwa", "rewa", "ratlam", "burhanpur", "jabalpur", "shivpuri", "bhopal", "bhind", "gwalior", "dewas", "singrauli", "morena", "sagar", "indore", "guna", "ujjain", "katni", "satna"],

    "Rajasthan": 
    ["pali", "bikaner", "udaipur", "sri ganganagar", "sikar", "jaipur", "alwar", "ajmer", "bhilwara", "jodhpur", "bharatpur", "kota"],

    "Telangana": 
    ["nizamabad", "khammam", "karimnagar", "ramagundam", "mahbubnagar", "warangal", "hyderabad", "secunderabad", "suryapet", "miryalaguda"],

    "Odisha": 
    ["gopalpur", "berhampur", "bhubaneswar", "sambalpur", "cuttack", "rourkela", "raurkela industrial township"],

    "Jharkhand": 
    ["medininagar", "bokaro", "deoghar", "ranchi", "ramgarh", "hazaribagh", "jamshedpur", "giridih", "mango", "dhanbad", "phusro"],

    "Punjab": 
    ["phagwara", "bathinda", "jalandhar", "ludhiana", "patiala", "amritsar"],

    "Haryana": 
    ["sirsa", "rohtak", "gurgaon", "panipat", "bhiwani", "karnal", "ambala", "sonipat", "panchkula", "yamunanagar", "faridabad"],

    "Chhattisgarh": 
    ["durg", "bilaspur", "korba", "bhilai", "raipur"],

    "Kerala": 
    ["thrissur", "alappuzha", "kottayam", "kollam", "kochi", "kozhikode", "thiruvananthapuram"],

    "Assam": 
    ["dibrugarh", "tinsukia", "guwahati", "tezpur", "nagaon", "silchar", "bongaigaon", "jorhat"],
    
    "Delhi": 
    ["delhi", "new delhi", "karawal nagar", "nangloi jat", "bhalswa jahangir pur", "sultan pur majra", "kirari suleman nagar"],

    "Other States": 
    ["pondicherry", "ozhukarai", "chandigarh", "agartala", "aizawl", "imphal", "gangtok", "shimla", "jammu", "srinagar", "haridwar", "dehradun"]
}

# 메모리에 dict로 남아있는 city_map을 강제로 초기화
city_map = df[['area', 'city_norm']].drop_duplicates().copy()

# 역매핑 딕셔너리 생성 (city_std를 위한 전단계)
city_to_state_dict = {city: state for state, cities in state_to_cities.items() for city in cities}

# 데이터프레임 컬럼 생성
city_map['city_std'] = city_map['city_norm'].map(city_to_state_dict)

# 주(State)별 대표 기상 관측 도시(city_std) 설정
state_to_std_map = {
    "Andhra Pradesh": "hyderabad", "Telangana": "hyderabad", "Bihar": "patna",
    "Uttar Pradesh": "lucknow", "Maharashtra": "mumbai", "Karnataka": "bangalore",
    "Tamil Nadu": "chennai", "West Bengal": "kolkata", "Gujarat": "ahmedabad",
    "Madhya Pradesh": "bhopal", "Rajasthan": "jaipur", "Odisha": "bhubaneswar",
    "Jharkhand": "ranchi", "Punjab": "chandigarh", "Haryana": "chandigarh",
    "Delhi": "delhi", "Chhattisgarh": "raipur", "Kerala": "kochi",
    "Assam": "guwahati", "Other States": "delhi"
}

# 대표 도시 위경도 데이터 프레임
station_latlon = pd.DataFrame([
    {"city_std": "delhi", "lat": 28.6139, "lon": 77.2090},
    {"city_std": "mumbai", "lat": 19.0760, "lon": 72.8777},
    {"city_std": "bangalore", "lat": 12.9716, "lon": 77.5946},
    {"city_std": "hyderabad", "lat": 17.3850, "lon": 78.4867},
    {"city_std": "chennai", "lat": 13.0827, "lon": 80.2707},
    {"city_std": "kolkata", "lat": 22.5726, "lon": 88.3639},
    {"city_std": "ahmedabad", "lat": 23.0225, "lon": 72.5714},
    {"city_std": "pune", "lat": 18.5204, "lon": 73.8567},
    {"city_std": "jaipur", "lat": 26.9124, "lon": 75.7873},
    {"city_std": "chandigarh", "lat": 30.7333, "lon": 76.7794},
    {"city_std": "lucknow", "lat": 26.8467, "lon": 80.9462},
    {"city_std": "patna", "lat": 25.5941, "lon": 85.1376},
    {"city_std": "guwahati", "lat": 26.1445, "lon": 91.7362},
    {"city_std": "bhopal", "lat": 23.2599, "lon": 77.4126},
    {"city_std": "kochi", "lat": 9.9312, "lon": 76.2673},
    {"city_std": "bhubaneswar", "lat": 20.2961, "lon": 85.8245},
    {"city_std": "raipur", "lat": 21.2514, "lon": 81.6296},
    {"city_std": "ranchi", "lat": 23.3441, "lon": 85.3094},
])

# city_map에 병합 수행
city_map['weather_station_city'] = city_map['city_std'].map(state_to_std_map) # 주 기준 대표도시 할당

# 위경도 정보 최종 병합
city_map = city_map.merge(
    station_latlon,
    left_on="weather_station_city",
    right_on="city_std",
    how="left",
    suffixes=('', '_station') # city_std -> city_std_station으로 중복 방지
    )

# 불필요한 중복 컬럼 제거 (city_std가 두 번 들어갈 수 있음)
if 'city_std_station' in city_map.columns:
    city_map = city_map.drop(columns=['city_std_station'])

# 확인
print("--- 병합 결과 샘플 ---")
display(city_map[['city_std', 'weather_station_city', 'lat', 'lon']].drop_duplicates())

# 결측치 체크 (위경도가 없는 곳이 있다면 std_map 설정 오류)
print(f"\n위경도 결측치 개수: {city_map['lat'].isna().sum()}")

--- 병합 결과 샘플 ---


Unnamed: 0,city_std,weather_station_city,lat,lon
0,Uttar Pradesh,lucknow,26.8467,80.9462
1,Kerala,kochi,9.9312,76.2673
2,Tamil Nadu,chennai,13.0827,80.2707
3,Bihar,patna,25.5941,85.1376
4,West Bengal,kolkata,22.5726,88.3639
5,Odisha,bhubaneswar,20.2961,85.8245
6,Punjab,chandigarh,30.7333,76.7794
9,Haryana,chandigarh,30.7333,76.7794
12,Andhra Pradesh,hyderabad,17.385,78.4867
13,Karnataka,bangalore,12.9716,77.5946



위경도 결측치 개수: 3


In [328]:
city_map.columns

Index(['area', 'city_norm', 'city_std', 'weather_station_city', 'lat', 'lon'], dtype='object')

In [None]:
# # columns 파라미터로 특정 컬럼 삭제하기
# city_map = city_map.drop(columns=['lat_x', 'lon_x', 'lat_y', 'lon_y'])

---

***Open-Meteo로 주(state) 날씨 데이터 수집 코드***

In [336]:
import requests

def fetch_weather_master(city_map_df, start_date, end_date):
    # station_latlon의 컬럼명인 city_std, lat, lon을 사용하도록 수정
    stations = city_map_df[['city_std', 'lat', 'lon']].drop_duplicates().dropna()
    all_weather = []
    
    for _, row in stations.iterrows():
        url = "https://archive-api.open-meteo.com/v1/archive"
        params = {
            "latitude": row['lat'], 
            "longitude": row['lon'],
            "start_date": start_date, 
            "end_date": end_date,
            "daily": ["temperature_2m_max", "temperature_2m_min", "precipitation_sum"],
            "timezone": "Asia/Kolkata"
        }
        try:
            r = requests.get(url, params=params)
            r.raise_for_status()
            data = r.json()
            
            temp_df = pd.DataFrame({
                "order_date": data["daily"]["time"],
                "city_std": row['city_std'],
                "temp_max(최고 기온)": data["daily"]["temperature_2m_max"],
                "temp_min(최저 기온)": data["daily"]["temperature_2m_min"],
                "rain_pre(강수량 합계)": data["daily"]["precipitation_sum"]
            })
            all_weather.append(temp_df)
            print(f"수집 완료: {row['city_std']}")
        except Exception as e:
            print(f"수집 실패: {row['city_std']} - {e}")
            
    if not all_weather:
        return pd.DataFrame()
    return pd.concat(all_weather, ignore_index=True)

# 특정 기간 전체 날씨 수집 
weather_master = fetch_weather_master(city_map, "2023-03-01", "2024-11-30")

수집 완료: Uttar Pradesh
수집 완료: Kerala
수집 완료: Tamil Nadu
수집 완료: Bihar
수집 완료: West Bengal
수집 완료: Odisha
수집 완료: Punjab
수집 완료: Haryana
수집 완료: Andhra Pradesh
수집 완료: Karnataka
수집 완료: Telangana
수집 완료: Madhya Pradesh
수집 완료: Maharashtra
수집 완료: Assam
수집 완료: Jharkhand
수집 완료: Gujarat
수집 완료: Rajasthan
수집 완료: Other States
수집 완료: Chhattisgarh
수집 완료: Delhi


***모든 city_std에 대해 날씨 수집***

In [333]:
weather_all = []

# station_latlon의 각 행을 순회
for _, row in station_latlon.iterrows():
    # 1. 함수가 city_map_df(데이터프레임)를 받으므로 한 줄짜리 DF로 변환
    temp_station_df = pd.DataFrame([row])
    
    # 2. 함수 호출
    w = fetch_weather_master(
        temp_station_df,
        "2023-03-01",
        "2024-11-30"
    )

    if not w.empty:
        # 3. 기존 변수명 유지하여 결과 리스트에 추가
        weather_all.append(w)

# 최종 합치기
weather_df = pd.concat(weather_all, ignore_index=True)


KeyError: "['weather_station_city'] not in index"

***결과 확인하기***

In [298]:
weather_df

Unnamed: 0,order_date,city_std(광역권 이름),temp_max(최고 기온),temp_min(최저 기온),rain_pre(강수량 합계)
0,2023-03-01,delhi,29.7,15.6,0.0
1,2023-03-02,delhi,30.0,15.1,0.0
2,2023-03-03,delhi,30.5,16.8,0.0
3,2023-03-04,delhi,29.8,17.5,0.0
4,2023-03-05,delhi,29.6,16.3,0.0
...,...,...,...,...,...
11533,2024-11-26,ranchi,24.0,13.8,0.0
11534,2024-11-27,ranchi,23.2,13.2,0.0
11535,2024-11-28,ranchi,23.2,13.1,0.0
11536,2024-11-29,ranchi,23.6,13.2,0.0


---

### ***수집된 데이터 병합하기***

***데이터프레임 형식 맞추기***

In [311]:
# 기본 파생 변수
weather_df["temp_range(최고 기온-최저 기온)"] = weather_df["temp_max(최고 기온)"] - weather_df["temp_min(최저 기온)"]

# 폭염/폭우 플래그
weather_df["heatwave(폭염)"] = (weather_df["temp_max(최고 기온)"] >= 38).astype(int)
weather_df["heavy_rain(폭우)"] = (weather_df["rain_pre(강수량 합계)"] >= 50).astype(int)

# 날짜를 문자열로 통일
df["order_date"] = pd.to_datetime(df["order_date"]).dt.strftime('%Y-%m-%d')
weather_df["order_date"] = pd.to_datetime(weather_df["order_date"]).dt.strftime('%Y-%m-%d')

# city_std 대소문자 및 공백 제거
df["city_std(광역권 이름)"] = df["city_std(광역권 이름)"].astype(str).str.strip().str.lower()
weather_df["city_std(광역권 이름)"] = weather_df["city_std(광역권 이름)"].astype(str).str.strip().str.lower()

# df의 city_std, city_norm 컬럼명 변경하기
df = df.rename(columns={
    "city_std": "city_std(광역권 이름)",
    "city_norm": "city_norm(정규화된 도시 이름)"
})

# csv 파일로 저장하기
weather_df.to_csv("weather_df.csv", index=False, encoding="utf-8-sig")

In [309]:
weather_df["city_std(광역권 이름)"].unique()

array(['delhi', 'mumbai', 'bangalore', 'hyderabad', 'chennai', 'kolkata',
       'ahmedabad', 'pune', 'jaipur', 'chandigarh', 'lucknow', 'patna',
       'guwahati', 'bhopal', 'kochi', 'bhubaneswar', 'raipur', 'ranchi'],
      dtype=object)

***데이터셋 Join***

In [307]:
print(df.columns)
print(weather_df.columns)

Index(['order_id', 'customer_id_x', 'order_date', 'promised_delivery_time',
       'actual_delivery_time', 'delivery_status_x', 'order_total',
       'payment_method', 'delivery_partner_id_x', 'store_id', 'product_id',
       'quantity', 'unit_price', 'product_name', 'category', 'brand', 'price',
       'mrp', 'margin_percentage', 'shelf_life_days', 'min_stock_level',
       'max_stock_level', 'delivery_partner_id_y', 'promised_time',
       'actual_time', 'delivery_time_minutes', 'distance_km',
       'delivery_status_y', 'reasons_if_delayed', 'feedback_id',
       'customer_id_y', 'rating', 'feedback_text', 'feedback_category',
       'sentiment', 'feedback_date', 'customer_name', 'email', 'phone',
       'address', 'area', 'pincode', 'registration_date', 'customer_segment',
       'total_orders', 'avg_order_value', 'city_norm(정규화된 도시 이름)',
       'city_std(광역권 이름)'],
      dtype='object')
Index(['order_date', 'city_std(광역권 이름)', 'temp_max(최고 기온)', 'temp_min(최저 기온)',
       'rain_pre

In [None]:
# # columns 파라미터로 특정 컬럼 삭제하기
# df = df.drop(columns=['customer_id_y', 'delivery_partner_id_y', 'delivery_status_y', ])

In [None]:
# # columns 파라미터로 특정 컬럼 삭제하기
# weather_df = weather_df.drop(columns=[''])

In [312]:
# 대표 도시 -> 주(state) 이름으로 바꾸기
std_to_state_map = {v: k for k, v in state_to_std_map.items()}

# weather_df의 도시 이름을 주(state) 이름으로 치환
weather_df["city_std(광역권 이름)"] = weather_df["city_std(광역권 이름)"].map(std_to_state_map)

blinkit_master_data_weather = df.merge(
    weather_df,
    on=["order_date", "city_std(광역권 이름)", ],
    how="left"
)

# csv 파일 저장
blinkit_master_data_weather.to_csv('weather//blinkit_master_data_weather_08.csv', index=False, encoding="utf-8-sig")

# NaN이 아닌 개수 체크
success_count = blinkit_master_data_weather["temp_max(최고 기온)"].notna().sum()
print(f"매칭 성공 데이터 수: {success_count} / 전체: {len(blinkit_master_data_weather)}")

# 확인
display(blinkit_master_data_weather)

매칭 성공 데이터 수: 0 / 전체: 5000


Unnamed: 0,order_id,customer_id_x,order_date,promised_delivery_time,actual_delivery_time,delivery_status_x,order_total,payment_method,delivery_partner_id_x,store_id,...,total_orders,avg_order_value,city_norm(정규화된 도시 이름),city_std(광역권 이름),temp_max(최고 기온),temp_min(최저 기온),rain_pre(강수량 합계),temp_range(최고 기온-최저 기온),heatwave(폭염),heavy_rain(폭우)
0,1961864118,30065862,2024-07-17,2024-07-17 08:52:01,2024-07-17 08:47:01,On Time,3197.07,Cash,63230,4771,...,13,749.95,allahabad,uttar pradesh,,,,,,
1,1549769649,9573071,2024-05-28,2024-05-28 13:25:29,2024-05-28 13:27:29,On Time,976.55,Cash,14983,7534,...,5,958.06,thrissur,kerala,,,,,,
2,9185164487,45477575,2024-09-23,2024-09-23 13:25:12,2024-09-23 13:29:12,On Time,839.05,UPI,39859,9886,...,4,327.93,vellore,tamil nadu,,,,,,
3,9644738826,88067569,2023-11-24,2023-11-24 16:34:56,2023-11-24 16:33:56,On Time,440.23,Card,61497,7917,...,5,273.38,gaya,bihar,,,,,,
4,5427684290,83298567,2023-11-20,2023-11-20 05:17:39,2023-11-20 05:18:39,On Time,2526.68,Cash,84315,2741,...,14,763.10,asansol,west bengal,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1669690997,62600289,2023-12-25,2023-12-25 16:05:20,2023-12-25 16:10:20,On Time,1132.33,Cash,90914,1587,...,10,1326.02,udaipur,rajasthan,,,,,,
4996,8340761903,53640286,2023-11-27,2023-11-27 09:38:43,2023-11-27 09:36:43,On Time,2372.01,Cash,27952,3458,...,20,1368.38,mathura,uttar pradesh,,,,,,
4997,5936301790,87059497,2024-06-21,2024-06-21 19:23:09,2024-06-21 19:26:09,On Time,3158.35,Cash,9590,7424,...,5,1137.61,jamshedpur,jharkhand,,,,,,
4998,5710579377,67310893,2024-06-06,2024-06-06 15:12:13,2024-06-06 15:10:13,On Time,1918.92,UPI,29940,6128,...,19,1592.94,chennai,tamil nadu,,,,,,


---

### csv 파일이 vscode에서 안 열릴 때의 디버깅

In [163]:
# # 생성된 csv가 실제로 생성되고 존재하는지 확인
# import os
# os.path.exists("blinkit_master_data_weather_02.csv")

# # merge할 때 잘못되어 csv파일이 0byte인지 확인
# os.path.getsize("blinkit_master_data_weather_02.csv")

# # 어디서 csv 파일 읽어오는지 확인
# print(os.getcwd())

# !find . -name "blinkit_master_data_weather_02.csv"

# # 데이터프레임이 잘못됐는지 확인
# print(type(blinkit_master_data_weather))
# print(blinkit_master_data_weather is None)

---

***날씨가 못 붙은 행 확인***

In [245]:
blinkit_master_data_weather.columns

Index(['order_id', 'customer_id_x', 'order_date_x', 'promised_delivery_time',
       'actual_delivery_time', 'delivery_status_x', 'order_total',
       'payment_method', 'delivery_partner_id_x', 'store_id', 'product_id',
       'quantity', 'unit_price', 'product_name', 'category', 'brand', 'price',
       'mrp', 'margin_percentage', 'shelf_life_days', 'min_stock_level',
       'max_stock_level', 'delivery_partner_id_y', 'promised_time',
       'actual_time', 'delivery_time_minutes', 'distance_km',
       'delivery_status_y', 'reasons_if_delayed', 'feedback_id',
       'customer_id_y', 'rating', 'feedback_text', 'feedback_category',
       'sentiment', 'feedback_date', 'customer_name', 'email', 'phone',
       'address', 'area', 'pincode', 'registration_date', 'customer_segment',
       'total_orders', 'avg_order_value', 'city_norm(정규화된 도시 이름)',
       'city_std(광역권 이름)', 'order_date_y', 'temp_max(최고 기온)',
       'temp_min(최저 기온)', 'rain_pre(강수량 합계)'],
      dtype='object')

In [243]:
blinkit_master_data_weather[blinkit_master_data_weather["temp_max(최고 기온)"].isna()][
    ["area", "city_std(광역권 이름)", "city_norm(정규화된 도시 이름)"]
]


Unnamed: 0,area,city_std(광역권 이름),city_norm(정규화된 도시 이름)
0,Allahabad,Uttar Pradesh,allahabad
1,Thrissur,Kerala,thrissur
2,Vellore,Tamil Nadu,vellore
3,Gaya,Bihar,gaya
4,Asansol,West Bengal,asansol
...,...,...,...
4995,Udaipur,Rajasthan,udaipur
4996,Mathura,Uttar Pradesh,mathura
4997,Jamshedpur,Jharkhand,jamshedpur
4998,Chennai,Tamil Nadu,chennai
