# Show every row when a DataFrame is displayed instead of truncating long output.
pd.options.display.max_rows = None

In [21]:
import pandas as pd

# -------------------------------------------------------------------
# Load the Blinkit master dataset. The path is relative, so it is resolved
# against the current working directory.
df = pd.read_csv("../blinkit-dataset/blinkit_master_data.csv")

# Count the rows for each distinct "area" value and turn the result into a
# DataFrame for display.
df["area"].value_counts().reset_index()

Unnamed: 0,area,count
0,Orai,44
1,Deoghar,40
2,Gandhinagar,37
3,Nandyal,36
4,Ratlam,35
...,...,...
311,Cuttack,4
312,Rourkela,4
313,Raichur,3
314,Bokaro,2


In [22]:
# List the distinct raw "area" values to eyeball spelling variants.
df["area"].unique()

array(['Allahabad', 'Thrissur', 'Vellore', 'Gaya', 'Asansol', 'Gopalpur',
       'Phagwara', 'South Dumdum', 'Kamarhati', 'Sirsa', 'Rohtak',
       'Madurai', 'Vijayawada', 'Belgaum', 'Farrukhabad', 'Nizamabad',
       'Serampore', 'Alappuzha', 'Dehri', 'Khandwa', 'Hosur', 'Ongole',
       'Jalna', 'Khammam', 'Rewa', 'Adoni', 'Muzaffarpur', 'Narasaraopet',
       'Hindupur', 'North Dumdum', 'Saharanpur', 'Davanagere',
       'Dibrugarh', 'Ratlam', 'Maheshtala', 'Chittoor', 'Baranagar',
       'Etawah', 'Bathinda', 'Rajahmundry', 'Medininagar', 'Nandyal',
       'Akola', 'Avadi', 'Bokaro', 'Chennai', 'Anand', 'Pali', 'Mathura',
       'Kakinada', 'Begusarai', 'Gurgaon', 'Ahmednagar', 'Tinsukia',
       'Panvel', 'Siwan', 'Gulbarga', 'Bijapur', 'Deoghar', 'Howrah',
       'Guntur', 'Guwahati', 'Mirzapur', 'Madhyamgram', 'Ichalkaranji',
       'Gudivada', 'Darbhanga', 'Tezpur', 'Bhilwara', 'Berhampur',
       'Kolhapur', 'Karimnagar', 'Morbi', 'Bikaner', 'Rampur', 'Bally',
       'Orai', 

***현재 city 값들부터 전체 점검***

In [23]:
# Row count per distinct "area" value, with the "area" column relabelled
# "raw_city" to mark it as the un-normalized name.
df["area"].value_counts().reset_index().rename(
    columns={"area": "raw_city"}
)


Unnamed: 0,raw_city,count
0,Orai,44
1,Deoghar,40
2,Gandhinagar,37
3,Nandyal,36
4,Ratlam,35
...,...,...
311,Cuttack,4
312,Rourkela,4
313,Raichur,3
314,Bokaro,2


***기본 정규화 작업(소문자, 공백 제거)***

In [24]:
# Basic normalization: lower-case the raw area name, then trim the
# surrounding whitespace.
lowercased_area = df["area"].str.lower()
df["city_norm"] = lowercased_area.str.strip()

# Display the raw column for reference.
df["area"]

0        Allahabad
1         Thrissur
2          Vellore
3             Gaya
4          Asansol
           ...    
4995       Udaipur
4996       Mathura
4997    Jamshedpur
4998       Chennai
4999      Jamnagar
Name: area, Length: 5000, dtype: object

***특수 문자, 불필요한 문자 제거***

In [26]:

import re

# Remove every character that is neither a word character nor whitespace
# (e.g. hyphens, dots) from the normalized city name.
df["city_norm"] = df["city_norm"].str.replace(pat=r"[^\w\s]", repl="", regex=True)

***city 표준 매핑 딕셔너리 만들기(광역권 기준)***

In [27]:
# Maps a normalized city name to the metro area it belongs to.
#
# Keys must match the values stored in df["city_norm"], which are lower-cased,
# trimmed and stripped of punctuation. Hyphenated names therefore need a
# punctuation-free alias ("kalyan-dombivli" becomes "kalyandombivli");
# the hyphenated spellings are kept so that un-stripped input still maps.
city_map = {
    # Delhi NCR
    "delhi": "delhi",
    "new delhi": "delhi",
    "gurgaon": "delhi",
    "faridabad": "delhi",
    "noida": "delhi",
    "ghaziabad": "delhi",

    # Mumbai Metro
    "mumbai": "mumbai",
    "navi mumbai": "mumbai",
    "thane": "mumbai",
    "kalyan-dombivli": "mumbai",
    "kalyandombivli": "mumbai",  # form left after punctuation removal
    "vasai-virar": "mumbai",
    "vasaivirar": "mumbai",  # form left after punctuation removal
    "bhiwandi": "mumbai",

    # Bangalore Metro
    "bangalore": "bangalore",
    "bengaluru": "bangalore",

    # Hyderabad Metro
    "hyderabad": "hyderabad",
    "secunderabad": "hyderabad",

    # Chennai Metro
    "chennai": "chennai",
    "avadi": "chennai",
    "ambattur": "chennai",
}


# Cities without an entry in city_map keep their normalized name unchanged.
df["city_std"] = df["city_norm"].replace(city_map)

***데이터 원본과 표준 비교***

In [28]:
# One row per distinct (raw, normalized, standardized) city combination.
city_columns = ["area", "city_norm", "city_std"]
city_std_df = df.loc[:, city_columns].drop_duplicates()

# utf-8-sig prepends a byte-order mark (presumably so spreadsheet tools
# detect the encoding; confirm with the consumer of this file).
city_std_df.to_csv("city_map.csv", encoding="utf-8-sig", index=False)


---

***위도/경도 매핑하기***

In [29]:
# Reference coordinates per metro area, used as the weather lookup point.
# Kept as a DataFrame because:
#   1. it joins the same way as the other tables,
#   2. it is easy to export to CSV,
#   3. it stays manageable when more cities are added later.
city_latlon = pd.DataFrame(
    data=[
        ("delhi", 28.6139, 77.2090),
        ("mumbai", 19.0760, 72.8777),
        ("bangalore", 12.9716, 77.5946),
        ("hyderabad", 17.3850, 78.4867),
        ("chennai", 13.0827, 80.2707),
    ],
    columns=["city_std", "lat", "lon"],
)

# Attach the reference coordinates to the city lookup table.
city_std_df = pd.merge(city_std_df, city_latlon, how="left", on="city_std")

# Sanity check: a NaN lat/lon marks a city that is missing from the metro
# mapping (an expected checkpoint, not an error).
city_std_df.loc[:, ["city_std", "lat", "lon"]].drop_duplicates()


Unnamed: 0,city_std,lat,lon
0,allahabad,,
1,thrissur,,
2,vellore,,
3,gaya,,
4,asansol,,
...,...,...,...
311,sonipat,,
312,raichur,,
313,tirupati,,
314,moradabad,,


---

***Open-Meteo로 날씨 데이터 수집 코드***

In [31]:
# 단일 도시, 기간 날씨 수집

import requests

def fetch_weather(lat, lon, start_date, end_date, timeout=30):
    """Fetch daily weather for one location from the Open-Meteo archive API.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.
        start_date: First day of the period, passed through as ``start_date``
            (the callers in this notebook use ``"YYYY-MM-DD"`` strings).
        end_date: Last day of the period, passed through as ``end_date``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A DataFrame with one row per entry of the response's ``daily.time``
        list and the columns ``date``, ``temp_max``, ``temp_min`` and
        ``rain_sum``.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response JSON lacks the expected ``daily`` fields.
    """
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": [
            "temperature_2m_max",
            "temperature_2m_min",
            "precipitation_sum",
        ],
        "timezone": "Asia/Kolkata",
    }

    # requests has no default timeout; without one a stalled connection would
    # block the notebook indefinitely.
    r = requests.get(url, params=params, timeout=timeout)
    r.raise_for_status()
    daily = r.json()["daily"]

    weather_df = pd.DataFrame({
        "date": daily["time"],
        "temp_max": daily["temperature_2m_max"],
        "temp_min": daily["temperature_2m_min"],
        "rain_sum": daily["precipitation_sum"],
    })

    return weather_df


***모든 city_std에 대해 날씨 수집***

In [33]:
# Fetch the daily weather for every metro reference point and tag each
# result with the metro it belongs to.
weather_all = []

for record in city_latlon.itertuples(index=False):
    city_weather = fetch_weather(
        lat=record.lat,
        lon=record.lon,
        start_date="2023-03-01",
        end_date="2024-11-30",
    )
    city_weather["city_std"] = record.city_std
    weather_all.append(city_weather)

# Stack the per-city frames into one table with a fresh 0..n-1 index.
weather_df = pd.concat(weather_all, ignore_index=True)


***결과 확인하기***

In [55]:
# Display the combined weather table.
# NOTE(review): the captured output already shows the derived columns
# (temp_range, heatwave, heavy_rain, order_date), so this cell appears to
# have been run after the cells that create them.
weather_df

Unnamed: 0,date,temp_max,temp_min,rain_sum,city_std,temp_range,heatwave,heavy_rain,order_date
0,2023-03-01,30.0,16.3,0.0,delhi,13.7,0,0,2023-03-01
1,2023-03-02,30.3,15.9,0.0,delhi,14.4,0,0,2023-03-02
2,2023-03-03,31.1,16.8,0.0,delhi,14.3,0,0,2023-03-03
3,2023-03-04,30.4,17.3,0.0,delhi,13.1,0,0,2023-03-04
4,2023-03-05,30.2,15.8,0.0,delhi,14.4,0,0,2023-03-05
...,...,...,...,...,...,...,...,...,...
3200,2024-11-26,26.8,23.6,22.5,chennai,3.2,0,0,2024-11-26
3201,2024-11-27,27.6,24.4,5.4,chennai,3.2,0,0,2024-11-27
3202,2024-11-28,26.4,23.8,21.0,chennai,2.6,0,0,2024-11-28
3203,2024-11-29,25.0,23.2,25.6,chennai,1.8,0,0,2024-11-29


***기본 파생 변수***

In [59]:
# Daily temperature spread (max minus min).
weather_df["temp_range"] = weather_df["temp_max"] - weather_df["temp_min"]

# Build the join key as datetime.date objects on both sides.
# The weather table is keyed by its own "date" column; fetch_weather never
# creates an "order_date" column, so reading that name raised KeyError on a
# fresh run.
weather_df["date"] = pd.to_datetime(weather_df["date"]).dt.date
df["date"] = pd.to_datetime(df["order_date"]).dt.date


***폭염/폭우 플래그***

In [39]:
# 1 when the daily maximum temperature is at least 38, else 0.
weather_df["heatwave"] = weather_df["temp_max"].ge(38).astype(int)
# 1 when the daily precipitation sum is at least 50, else 0.
weather_df["heavy_rain"] = weather_df["rain_sum"].ge(50).astype(int)

***데이터셋 Join***

In [61]:
# Left join: every order row is kept, and weather columns stay NaN where no
# (city_std, date) match exists in the weather table.
blinkit_master_data_weather = pd.merge(
    df,
    weather_df,
    how="left",
    on=["city_std", "date"],
)

blinkit_master_data_weather

Unnamed: 0,order_id,customer_id_x,order_date_x,promised_delivery_time,actual_delivery_time,delivery_status_x,order_total,payment_method,delivery_partner_id_x,store_id,...,city_norm,city_std,date,temp_max,temp_min,rain_sum,temp_range,heatwave,heavy_rain,order_date_y
0,1961864118,30065862,2024-07-17,2024-07-17 08:52:01,2024-07-17 08:47:01,On Time,3197.07,Cash,63230,4771,...,allahabad,allahabad,2024-07-17,,,,,,,
1,1549769649,9573071,2024-05-28,2024-05-28 13:25:29,2024-05-28 13:27:29,On Time,976.55,Cash,14983,7534,...,thrissur,thrissur,2024-05-28,,,,,,,
2,9185164487,45477575,2024-09-23,2024-09-23 13:25:12,2024-09-23 13:29:12,On Time,839.05,UPI,39859,9886,...,vellore,vellore,2024-09-23,,,,,,,
3,9644738826,88067569,2023-11-24,2023-11-24 16:34:56,2023-11-24 16:33:56,On Time,440.23,Card,61497,7917,...,gaya,gaya,2023-11-24,,,,,,,
4,5427684290,83298567,2023-11-20,2023-11-20 05:17:39,2023-11-20 05:18:39,On Time,2526.68,Cash,84315,2741,...,asansol,asansol,2023-11-20,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1669690997,62600289,2023-12-25,2023-12-25 16:05:20,2023-12-25 16:10:20,On Time,1132.33,Cash,90914,1587,...,udaipur,udaipur,2023-12-25,,,,,,,
4996,8340761903,53640286,2023-11-27,2023-11-27 09:38:43,2023-11-27 09:36:43,On Time,2372.01,Cash,27952,3458,...,mathura,mathura,2023-11-27,,,,,,,
4997,5936301790,87059497,2024-06-21,2024-06-21 19:23:09,2024-06-21 19:26:09,On Time,3158.35,Cash,9590,7424,...,jamshedpur,jamshedpur,2024-06-21,,,,,,,
4998,5710579377,67310893,2024-06-06,2024-06-06 15:12:13,2024-06-06 15:10:13,On Time,1918.92,UPI,29940,6128,...,chennai,chennai,2024-06-06,32.2,26.1,28.1,6.1,0.0,0.0,2024-06-06


---

### 날씨 정보 join 검증

***날씨 컬럼 NaN 체크***

In [62]:
# Share of rows (0..1) whose weather values are missing after the join.
blinkit_master_data_weather.loc[:, ["temp_max", "rain_sum"]].isna().mean()


temp_max    0.947
rain_sum    0.947
dtype: float64

***날씨가 못 붙은 행 확인***

In [64]:
# Rows that received no weather data, reduced to the columns needed to see
# which city/date failed to match.
missing_weather = blinkit_master_data_weather["temp_max"].isna()
blinkit_master_data_weather.loc[missing_weather, ["area", "city_std", "date"]]


Unnamed: 0,area,city_std,date
0,Allahabad,allahabad,2024-07-17
1,Thrissur,thrissur,2024-05-28
2,Vellore,vellore,2024-09-23
3,Gaya,gaya,2023-11-24
4,Asansol,asansol,2023-11-20
...,...,...,...
4994,Mysore,mysore,2023-06-08
4995,Udaipur,udaipur,2023-12-25
4996,Mathura,mathura,2023-11-27
4997,Jamshedpur,jamshedpur,2024-06-21
