# Data Collection and Merging for Weather and Stocks

This script gathers daily weather data for our relevant cities and stock prices for our chosen airlines. The collected data is processed and merged into CSV files that can be used for further analysis. The weather data is looked up by geographic coordinates, and the stock prices are retrieved via Yahoo Finance. The output consists of one CSV file per city (in the `weather` folder) and one per airline (in the `stocks` folder), plus two combined files: `combined_weather.csv` and `combined_stocks_with_weekends.csv`.

In [None]:
import os
import pandas as pd
from meteostat import Point, Daily
from datetime import datetime, timedelta
import yfinance as yf

In [None]:
####### Paths ########
# Base directory for all output folders. `__file__` does not exist when this
# code runs inside a notebook or an interactive session, so fall back to the
# current working directory there instead of failing with a NameError.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

# Path to the "weather" folder (created if it does not exist yet)
weather_dir = os.path.join(script_dir, "weather")
os.makedirs(weather_dir, exist_ok=True)

# Path to the "stocks" folder (created if it does not exist yet)
stocks_dir = os.path.join(script_dir, "stocks")
os.makedirs(stocks_dir, exist_ok=True)

# Define start and end dates.
# `start` / `end` are datetimes at midnight; `start_hol` / `end_hol` hold the
# same days as plain `date` objects; `today` is the end day as "dd.mm.YYYY".
start = datetime.strptime("01.08.2024", "%d.%m.%Y")
start_hol = start.date()
# Truncate "now" to midnight directly instead of formatting and re-parsing it;
# deriving `today` from `end` keeps both on the same day across midnight.
end = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
today = end.strftime("%d.%m.%Y")
end_hol = end.date()

# Airline name -> Yahoo Finance ticker symbol. An empty string means there is
# no ticker to query; the stock download loop skips falsy tickers.
# NOTE(review): "AF.PA" is presumably the Air France-KLM listing used as a
# stand-in for KLM - confirm.
stocks = {
    "Austrian": "",
    "KLM": "AF.PA",
    "Lufthansa": "LHA.DE",
    "Qatar": ""
}

# City name -> latitude/longitude used to build the Meteostat lookup points.
locations = {
    "Frankfurt": {"latitude": 50.1109, "longitude": 8.6821},
    "Berlin": {"latitude": 52.5200, "longitude": 13.4050},
    "Hamburg": {"latitude": 53.5511, "longitude": 9.9937},
    "Munich": {"latitude": 48.1351, "longitude": 11.5820},
    "London": {"latitude": 51.5074, "longitude": -0.1278},
    "Palma": {"latitude": 39.5696, "longitude": 2.6502},
    "Istanbul": {"latitude": 41.0082, "longitude": 28.9784},
    "Dubai": {"latitude": 25.276987, "longitude": 55.296249},
    "New_York": {"latitude": 40.7128, "longitude": -74.0060},
    "Shanghai": {"latitude": 31.2304, "longitude": 121.4737}
}

In [None]:
######### Weather ##########

# Get coordinates for cities
def get_location_points(locations):
    """Build a Meteostat ``Point`` for every configured city.

    Args:
        locations: Mapping of city name to a dict that provides the
            ``'latitude'`` and ``'longitude'`` entries.

    Returns:
        Dict mapping each city name to ``Point(latitude, longitude)``.
    """
    return {
        name: Point(position['latitude'], position['longitude'])
        for name, position in locations.items()
    }

location_points = get_location_points(locations)

# Dictionary to store weather dataframes, keyed by city name
weather_dfs = {}

# Fetch the daily weather for each city and save it as its own CSV
for city, point in location_points.items():
    df = Daily(point, start, end).fetch()
    weather_dfs[city] = df

    file_path = os.path.join(weather_dir, f"{city}_weather.csv")
    # The index is written on purpose: it is the 'time' index that becomes
    # the 'time' column after reset_index() further down.
    df.to_csv(file_path)
    print(f"CSV-Datei für {city} gespeichert: {file_path}")

# List to store the weather DataFrames with city as a column
weather_combined_list = []

for city, df in weather_dfs.items():
    # Tag every row with its city so the rows stay identifiable after stacking
    df['City'] = city
    # Turn the 'time' index into a regular column before concatenating
    weather_combined_list.append(df.reset_index())

# Concatenate all weather DataFrames vertically
combined_weather_df = pd.concat(weather_combined_list)

# Sort by date, then by city. Sorting on 'time' alone leaves the order of the
# cities within one day arbitrary; the second key makes the file reproducible
# and mirrors the ['Date', 'Airline'] ordering of the combined stock file.
combined_weather_df.sort_values(by=['time', 'City'], inplace=True)

# Rename the 'time' column to 'Date'
combined_weather_df.rename(columns={'time': 'Date'}, inplace=True)

# Save the combined weather data as a CSV file
combined_weather_file_path = os.path.join(weather_dir, "combined_weather.csv")
combined_weather_df.to_csv(combined_weather_file_path, index=False, encoding='utf-8')

print("combined_weather.csv wurde erfolgreich im Ordner 'weather' gespeichert.")

In [None]:
######### Stocks #########

# Retrieve stock data
stock_dfs = {}
for stock, ticker in stocks.items():
    if not ticker:
        # No ticker symbol configured for this airline, nothing to download
        continue

    # NOTE(review): yfinance documents `end` as exclusive, so the bar for the
    # end day itself is presumably not part of the result - confirm.
    data = yf.download(ticker, start=start, end=end)
    if data.empty:
        # Say so explicitly instead of writing an empty CSV for this airline
        print(f"Keine Kursdaten für {stock} ({ticker}) erhalten - übersprungen.")
        continue

    # Newer yfinance releases return two-level (price field, ticker) columns
    # even for a single ticker. Keep only the field level so every airline
    # ends up with the same flat columns and 'Date' can be used as merge key.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = data.columns.get_level_values(0).rename(None)

    data.reset_index(inplace=True)
    data['Date'] = pd.to_datetime(data['Date']).dt.date
    stock_dfs[stock] = data

if not stock_dfs:
    raise ValueError("No stock data could be downloaded for any configured ticker.")

# Save stock DataFrames as CSV files
for stock, df in stock_dfs.items():
    file_path = os.path.join(stocks_dir, f"{stock}_stock_data.csv")
    df.to_csv(file_path)
    print(f"CSV-Datei für {stock} gespeichert: {file_path}")

# List to store stock DataFrames with airline as a column
stocks_combined_list = []

for stock, df in stock_dfs.items():
    # Add a new 'Airline' column containing the airline name
    df['Airline'] = stock
    # Add to the list
    stocks_combined_list.append(df.reset_index(drop=True))

# Concatenate all stock DataFrames vertically
combined_stocks_df = pd.concat(stocks_combined_list)

# Sort by date
combined_stocks_df.sort_values(by='Date', inplace=True)

# Convert 'Date' back to datetime64 so it has the same type as the calendar
# built from pd.date_range below
combined_stocks_df['Date'] = pd.to_datetime(combined_stocks_df['Date'])
full_date_df = pd.DataFrame(
    pd.date_range(
        start=combined_stocks_df['Date'].min(),
        end=combined_stocks_df['Date'].max(),
        freq='D',
    ),
    columns=['Date'],
)

# Merge the full date range with the combined stock DataFrame. The left merge
# keeps every calendar day; days without any quote become a single row whose
# other columns (including 'Airline') are NaN.
combined_stocks_df = pd.merge(full_date_df, combined_stocks_df, on='Date', how='left')

# Duplicate Friday stock data for Saturday and Sunday
def duplicate_friday_rows_for_weekend(df):
    """Append Saturday and Sunday copies of every Friday row.

    Args:
        df: DataFrame with a ``'Date'`` column of datetime-like values.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: all rows of ``df`` in
        their original order, followed by two copies of each Friday row
        (dated one and two days later), grouped per Friday row.
        ``df`` itself is not modified.
    """
    # weekday() == 4 is Friday (Monday is 0)
    is_friday = pd.to_datetime(df['Date']).dt.weekday == 4
    # A fresh positional index lets the copies be regrouped per Friday below,
    # even when the index of ``df`` contains duplicate labels.
    fridays = df[is_friday].reset_index(drop=True)
    if fridays.empty:
        return df.reset_index(drop=True)

    # Whole-frame copies keep the column dtypes; building the new rows one by
    # one would route every value through an object Series.
    weekend = pd.concat(
        [
            fridays.assign(Date=fridays['Date'] + timedelta(days=offset))
            for offset in (1, 2)  # Saturday, Sunday
        ]
    )
    # Stable sort on the Friday position: Saturday then Sunday for each Friday
    weekend = weekend.sort_index(kind='stable')

    return pd.concat([df, weekend], ignore_index=True)

combined_stocks_df = duplicate_friday_rows_for_weekend(combined_stocks_df)

# The calendar merge above left one all-NaN placeholder row (no airline) for
# every day without quotes, weekends included. Now that the weekend days carry
# the copied Friday quotes, a placeholder on such a day is redundant: drop it.
# Days that still have no quote at all keep their placeholder row.
has_quote = combined_stocks_df['Airline'].notna()
dates_with_quotes = combined_stocks_df.loc[has_quote, 'Date']
redundant_placeholder = ~has_quote & combined_stocks_df['Date'].isin(dates_with_quotes)
# drop_duplicates() also collapses identical placeholder rows, which appear
# when a Friday without quotes was copied onto the following weekend.
combined_stocks_df = combined_stocks_df[~redundant_placeholder].drop_duplicates()

# Sort by date and airline
combined_stocks_df.sort_values(by=['Date', 'Airline'], inplace=True)

# Save the adjusted stock DataFrame as a CSV
combined_stocks_file_path = os.path.join(stocks_dir, "combined_stocks_with_weekends.csv")
combined_stocks_df.to_csv(combined_stocks_file_path, index=False, encoding='utf-8')

print("combined_stocks_with_weekends.csv wurde erfolgreich im Ordner 'stocks' gespeichert.")

In [None]:
############## df joins ##############
# Disabled sample record kept inside a bare string literal, so none of it is
# executed. The dict literal inside is missing its closing brace and cannot be
# re-enabled as is.
# NOTE(review): presumably the layout of the flight data that the weather and
# stock CSVs are meant to be joined with - confirm.
'''data = {
    'airline_name': ['KLM', 'KLM', 'KLM'],
    'crawling_date': ['01-08-2024', '01-08-2024', '01-08-2024'],
    'departure_airport': ['Frankfurt', 'Frankfurt', 'Frankfurt'],
    'destination_airport': ['Berlin', 'Hamburg', 'Munich'],
    'date': ['15-09-2024', '16-09-2024', '17-09-2024'],
    'travel_duration': ['08:30', '10:45', '07:50'],
    'departure_time': ['10:00', '14:30', '09:15'],
    'arrival_time': ['18:30', '01:15', '17:05'],
    'transit': [False, True, True],
    'transit_duration': ['00:00', '01:30', '02:00'],
    'price': [450.50, 600.75, 550.00]
'''

########## description of weather parameters ############
# Legend for the weather columns, kept as a bare string literal (German).
# English translation of the text below:
#   time: timestamp (date and time)
#   tavg: average temperature
#   tmin: minimum temperature
#   tmax: maximum temperature
#   prcp: precipitation
#   snow: snowfall
#   wdir: wind direction
#   wspd: wind speed
#   wpgt: wind gusts
#   pres: air pressure
#   tsun: sunshine duration
# NOTE(review): the Meteostat documentation appears to describe `snow` as snow
# depth rather than snowfall, and the legend gives no units - verify both
# against the Meteostat daily data reference.
"""time: Zeitstempel (Datum und Uhrzeit)
tavg: Durchschnittstemperatur
tmin: Minimaltemperatur
tmax: Maximaltemperatur
prcp: Niederschlag
snow: Schneefall
wdir: Windrichtung
wspd: Windgeschwindigkeit
wpgt: Windböen
pres: Luftdruck
tsun: Sonnenscheindauer"""