<a href="https://colab.research.google.com/github/munga21407/data_analysis/blob/main/Data%20science.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

def _read_table(path):
    """Read one data file into a DataFrame, picking the parser from the file extension.

    Anything whose name ends in '.csv' (case-insensitive) is read with
    ``pd.read_csv``; every other input is handed to ``pd.read_json``.
    """
    # str() lets pathlib.Path objects through; lower() makes 'DATA.CSV' match too.
    if str(path).lower().endswith('.csv'):
        return pd.read_csv(path)
    return pd.read_json(path)


def _clean_table(frame):
    """Return a copy of ``frame`` without incomplete rows and with 'timestamp' parsed.

    Rows containing any missing value are dropped (the surviving rows keep their
    original index labels) and the 'timestamp' column is converted with
    ``pd.to_datetime``. Raises ``KeyError`` if there is no 'timestamp' column.
    """
    # assign() builds a new frame, so nothing is mutated in place and no
    # SettingWithCopyWarning can be triggered on the dropna() result.
    return frame.dropna().assign(timestamp=lambda d: pd.to_datetime(d['timestamp']))


def load_and_clean_data(sales_data_path, weather_data_path, merge_on=None):
    """Load the sales and weather files, clean both, and optionally merge them.

    Parameters
    ----------
    sales_data_path, weather_data_path : str or path-like
        Location of each file. Names ending in '.csv' (any letter case) are
        read as CSV; everything else is read as JSON.
    merge_on : str or list of str, optional
        Column name(s) to join on. When given (and non-empty) the two cleaned
        frames are merged with ``pd.merge`` using its default join type.

    Returns
    -------
    pandas.DataFrame
        The merged frame, when ``merge_on`` is given.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(sales_data, weather_data)`` when ``merge_on`` is omitted or empty.

    Raises
    ------
    KeyError
        If either file lacks a 'timestamp' column.
    """
    sales_data = _clean_table(_read_table(sales_data_path))
    weather_data = _clean_table(_read_table(weather_data_path))

    if merge_on:
        return pd.merge(sales_data, weather_data, on=merge_on)
    # No merge keys: hand back the two cleaned frames separately.
    return sales_data, weather_data

# Example usage:
# NOTE: both paths below are placeholders — point them at real CSV/JSON files
# before running this cell, otherwise the load call fails on the missing files.
sales_path = 'path_to_sales_data.csv'
weather_path = 'path_to_weather_data.csv'
# Passing merge_on makes load_and_clean_data return a single merged DataFrame
# rather than a (sales, weather) tuple.
merged_data = load_and_clean_data(sales_path, weather_path, merge_on=['location', 'timestamp'])
print(merged_data.head())


In [None]:
import pandas as pd

# Month number -> season label used for the 'season' feature.
_SEASON_BY_MONTH = {
    1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Fall', 10: 'Fall',
    11: 'Fall', 12: 'Winter',
}


def create_features(df, holiday_dates=None):
    """Add rolling, lagged and calendar feature columns to ``df``.

    The frame is modified in place (four columns are added or overwritten)
    and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'location', 'sales', 'rain' and a datetime 'timestamp'
        column. The rolling window and the lag operate on row order within
        each location, so rows should already be in chronological order for
        "7 rows" / "2 rows" to mean 7 / 2 consecutive observations.
    holiday_dates : iterable of date-like, optional
        Dates to flag as holidays (``datetime.date``, Timestamps or date
        strings). If omitted, a module-level ``holiday_dates`` variable is
        used when one exists; otherwise no row is flagged.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with these columns set:
        'rolling_sales_avg' (mean of up to the last 7 'sales' rows per location),
        'lagged_rain' ('rain' shifted down 2 rows per location),
        'is_holiday' (bool) and 'season' (str).
    """
    if holiday_dates is None:
        # Backward compatibility: earlier versions read a global named
        # `holiday_dates`; honour it if the notebook defines one instead of
        # raising NameError when it does not.
        holiday_dates = globals().get('holiday_dates')

    if holiday_dates is None:
        holidays = []
    else:
        # Normalise every entry to datetime.date so it compares equal to the
        # values produced by `.dt.date` below, whatever type the caller used.
        holidays = list(pd.to_datetime(list(holiday_dates)).date)

    # Rolling sales averages (window of 7 rows, partial windows allowed)
    df['rolling_sales_avg'] = df.groupby('location')['sales'].transform(
        lambda x: x.rolling(window=7, min_periods=1).mean()
    )

    # Lagged weather feature: rain value from two rows earlier at the same location
    df['lagged_rain'] = df.groupby('location')['rain'].shift(2)

    # Calendar features
    df['is_holiday'] = df['timestamp'].dt.date.isin(holidays)
    df['season'] = df['timestamp'].dt.month.map(_SEASON_BY_MONTH)

    return df

# Example usage:
# Example holiday calendar (placeholder values — replace with the real dates).
# create_features reads this module-level name when flagging holidays.
holiday_dates = list(pd.to_datetime(['2024-01-01', '2024-12-25']).date)
# `sales_path` / `weather_path` are the paths set in the data-loading example above.
cleaned_df = load_and_clean_data(sales_path, weather_path, merge_on=['location', 'timestamp'])
feature_enriched_df = create_features(cleaned_df)
print(feature_enriched_df.head())


In [None]:
# split_data(df, target_col, test_size)
#
# Splits the DataFrame into training and testing sets.
#   - target_col: the column you are predicting (e.g., 'units_sold').
#   - test_size: the proportion of rows held out for validation.
#
# NOTE: this cell only describes the function's intended interface; it does not define it.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

def train_model(X_train, y_train, model_type):
    """Build the requested regressor, fit it on the training data and return it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to the estimator's ``fit``.
    model_type : str
        "linear_regression" for ``LinearRegression()`` or "random_forest" for
        ``RandomForestRegressor(n_estimators=100, random_state=42)``.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model_type`` is neither of the two supported names.
    """
    # Reject unknown names up front so the branches below only see valid input.
    if model_type not in ("linear_regression", "random_forest"):
        raise ValueError("Invalid model type. Supported types are 'linear_regression' and 'random_forest'.")

    if model_type == "random_forest":
        # Fixed seed keeps the forest reproducible between runs.
        estimator = RandomForestRegressor(n_estimators=100, random_state=42)
    else:
        estimator = LinearRegression()

    estimator.fit(X_train, y_train)
    return estimator

# Example usage:
# Assuming X_train and y_train are already defined
# NOTE(review): no cell shown here creates X_train / y_train — they must be
# produced (e.g. by a train/test split) before this cell can run on a fresh kernel.
trained_model = train_model(X_train, y_train, model_type="random_forest")


In [None]:
import numpy as np
import pandas as pd

def forecast(model, X_new, forecast_horizon):
    """Predict with ``model`` at daily or weekly granularity.

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    X_new
        Feature rows to predict on, one row per day for the weekly horizon.
    forecast_horizon : {'daily', 'weekly'}
        'daily' returns the model's predictions unchanged. 'weekly' sums the
        daily predictions over consecutive blocks of 7 rows; a final block
        with fewer than 7 rows is summed as-is.

    Returns
    -------
    For 'daily': whatever ``model.predict(X_new)`` returns.
    For 'weekly': a list with one summed value per 7-row block (``[]`` for
    empty input, in which case the model is not called).

    Raises
    ------
    ValueError
        If ``forecast_horizon`` is not 'daily' or 'weekly'.
    """
    if forecast_horizon == 'daily':
        return model.predict(X_new)
    if forecast_horizon != 'weekly':
        raise ValueError("Invalid forecast horizon. Supported horizons are 'daily' and 'weekly'.")

    # Prefer .shape for the row count so inputs without a usable len() still work.
    n_rows = X_new.shape[0] if hasattr(X_new, 'shape') else len(X_new)
    if n_rows == 0:
        return []

    # Predict once for all rows, then aggregate; this avoids one predict()
    # call per week. asarray() also tolerates models that return plain lists.
    daily = np.asarray(model.predict(X_new))
    return [daily[start:start + 7].sum() for start in range(0, n_rows, 7)]

# Example usage:
# Assuming trained_model and X_new are already defined
# NOTE(review): trained_model comes from the training example; no cell shown
# here creates X_new, so it must be prepared before this cell can run.
sales_forecasts_daily = forecast(trained_model, X_new, forecast_horizon='daily')
# Weekly forecasts are the daily predictions summed in blocks of 7 rows.
sales_forecasts_weekly = forecast(trained_model, X_new, forecast_horizon='weekly')


In [None]:
def optimize_inventory(forecasts, lead_times, safety_stock_levels, reorder_point_multiplier=2):
    """Compute a recommended stock level for each item.

    The three input sequences are walked in parallel; for each position the
    result is ``forecast * lead_time + safety_stock * reorder_point_multiplier``.
    Iteration stops at the shortest of the three inputs.

    Parameters
    ----------
    forecasts, lead_times, safety_stock_levels : iterable of numbers
        Per-item forecasted sales, restocking lead time and safety stock.
    reorder_point_multiplier : number, default 2
        Factor applied to the safety-stock term.

    Returns
    -------
    list
        One recommended level per item, in input order.
    """
    return [
        demand * lead + buffer * reorder_point_multiplier
        for demand, lead, buffer in zip(forecasts, lead_times, safety_stock_levels)
    ]

# Example usage:
# The three lists are matched position by position (one entry per item).
forecasts = [100, 150, 120]  # Example forecasted sales
lead_times = [7, 14, 10]  # Example lead times for restocking
safety_stock_levels = [20, 30, 25]  # Example safety stock levels
# With the default multiplier of 2 this yields [740, 2160, 1250].
recommended_stock_levels = optimize_inventory(forecasts, lead_times, safety_stock_levels)
print(recommended_stock_levels)
