In [2]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.signal import find_peaks
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

class TimeSeriesSegmentation:
    """
    Comprehensive time series segmentation for order forecasting
    """
    
    def __init__(self, data, sku_col='SKU', date_col='date', value_col='actual_orders'):
        """
        Store the long-format input and derive the wide analysis table.

        Parameters:
        - data: long-format polars DataFrame (one row per SKU and date)
        - sku_col: name of the column holding SKU identifiers
        - date_col: name of the column holding dates (should be monthly)
        - value_col: name of the column holding order quantities
        """
        # Result containers; populated by the later pipeline steps
        self.sku_features, self.segments, self.segment_characteristics = {}, {}, {}

        # Keep the raw input together with the column names used to read it
        self.raw_data = data
        self.sku_col, self.date_col, self.value_col = sku_col, date_col, value_col

        # Wide table built from the long input; every analysis step reads this
        self.data = self._prepare_data()
    
    def _prepare_data(self):
        """Convert long format polars dataframe to wide format pandas dataframe.

        Returns:
            A pandas DataFrame indexed by ``date_col`` at month-start
            frequency with one column per SKU. Values are summed per SKU and
            date; months absent from the input are inserted by ``asfreq``.
        """
        df = self.raw_data
        date_dtype = df.schema[self.date_col]

        if date_dtype == pl.Utf8:
            # Only text needs parsing: the ``.str`` namespace raises on columns
            # that already hold Date/Datetime values.
            df = df.with_columns(
                pl.col(self.date_col).str.to_datetime().alias(self.date_col)
            )
        elif date_dtype == pl.Date:
            # Promote plain dates so the pandas index is a DatetimeIndex
            df = df.with_columns(
                pl.col(self.date_col).cast(pl.Datetime).alias(self.date_col)
            )

        # Sort by SKU and date
        df = df.sort([self.sku_col, self.date_col])

        # Pivot to wide format (dates as rows, SKUs as columns)
        wide_df = df.pivot(
            index=self.date_col,
            columns=self.sku_col,
            values=self.value_col,
            aggregate_function='sum'
        )

        # Convert to pandas for easier time series analysis
        wide_pandas = wide_df.to_pandas().set_index(self.date_col)

        # Ensure monthly (month-start) frequency
        return wide_pandas.asfreq('MS')
        
    def calculate_sku_features(self):
        """Calculate comprehensive features for each SKU"""
        features = {}
        
        for sku in self.data.columns:
            series = self.data[sku].dropna()
            
            if len(series) < 12:  # Need at least 12 months
                continue
                
            # Basic statistics
            mean_volume = series.mean()
            std_volume = series.std()
            cv = std_volume / mean_volume if mean_volume > 0 else np.inf
            
            # Trend analysis
            x = np.arange(len(series))
            slope, intercept, r_value, p_value, std_err = stats.linregress(x, series)
            trend_strength = abs(r_value)
            trend_direction = 1 if slope > 0 else -1 if slope < 0 else 0
            
            # Seasonality detection
            seasonality_strength = self._detect_seasonality(series)
            
            # Growth/decline patterns
            growth_rate = self._calculate_growth_rate(series)
            volatility = self._calculate_volatility(series)
            
            # Volume characteristics
            volume_category = self._categorize_volume(mean_volume)
            
            # Lifecycle stage
            lifecycle_stage = self._detect_lifecycle_stage(series)
            
            # Structural breaks
            break_points = self._detect_structural_breaks(series)
            
            features[sku] = {
                'mean_volume': mean_volume,
                'cv': cv,
                'trend_strength': trend_strength,
                'trend_direction': trend_direction,
                'seasonality_strength': seasonality_strength,
                'growth_rate': growth_rate,
                'volatility': volatility,
                'volume_category': volume_category,
                'lifecycle_stage': lifecycle_stage,
                'break_points': len(break_points),
                'series_length': len(series)
            }
        
        self.sku_features = features
        return features
    
    def _detect_seasonality(self, series):
        """Detect seasonal patterns using autocorrelation"""
        if len(series) < 24:
            return 0
        
        # Calculate autocorrelation at lag 12 (annual seasonality)
        autocorr_12 = series.autocorr(lag=12)
        
        # Also check for other seasonal patterns
        seasonal_lags = [12]  # Quarterly, semi-annual, annual
        max_autocorr = 0
        
        for lag in seasonal_lags:
            if len(series) > lag * 2:
                autocorr = abs(series.autocorr(lag=lag))
                if not np.isnan(autocorr):
                    max_autocorr = max(max_autocorr, autocorr)
        
        return max_autocorr
    
    def _calculate_growth_rate(self, series):
        """Calculate compound annual growth rate"""
        if len(series) < 12:
            return 0
        
        start_val = series.iloc[:6].mean()  # First 6 months average
        end_val = series.iloc[-6:].mean()   # Last 6 months average
        
        if start_val <= 0:
            return 0
        
        periods = len(series) / 12  # Years
        growth_rate = (end_val / start_val) ** (1/periods) - 1
        return growth_rate
    
    def _calculate_volatility(self, series):
        """Calculate volatility as rolling CV"""
        if len(series) < 6:
            return 0
        
        rolling_mean = series.rolling(window=6, min_periods=3).mean()
        rolling_std = series.rolling(window=6, min_periods=3).std()
        rolling_cv = rolling_std / rolling_mean
        
        return rolling_cv.mean()
    
    def _categorize_volume(self, mean_volume):
        """Categorize volume into high/medium/low"""
        # This should be adapted based on your specific volume distribution
        if mean_volume < 100:
            return 'low'
        elif mean_volume < 1000:
            return 'medium'
        else:
            return 'high'
    
    def _detect_lifecycle_stage(self, series):
        """Detect product lifecycle stage"""
        if len(series) < 12:
            return 'unknown'
        
        # Split into thirds
        third = len(series) // 3
        first_third = series.iloc[:third].mean()
        second_third = series.iloc[third:2*third].mean()
        last_third = series.iloc[2*third:].mean()
        
        # Growth patterns
        if first_third < second_third < last_third:
            return 'growth'
        elif first_third > second_third > last_third:
            return 'decline'
        elif first_third < second_third and second_third > last_third:
            return 'maturity'
        else:
            return 'stable'
    
    def _detect_structural_breaks(self, series):
        """Simple structural break detection using rolling statistics"""
        if len(series) < 12:
            return []
        
        # Calculate rolling mean and std
        window = 6
        rolling_mean = series.rolling(window=window).mean()
        rolling_std = series.rolling(window=window).std()
        
        # Find significant changes
        mean_changes = rolling_mean.diff().abs()
        std_changes = rolling_std.diff().abs()
        
        # Identify breaks (simplified approach)
        mean_threshold = mean_changes.quantile(0.8)
        std_threshold = std_changes.quantile(0.8)
        
        breaks = []
        for i in range(window, len(series)):
            if (mean_changes.iloc[i] > mean_threshold or 
                std_changes.iloc[i] > std_threshold):
                breaks.append(i)
        
        return breaks
    
    def segment_skus(self, method='combined'):
        """
        Segment SKUs based on characteristics
        Methods: 'statistical', 'business_logic', 'combined'
        """
        if not self.sku_features:
            self.calculate_sku_features()
        
        if method == 'statistical':
            return self._statistical_segmentation()
        elif method == 'business_logic':
            return self._business_logic_segmentation()
        else:
            return self._combined_segmentation()
    
    def _statistical_segmentation(self):
        """Cluster SKUs based on statistical features"""
        # Prepare feature matrix
        feature_names = ['cv', 'trend_strength', 'seasonality_strength', 
                        'growth_rate', 'volatility', 'break_points']
        
        X = []
        skus = []
        
        for sku, features in self.sku_features.items():
            row = [features.get(f, 0) for f in feature_names]
            
            # Check for valid values (no inf or nan)
            if all(np.isfinite(val) for val in row):
                X.append(row)
                skus.append(sku)
        
        if len(X) < 2:
            # Not enough valid data for clustering
            segments = {}
            for sku in self.sku_features.keys():
                segments[sku] = 'single_cluster'
            self.segments = segments
            return segments
        
        X = np.array(X)
        
        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Find optimal number of clusters
        best_k = self._find_optimal_clusters(X_scaled)
        
        # Perform clustering
        kmeans = KMeans(n_clusters=best_k, random_state=42)
        clusters = kmeans.fit_predict(X_scaled)
        
        # Assign segments
        segments = {}
        for i, sku in enumerate(skus):
            segments[sku] = f'statistical_cluster_{clusters[i]}'
        
        # Handle SKUs that were excluded from clustering
        for sku in self.sku_features.keys():
            if sku not in segments:
                segments[sku] = 'outlier_cluster'
        
        self.segments = segments
        return segments
    
    def _business_logic_segmentation(self):
        """Segment based on business logic"""
        segments = {}
        
        for sku, features in self.sku_features.items():
            # Get features with default values for missing/invalid data
            volume_cat = features.get('volume_category', 'unknown')
            lifecycle = features.get('lifecycle_stage', 'unknown')
            seasonality = features.get('seasonality_strength', 0)
            volatility = features.get('volatility', 0)
            growth_rate = features.get('growth_rate', 0)
            
            # Handle infinite or NaN values
            if not np.isfinite(seasonality):
                seasonality = 0
            if not np.isfinite(volatility):
                volatility = 0
            if not np.isfinite(growth_rate):
                growth_rate = 0
            
            # Define segment based on characteristics
            if seasonality > 0.3:
                segment = f'seasonal_{volume_cat}'
            elif growth_rate > 0.2:
                segment = f'growth_{volume_cat}'
            elif growth_rate < -0.2:
                segment = f'decline_{volume_cat}'
            elif volatility > 0.5:
                segment = f'volatile_{volume_cat}'
            else:
                segment = f'stable_{volume_cat}'
            
            segments[sku] = segment
        
        self.segments = segments
        return segments
    
    def _combined_segmentation(self):
        """Combine statistical and business logic approaches"""
        # Start with business logic
        business_segments = self._business_logic_segmentation()
        
        # Refine with statistical clustering within business segments
        refined_segments = {}
        
        for segment_name in set(business_segments.values()):
            skus_in_segment = [sku for sku, seg in business_segments.items() 
                             if seg == segment_name]
            
            if len(skus_in_segment) > 3:  # Only cluster if enough SKUs
                # Extract features for this segment
                X = []
                valid_skus = []
                
                for sku in skus_in_segment:
                    if sku in self.sku_features:
                        features = self.sku_features[sku]
                        row = [features['cv'], features['trend_strength'], 
                              features['volatility']]
                        
                        # Check for valid values (no inf or nan)
                        if all(np.isfinite(val) for val in row):
                            X.append(row)
                            valid_skus.append(sku)
                
                if len(valid_skus) > 1:
                    X = np.array(X)
                    
                    # Perform sub-clustering
                    scaler = StandardScaler()
                    X_scaled = scaler.fit_transform(X)
                    
                    n_clusters = min(3, len(valid_skus))
                    if n_clusters > 1:
                        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
                        sub_clusters = kmeans.fit_predict(X_scaled)
                        
                        # Assign refined segments to valid SKUs
                        for i, sku in enumerate(valid_skus):
                            refined_segments[sku] = f'{segment_name}_sub{sub_clusters[i]}'
                    else:
                        # If only one cluster, keep original segment name
                        for sku in valid_skus:
                            refined_segments[sku] = segment_name
                    
                    # Assign original segment name to invalid SKUs
                    for sku in skus_in_segment:
                        if sku not in valid_skus:
                            refined_segments[sku] = segment_name
                else:
                    # Not enough valid SKUs for clustering
                    for sku in skus_in_segment:
                        refined_segments[sku] = segment_name
            else:
                # Not enough SKUs in segment for sub-clustering
                for sku in skus_in_segment:
                    refined_segments[sku] = segment_name
        
        self.segments = refined_segments
        return refined_segments
    
    def _find_optimal_clusters(self, X):
        """Find optimal number of clusters using silhouette score"""
        silhouette_scores = []
        K_range = range(2, min(10, len(X)))
        
        for k in K_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(X)
            score = silhouette_score(X, labels)
            silhouette_scores.append(score)
        
        return K_range[np.argmax(silhouette_scores)]
    
    def analyze_segments(self):
        """Analyze characteristics of each segment"""
        if not self.segments:
            print("No segments found. Run segment_skus() first.")
            return
        
        segment_analysis = {}
        
        for segment_name in set(self.segments.values()):
            skus_in_segment = [sku for sku, seg in self.segments.items() 
                             if seg == segment_name]
            
            # Calculate segment characteristics
            characteristics = {
                'sku_count': len(skus_in_segment),
                'avg_volume': np.mean([self.sku_features[sku]['mean_volume'] 
                                     for sku in skus_in_segment]),
                'avg_cv': np.mean([self.sku_features[sku]['cv'] 
                                 for sku in skus_in_segment]),
                'avg_seasonality': np.mean([self.sku_features[sku]['seasonality_strength'] 
                                          for sku in skus_in_segment]),
                'avg_growth_rate': np.mean([self.sku_features[sku]['growth_rate'] 
                                          for sku in skus_in_segment]),
                'skus': skus_in_segment
            }
            
            segment_analysis[segment_name] = characteristics
        
        self.segment_characteristics = segment_analysis
        return segment_analysis
    
    def recommend_forecasting_models(self):
        """Recommend appropriate forecasting models for each segment"""
        if not self.segment_characteristics:
            self.analyze_segments()
        
        recommendations = {}
        
        for segment, chars in self.segment_characteristics.items():
            models = []
            
            # Based on segment characteristics
            if chars['avg_seasonality'] > 0.3:
                models.extend(['Prophet', 'SARIMA', 'Holt-Winters'])
            
            if chars['avg_cv'] > 0.8:  # High variability
                models.extend(['Prophet', 'LSTM', 'Random Forest'])
            
            if chars['avg_growth_rate'] > 0.2:  # Strong growth
                models.extend(['Prophet', 'Linear Regression with Trend'])
            
            if chars['sku_count'] < 5:  # Small segment
                models.extend(['Simple Exponential Smoothing', 'Linear Regression'])
            
            # Default models
            if not models:
                models = ['ARIMA', 'Simple Exponential Smoothing']
            
            recommendations[segment] = list(set(models))
        
        return recommendations
    
    def plot_segments(self, figsize=(15, 10)):
        """Visualize segments and their characteristics"""
        if not self.segments:
            print("No segments found. Run segment_skus() first.")
            return
        
        fig, axes = plt.subplots(2, 2, figsize=figsize)
        
        # Prepare data for plotting
        segment_data = []
        for sku, segment in self.segments.items():
            features = self.sku_features[sku]
            segment_data.append({
                'sku': sku,
                'segment': segment,
                'volume': features['mean_volume'],
                'cv': features['cv'],
                'seasonality': features['seasonality_strength'],
                'growth_rate': features['growth_rate']
            })
        
        df = pd.DataFrame(segment_data)
        
        # Plot 1: Volume vs CV by segment
        sns.scatterplot(data=df, x='volume', y='cv', hue='segment', ax=axes[0,0])
        axes[0,0].set_title('Volume vs Coefficient of Variation')
        axes[0,0].set_xscale('log')
        
        # Plot 2: Seasonality vs Growth Rate by segment
        sns.scatterplot(data=df, x='seasonality', y='growth_rate', hue='segment', ax=axes[0,1])
        axes[0,1].set_title('Seasonality vs Growth Rate')
        
        # Plot 3: Segment distribution
        segment_counts = df['segment'].value_counts()
        axes[1,0].bar(range(len(segment_counts)), segment_counts.values)
        axes[1,0].set_xticks(range(len(segment_counts)))
        axes[1,0].set_xticklabels(segment_counts.index, rotation=45)
        axes[1,0].set_title('SKUs per Segment')
        
        # Plot 4: Average characteristics by segment
        segment_summary = df.groupby('segment')[['volume', 'cv', 'seasonality', 'growth_rate']].mean()
        segment_summary.plot(kind='bar', ax=axes[1,1])
        axes[1,1].set_title('Average Characteristics by Segment')
        axes[1,1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        
        plt.tight_layout()
        plt.show()

# Example usage (wrapped in a string literal below, so it is not executed)
'''
def create_sample_data():
    """Create sample data in long format for demonstration"""
    np.random.seed(42)
    dates = pd.date_range('2022-01-01', periods=36, freq='MS')
    
    # Create different types of SKUs
    skus_data = []
    
    # Seasonal SKU
    seasonal_pattern = 50 + 20 * np.sin(2 * np.pi * np.arange(36) / 12)
    seasonal_orders = seasonal_pattern + np.random.normal(0, 5, 36)
    
    # Growth SKU
    growth_trend = 100 + 5 * np.arange(36)
    growth_orders = growth_trend + np.random.normal(0, 10, 36)
    
    # Volatile SKU
    volatile_base = 200 + np.random.normal(0, 50, 36)
    volatile_orders = np.abs(volatile_base)
    
    # Stable SKU
    stable_orders = 150 + np.random.normal(0, 8, 36)
    
    # Decline SKU
    decline_trend = 300 - 5 * np.arange(36)
    decline_orders = decline_trend + np.random.normal(0, 15, 36)
    
    # Create long format data
    all_data = []
    
    sku_patterns = {
        'SKU001': seasonal_orders,
        'SKU002': growth_orders,
        'SKU003': volatile_orders,
        'SKU004': stable_orders,
        'SKU005': decline_orders
    }
    
    for sku, orders in sku_patterns.items():
        for i, date in enumerate(dates):
            all_data.append({
                'SKU': sku,
                'date': date.strftime('%Y-%m-%d'),
                'actual_orders': max(0, orders[i])  # Ensure non-negative
            })
    
    return pl.DataFrame(all_data)

# Demo
if __name__ == "__main__":
    # Create sample data in long format
    sample_data = create_sample_data()
    print("Sample data structure:")
    print(sample_data.head())
    
    # Initialize segmentation
    segmenter = TimeSeriesSegmentation(sample_data)
    
    # Calculate features
    features = segmenter.calculate_sku_features()
    print("\nSKU Features:")
    for sku, feat in features.items():
        print(f"{sku}: CV={feat['cv']:.2f}, Seasonality={feat['seasonality_strength']:.2f}, Growth={feat['growth_rate']:.2f}")
    
    # Segment SKUs
    segments = segmenter.segment_skus(method='combined')
    print("\nSegments:")
    for sku, segment in segments.items():
        print(f"{sku}: {segment}")
    
    # Analyze segments
    analysis = segmenter.analyze_segments()
    print("\nSegment Analysis:")
    for segment, chars in analysis.items():
        print(f"{segment}: {chars['sku_count']} SKUs, Avg Volume: {chars['avg_volume']:.0f}")
    
    # Get model recommendations
    recommendations = segmenter.recommend_forecasting_models()
    print("\nModel Recommendations:")
    for segment, models in recommendations.items():
        print(f"{segment}: {models}")
    
    # Plot segments
    segmenter.plot_segments()'''

'\ndef create_sample_data():\n    """Create sample data in long format for demonstration"""\n    np.random.seed(42)\n    dates = pd.date_range(\'2022-01-01\', periods=36, freq=\'MS\')\n\n    # Create different types of SKUs\n    skus_data = []\n\n    # Seasonal SKU\n    seasonal_pattern = 50 + 20 * np.sin(2 * np.pi * np.arange(36) / 12)\n    seasonal_orders = seasonal_pattern + np.random.normal(0, 5, 36)\n\n    # Growth SKU\n    growth_trend = 100 + 5 * np.arange(36)\n    growth_orders = growth_trend + np.random.normal(0, 10, 36)\n\n    # Volatile SKU\n    volatile_base = 200 + np.random.normal(0, 50, 36)\n    volatile_orders = np.abs(volatile_base)\n\n    # Stable SKU\n    stable_orders = 150 + np.random.normal(0, 8, 36)\n\n    # Decline SKU\n    decline_trend = 300 - 5 * np.arange(36)\n    decline_orders = decline_trend + np.random.normal(0, 15, 36)\n\n    # Create long format data\n    all_data = []\n\n    sku_patterns = {\n        \'SKU001\': seasonal_orders,\n        \'SKU002\': g

In [3]:
from datetime import datetime
from dateutil.relativedelta import relativedelta
# Load the Trauma history and keep rows dated no later than one month ago
idf = pl.read_parquet("C:\\Users\\smishra14\\setup\\repos\\fcst\\data\\Trauma.parquet")
df = idf.filter(
    pl.col('SALES_DATE') <= datetime.today() - relativedelta(months=1)
)

# Build the segmenter; the date is handed over as text and the order
# quantity as Float32 (adjust column names as needed)
segmenter = TimeSeriesSegmentation(
    data=df.with_columns(
        pl.col('SALES_DATE').cast(pl.Utf8),
        pl.col('`Act Orders Rev').cast(pl.Float32),
    ),
    sku_col='CatalogNumber',      # SKU identifier column
    date_col='SALES_DATE',        # date column
    value_col='`Act Orders Rev',  # order quantity column
)

# Segment the SKUs, summarise each segment, then map segments to models
segments = segmenter.segment_skus(method='combined')
analysis = segmenter.analyze_segments()
recommendations = segmenter.recommend_forecasting_models()

In [4]:
# segments.csv: one row per SKU with its segment label
(
    pl.from_dict(segments)
    .transpose(include_header=True, header_name='CatalogNumber', column_names=["segment"])
    .write_csv("segments.csv")
)

# analysis.csv: the per-segment characteristics stacked into one frame,
# each part tagged with the segment it belongs to
ff = pl.DataFrame()
for segment_name, characteristics in analysis.items():
    dff = pl.from_dict(characteristics).with_columns(segment=pl.lit(segment_name))
    ff = pl.concat([ff, dff], how='diagonal_relaxed')
ff.write_csv("analysis.csv")

In [2]:
import polars as pl
# Reload the Trauma parquet file and show summary statistics for every column
df=pl.read_parquet(r"C:\Users\smishra14\setup\repos\fcst\data\Trauma.parquet")
df.describe()


statistic,Selling Division,Area,Stryker Group Region,Region,Country,CatalogNumber,Business Sector,Business Unit,Franchise,Product Line,IBP Level 5,IBP Level 6,IBP Level 7,SALES_DATE,UOM,Pack Content,`L0 ASP Final Rev,`Act Orders Rev,Act Orders Rev Val,L2 DF Final Rev,L1 DF Final Rev,L0 DF Final Rev,L2 Stat Final Rev,`Fcst DF Final Rev,`Fcst Stat Final Rev,`Fcst Stat Prelim Rev,Fcst DF Final Rev Val
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""","""734160""",734160.0,734160.0,594965.0,205935.0,205137.0,467487.0,564335.0,565647.0,458528.0,305900.0,251045.0,242777.0,734160.0
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,0.0,139195.0,528225.0,529023.0,266673.0,169825.0,168513.0,275632.0,428260.0,483115.0,491383.0,0.0
"""mean""",,,,,,,,,,,,,,"""2025-02-14 12:47:59.999999""",0.988121,1.077803,969.030499,36.945133,13009.980796,30.314416,25.154556,25.005851,29.740354,22.258892,27.22214,27.55798,3525.598852
"""std""",,,,,,,,,,,,,,,0.097875,1.095921,1773.848025,191.638396,65359.091245,182.161462,166.960894,167.7697,177.159653,167.118273,181.95925,180.254423,40291.481297
"""min""","""CMF""","""United States""","""UNITED STATES""","""United States""","""UNITED STATES""","""0011201""","""Orthopaedics and Spine""","""Trauma""","""Trauma and Extremities""","""External Fixation""","""4FUSION""","""115 TR""","""0.3 SPLIT FREEZE-DRIED TRAD ST…","""2022-09-01 00:00:00""",0.01,1.0,-24656.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3206.3
"""25%""",,,,,,,,,,,,,,"""2023-12-01 00:00:00""",1.0,1.0,146.53509,1.0,407.41,0.296415,0.0,0.0,0.312807,0.0,0.0,0.0,0.0
"""50%""",,,,,,,,,,,,,,"""2025-03-01 00:00:00""",1.0,1.0,401.0,4.0,1981.0,2.345941,1.219896,1.078099,2.379283,0.090928,1.0,1.0,0.0
"""75%""",,,,,,,,,,,,,,"""2026-05-01 00:00:00""",1.0,1.0,1467.0,16.0,7695.48,10.358045,7.320674,7.0,10.276866,4.484278,7.105101,7.590663,0.0
"""max""","""SUSTAINABILITY""","""United States""","""UNITED STATES""","""United States""","""UNITED STATES""","""XSEXF010101""","""Orthopaedics and Spine""","""Trauma""","""Trauma and Extremities""","""Trauma Biologics""","""XPRESS DR AND UDR""","""XPRESS INSTRUMENTS- Obs""","""XPRESS DR AND UDR INSTRUMENT K…","""2027-08-01 00:00:00""",1.0,100.0,75855.6068,6731.0,4115100.0,8538.24313,8538.24313,9135.920149,7222.901113,9135.920149,7389.012162,7389.012162,5548600.0


In [None]:
# Load the Sterishield parquet file and store the `cluster` column as text
df=pl.read_parquet(r"C:\Users\smishra14\setup\repos\fcst\data\Sterishield.parquet")
df=df.with_columns(cluster=pl.col('cluster').cast(pl.Utf8))


In [None]:
# Fill missing `cluster` values within each `unique_id`, in SALES_DATE order:
# forward first, then backward to cover leading gaps.
df = df.sort('SALES_DATE').with_columns(
    cluster=pl.col("cluster").forward_fill().backward_fill().over("unique_id")
)

# Inspect the result without rebinding `df`. `describe()` returns a summary
# table; assigning it to `df` would make a later `df.write_parquet(...)`
# persist the statistics instead of the data.
df.describe()


In [25]:
# Write `df` to the Sterishield parquet path (the same path this notebook
# reads from), so whatever `df` currently holds replaces that file.
df.write_parquet(r"C:\Users\smishra14\setup\repos\fcst\data\Sterishield.parquet")