In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv
/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


In [3]:
# Location of the labelled training data inside the Kaggle input directory.
TRAIN_CSV = '/kaggle/input/summer-analytics-mid-hackathon/hacktrain.csv'

df = pd.read_csv(TRAIN_CSV)
# Column names, non-null counts and dtypes, to see how much data is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  8000 non-null   int64  
 1   ID          8000 non-null   int64  
 2   class       8000 non-null   object 
 3   20150720_N  7440 non-null   float64
 4   20150602_N  6800 non-null   float64
 5   20150517_N  7200 non-null   float64
 6   20150501_N  7040 non-null   float64
 7   20150415_N  7520 non-null   float64
 8   20150330_N  6880 non-null   float64
 9   20150314_N  7280 non-null   float64
 10  20150226_N  6640 non-null   float64
 11  20150210_N  7360 non-null   float64
 12  20150125_N  6960 non-null   float64
 13  20150109_N  7120 non-null   float64
 14  20141117_N  6720 non-null   float64
 15  20141101_N  7600 non-null   float64
 16  20141016_N  6560 non-null   float64
 17  20140930_N  7200 non-null   float64
 18  20140813_N  7440 non-null   float64
 19  20140626_N  6400 non-null  

In [4]:
class NDVIPreprocessor(BaseEstimator, TransformerMixin):
    """Collapse per-date NDVI columns into per-season mean/std features.

    Every input column whose name contains ``'_N'`` is treated as a reading
    taken on the date encoded as ``YYYYMMDD`` before the first underscore of
    the column name. Dates are bucketed by month (Dec-Feb -> Winter,
    Mar-May -> Spring, Jun-Aug -> Summer, anything else -> Fall). Missing
    readings are replaced with the per-column median learned in ``fit``, and
    each row is then summarised by the mean and standard deviation of its
    readings within every season.

    The transformer has no hyper-parameters. All attributes are learned state
    that ``fit`` populates:

    * ``date_cols``     - original names of the reading columns.
    * ``date_map`` / ``renamed_cols`` - original name -> parsed timestamp.
    * ``sorted_dates``  - the parsed timestamps in chronological order.
    * ``season_map``    - timestamp -> season name.
    * ``season_groups`` - season name -> chronological list of timestamps.
    * ``imputers``      - season name -> fitted median ``SimpleImputer``
      (only for seasons that have at least one date).
    * ``fitted``        - ``True`` once ``fit`` has completed.
    """

    # Output columns follow this order: all means first, then all stds.
    _SEASONS = ('Winter', 'Spring', 'Summer', 'Fall')

    def __init__(self):
        self.date_cols = []
        self.date_map = {}
        self.renamed_cols = {}
        self.sorted_dates = []
        self.season_map = {}
        self.season_groups = {}
        self.imputers = {}
        self.fitted = False

    @staticmethod
    def _season_of(date):
        """Return the season name for the month of ``date``."""
        month = date.month
        if month in (12, 1, 2):
            return 'Winter'
        if month in (3, 4, 5):
            return 'Spring'
        if month in (6, 7, 8):
            return 'Summer'
        return 'Fall'

    def fit(self, X, y=None):
        """Learn the date columns, their season grouping and per-column medians.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the ``*_N*`` reading columns; other columns are
            ignored.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` has no column whose name contains ``'_N'``.
        """
        self.date_cols = [col for col in X.columns if '_N' in col]
        if not self.date_cols:
            raise ValueError("No NDVI columns (names containing '_N') found in X.")

        self.date_map = {
            col: pd.to_datetime(col.split('_')[0], format='%Y%m%d')
            for col in self.date_cols
        }
        self.renamed_cols = dict(self.date_map)
        self.sorted_dates = sorted(self.date_map.values())

        self.season_map = {d: self._season_of(d) for d in self.sorted_dates}
        self.season_groups = {season: [] for season in self._SEASONS}
        for date in self.sorted_dates:
            self.season_groups[self.season_map[date]].append(date)

        # Only the reading columns are needed; relabel them with their dates.
        ndvi = X[self.date_cols].rename(columns=self.renamed_cols)

        self.imputers = {}
        for season, cols in self.season_groups.items():
            if not cols:
                # A season without any acquisition date has nothing to impute;
                # fitting an imputer on zero columns would raise.
                continue
            self.imputers[season] = SimpleImputer(strategy='median').fit(ndvi[cols])

        self.fitted = True
        return self

    def transform(self, X):
        """Impute missing readings and return the per-season summary features.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every reading column seen during ``fit``. Other
            columns (identifiers, the target, ...) are ignored and need not be
            present.

        Returns
        -------
        pandas.DataFrame
            Indexed like ``X``, with one ``<Season>_mean`` column per season
            that has dates, followed by one ``<Season>_std`` column per such
            season.

        Raises
        ------
        AssertionError
            If called before ``fit``.
        KeyError
            If ``X`` lacks one of the reading columns seen during ``fit``.
        """
        # Explicit raise (rather than `assert`) so the check survives `python -O`.
        if not self.fitted:
            raise AssertionError("Call fit before transform.")

        ndvi = X[self.date_cols].rename(columns=self.renamed_cols)

        means = {}
        stds = {}
        for season, cols in self.season_groups.items():
            if not cols:
                continue
            # Build a fresh frame from the imputed values instead of assigning
            # back into a slice of the input.
            filled = pd.DataFrame(
                np.asarray(self.imputers[season].transform(ndvi[cols])),
                index=ndvi.index,
                columns=cols,
            )
            means[f'{season}_mean'] = filled.mean(axis=1).to_numpy()
            # The sample std of a single reading is NaN; report zero spread
            # instead so downstream estimators do not receive NaNs.
            stds[f'{season}_std'] = filled.std(axis=1).fillna(0.0).to_numpy()

        return pd.DataFrame({**means, **stds}, index=X.index)


In [5]:
# Replace the string class names with integer codes. The fitted encoder is
# kept in `le` so predictions can be mapped back to names later.
# NOTE: df['class'] is overwritten in place, so re-running this cell without
# reloading df would refit the encoder on the integer codes.
le = LabelEncoder().fit(df['class'])
df['class'] = le.transform(df['class'])

In [6]:
# Seasonal NDVI features -> standardisation -> logistic regression.
pipeline = Pipeline([
    ('ndvi_preprocessing', NDVIPreprocessor()),
    ('scaler', StandardScaler()),
    # `multi_class` is deprecated since scikit-learn 1.5 and scheduled for
    # removal. With the lbfgs solver and three or more classes the default
    # already fits a multinomial model, so the argument is simply dropped.
    # NOTE(review): assumes the target has 3+ classes - confirm.
    ('clf', LogisticRegression(solver='lbfgs', max_iter=1000)),
])

In [7]:
# df still contains the 'class' column, but NDVIPreprocessor only emits the
# seasonal mean/std columns, so the target is never passed on to the classifier.
pipeline.fit(X=df, y=df['class'])

In [8]:
# Location of the unlabelled test data (adjust the filename if needed).
TEST_CSV = '/kaggle/input/summer-analytics-mid-hackathon/hacktest.csv'
test_df = pd.read_csv(TEST_CSV)

# The pipeline predicts integer class codes ...
preds = pipeline.predict(test_df)
# ... which the label encoder maps back to the original class names.
predicted_labels = le.inverse_transform(preds)

In [9]:
# Two-column submission: the test-set IDs alongside the predicted class names.
submission = test_df[['ID']].assign(Predicted=predicted_labels)
submission.to_csv('submission.csv', index=False)

In [10]:
# Sanity check: read the submission back from the same relative path it was
# written to, so the cell does not depend on the working directory being
# /kaggle/working.
subm = pd.read_csv('submission.csv')
subm.head()

Unnamed: 0,ID,Predicted
0,1,forest
1,2,forest
2,3,forest
3,4,forest
4,5,forest
