## Tsfresh data preparation


In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import tsfresh as tsf
from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Read the raw invoice and client tables from disk
invoice = pd.read_csv(filepath_or_buffer='data/raw/invoice_updated.csv')
client = pd.read_csv(filepath_or_buffer='data/raw/client_train.csv')

In [3]:
# Join invoices to clients on the shared client identifier, keeping
# unmatched rows from both sides (outer join)
merged_df = invoice.merge(client, how='outer', on='client_id')

In [4]:
# Discard the columns that are not used downstream
merged_df = merged_df.drop(
    columns=[
        'year',
        'month',
        'invoice_year',
        'months_number',
        'creation_date',
        'counter_number',
        'counter_code',
    ]
)

In [5]:
# Parse the invoice_date strings (YYYY-MM-DD) into datetime values
merged_df = merged_df.assign(
    invoice_date=pd.to_datetime(merged_df['invoice_date'], format='%Y-%m-%d')
)

In [6]:
# Keep only invoices dated 2005-01-01 or later
merged_df = merged_df.loc[lambda frame: frame['invoice_date'] >= '2005-01-01']

In [7]:
# Column roles: the label, the columns routed to the tsfresh branch
# (series id, sort key and the four consumption levels), and everything
# else, which is treated as categorical.
target = ['target']
num_features = [
    'client_id',
    'invoice_date',
    'consumption_level_1',
    'consumption_level_2',
    'consumption_level_3',
    'consumption_level_4',
]
cat_features = [
    column
    for column in merged_df.columns
    if not (column in num_features or column in target)
]

In [8]:
# Preprocessing for categorical features: fill gaps with each column's most
# frequent value, then one-hot encode.
cat_pipeline = Pipeline([
    # `fill_value` is only consulted when strategy='constant', so it is not
    # passed here; the imputation behaviour is unchanged.
    ('simple_imputer', SimpleImputer(strategy='most_frequent')),
    # Categories that were not seen during fit are encoded as all zeros
    # instead of raising when the held-out split is transformed.
    ('one_hot_encoding', OneHotEncoder(handle_unknown='ignore'))
])

In [9]:
class TsfreshFeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer wrapping tsfresh ``extract_features``.

    The input frame is grouped by ``client_id`` and ordered by
    ``invoice_date``; the extracted features are then passed through
    tsfresh's ``impute`` to replace missing values.

    Parameters
    ----------
    default_fc_parameters : dict or tsfresh settings object, optional
        Forwarded to ``extract_features`` when given. ``None`` (the default)
        leaves tsfresh's own default feature set in place, which is the
        original behaviour. A recorded run of this notebook with the default
        ended in ``MemoryError``; a smaller settings object (for example
        ``tsfresh.feature_extraction.MinimalFCParameters()``) reduces the
        number of features computed.
    n_jobs : int, optional
        Forwarded to ``extract_features`` when given; ``None`` keeps
        tsfresh's default.

    Notes
    -----
    NOTE(review): tsfresh presumably returns one row per ``client_id`` rather
    than one row per input row, so the output may not line up with row-level
    transformers or a row-level ``y`` -- confirm before combining them.
    """

    def __init__(self, default_fc_parameters=None, n_jobs=None):
        # Stored under the same names so BaseEstimator.get_params/set_params
        # and clone() work.
        self.default_fc_parameters = default_fc_parameters
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Extract tsfresh features from ``X`` and impute missing values.

        ``X`` must contain the columns ``client_id`` (series id) and
        ``invoice_date`` (sort key).
        """
        # Only forward options that were explicitly set, so the defaults of
        # the installed tsfresh version stay in effect otherwise.
        extraction_kwargs = {}
        if self.default_fc_parameters is not None:
            extraction_kwargs['default_fc_parameters'] = self.default_fc_parameters
        if self.n_jobs is not None:
            extraction_kwargs['n_jobs'] = self.n_jobs

        X_tsfresh = extract_features(
            X,
            column_id='client_id',
            column_sort='invoice_date',
            **extraction_kwargs,
        )
        X_tsfresh = impute(X_tsfresh)  # Handle missing values
        return X_tsfresh

In [10]:
# Single-step pipeline that runs the tsfresh feature extractor
num_pipeline = Pipeline(steps=[('tsfresh', TsfreshFeatureExtractor())])

In [11]:
# Route each column group through its own pipeline; columns that appear in
# neither list are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, cat_features),
        ('num', num_pipeline, num_features),
    ],
    remainder='passthrough',
)

In [14]:
# First-stage classifier. The scikit-learn wrapper of XGBoost takes the number
# of boosted trees as `n_estimators`; `n_trees` is not a recognised parameter.
model_stage1 = XGBClassifier(n_estimators=5, random_state=42)

In [20]:
# The outer merge can leave invoice rows with no matching client and hence no
# label; rows with a missing target cannot be used for supervised training.
df = merged_df.dropna(subset=["target"]).copy()
X = df.drop(columns=["target"])
y = df["target"]

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# End-to-end estimator: column preprocessing followed by the classifier
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model_stage1),
    ]
)

In [21]:
# Persist the cleaned, merged table without the row index
merged_df.to_csv(path_or_buf='data/processed/cleaned_data.csv', index=False)

In [18]:
# Display the assembled pipeline (a bare last expression renders its repr)
model_pipeline

In [None]:

# Fit the full pipeline on the training split, then predict the held-out split
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Show the predicted labels
print("Predictions:", y_pred)

Feature Extraction:   0%|          | 0/30 [00:00<?, ?it/s]

In [19]:
# NOTE(review): repeats the fit already issued in the preceding cell. The
# recorded run of this cell stopped with a MemoryError after roughly 24
# minutes, while the "Feature Extraction" progress bar was still at 0%.
model_pipeline.fit(X_train, y_train)

Feature Extraction:   0%|          | 0/30 [24:07<?, ?it/s]


MemoryError: 