In [21]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import pandas as pd

def create_pipeline():
    """Build the (unfitted) churn-classification pipeline.

    The pipeline has three named steps, applied in order:

    1. ``preprocessing`` -- a ``ColumnTransformer`` that
       * median-imputes and standardizes the numeric columns, and
       * imputes the most frequent value and one-hot encodes the
         categorical columns (categories unseen during ``fit`` are ignored
         at transform time because of ``handle_unknown='ignore'``).
       Input columns not listed below are dropped by the transformer's
       default ``remainder`` setting.
    2. ``lda`` -- linear discriminant analysis projecting onto one component.
    3. ``model`` -- L1-penalized logistic regression (``liblinear`` solver).

    Returns:
        An unfitted ``imblearn.pipeline.Pipeline``. The input passed to
        ``fit``/``predict`` must be indexable by the column names listed
        below (e.g. a pandas DataFrame).
    """
    numeric_features = [
        'Age',
        'Tenure',
        'Usage Frequency',
        'Support Calls',
        'Payment Delay',
        'Total Spend',
        'Last Interaction',
    ]
    categorical_features = ['Gender', 'Subscription Type', 'Contract Length']

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # OneHotEncoder produces sparse output, and ColumnTransformer would keep
    # the stacked result sparse whenever its overall density drops below the
    # default sparse_threshold (0.3). The LDA step that follows needs a dense
    # array, so force dense output regardless of how many categories appear.
    preprocessor = ColumnTransformer(
        [
            ('num', num_pipeline, numeric_features),
            ('cat', cat_pipeline, categorical_features),
        ],
        sparse_threshold=0,
    )

    classifier = LogisticRegression(
        solver='liblinear',
        penalty='l1',
        max_iter=100,
        C=0.01,
        random_state=42,
    )

    # NOTE: the imblearn Pipeline is used here although no resampling step
    # is present; it then runs the listed steps in sequence.
    pipeline = ImbPipeline([
        ('preprocessing', preprocessor),
        ('lda', LDA(n_components=1)),
        ('model', classifier),
    ])

    return pipeline

# Locations of the training data and of the serialized model artifact.
train_csv_path = 'C:/Users/Nat Andrew/Desktop/CustomerChurnPrediction/data/churn_train.csv'
model_output_path = 'C:/Users/Nat Andrew/Desktop/CustomerChurnPrediction/models/churn_pipeline.pkl'

# Load the training set, discarding rows that have no 'Churn' label.
df_train = pd.read_csv(train_csv_path).dropna(subset=['Churn'])

# The target is the 'Churn' column; the features are every remaining column
# except 'CustomerID' and the target itself.
y_train = df_train['Churn']
X_train = df_train.drop(columns=['CustomerID', 'Churn'])

# Build the pipeline and fit it on the training data.
pipeline = create_pipeline()
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk.
joblib.dump(pipeline, model_output_path)

['C:/Users/Nat Andrew/Desktop/CustomerChurnPrediction/models/churn_pipeline.pkl']