In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('./data/Telco_customer_churn_with_text.csv')

# Initial exploration
print("Dataset shape:", df.shape)
print("\nFeature distribution:\n", df.describe())
print("\nMissing values:\n", df.isnull().sum())

Dataset shape: (7043, 35)

Feature distribution:
         Count      Zip Code     Latitude    Longitude  Tenure Months  \
count  7043.0   7043.000000  7043.000000  7043.000000    7043.000000   
mean      1.0  93521.964646    36.282441  -119.798880      32.371149   
std       0.0   1865.794555     2.455723     2.157889      24.559481   
min       1.0  90001.000000    32.555828  -124.301372       0.000000   
25%       1.0  92102.000000    34.030915  -121.815412       9.000000   
50%       1.0  93552.000000    36.391777  -119.730885      29.000000   
75%       1.0  95351.000000    38.224869  -118.043237      55.000000   
max       1.0  96161.000000    41.962127  -114.192901      72.000000   

       Monthly Charges  Churn Value  Churn Score         CLTV  
count      7043.000000  7043.000000  7043.000000  7043.000000  
mean         64.761692     0.265370    58.699418  4400.295755  
std          30.090047     0.441561    21.525131  1183.057152  
min          18.250000     0.000000     5.000

In [6]:
print("Churn distribution:")
print(df['Churn Label'].value_counts(normalize=True))

Churn distribution:
Churn Label
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64


In [7]:
# data_preprocessing.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer
from sklearn.impute import SimpleImputer
import joblib
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class DataPreparation:
    def __init__(self):
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.power_transformer = PowerTransformer()
        self.imputer = SimpleImputer(strategy='median')
        
    def load_data(self, file_path):
        """Load and validate the dataset."""
        try:
            df = pd.read_csv(file_path)
            
            logger.info(f"Successfully loaded dataset with shape: {df.shape}")
            return df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise
            
    def validate_data(self, df):
        """Perform basic data validation checks."""
        validation_report = {
            'missing_values': df.isnull().sum().to_dict(),
            'duplicates': df.duplicated().sum(),
            'data_types': df.dtypes.to_dict()
        }
        
        # Check for invalid values in important columns
        if 'Monthly Charges' in df.columns:
            validation_report['negative_charges'] = (df['Monthly Charges'] < 0).sum()
            
        if 'Tenure Months' in df.columns:
            validation_report['invalid_tenure'] = (df['Tenure Months'] < 0).sum()
            
        logger.info("Data validation completed")
        return validation_report
    
    def engineer_basic_features(self, df):
        """Create basic derived features."""
        df = df.copy()
        # drop unecessary columns
        df = df.drop(['CustomerID',	'Count', 'Country',	'State', 'City', 'Zip Code', 'Lat Long', 'Latitude', 
                      'Longitude', 'Churn Value', 'Churn Score',	'CLTV',	'Churn Reason', 'conversation', 'customer_text'], axis=1)

        
        # Customer value features
        df['Total Charges'] = pd.to_numeric(df['Total Charges'], errors='coerce')
        df['Revenue_per_Month'] = df['Total Charges'] / df['Tenure Months']
        df['Average_Monthly_Charges'] = df['Total Charges'] / df['Tenure Months']
        df['Charges_Evolution'] = df['Monthly Charges'] - df['Average_Monthly_Charges']
        
        # Service usage features
        service_columns = ['Phone Service', 'Internet Service', 'Online Security',
                          'Online Backup', 'Device Protection', 'Tech Support',
                          'Streaming TV', 'Streaming Movies']
        
        df['Total_Services'] = df[service_columns].apply(
            lambda x: x.str.count('Yes').sum() if x.dtype == 'object' else x.sum(), axis=1
        )
        
        # Customer segments
        df['Value_Segment'] = pd.qcut(df['Monthly Charges'], q=4, 
                                    labels=['Low', 'Medium', 'High', 'Premium'])
        # ... your training code ...
        bins = pd.qcut(df['Monthly Charges'], q=4, retbins=True)[1] # Get the bin edges
        joblib.dump(bins, 'quantile_bins.pkl')  # Save the bins
        
        return df
    
    def engineer_advanced_features(self, df):
        """Create more sophisticated features."""
        df = df.copy()
        
        # Contract risk score
        contract_risk = {'Month-to-month': 3, 'One year': 2, 'Two year': 1}
        df['Contract_Risk_Score'] = df['Contract'].map(contract_risk)
        
        # Payment reliability
        payment_risk = {
            'Electronic check': 3,
            'Mailed check': 2,
            'Bank transfer (automatic)': 1,
            'Credit card (automatic)': 1
        }
        df['Payment_Risk_Score'] = df['Payment Method'].map(payment_risk)
        
        # Service dependency score
        service_weights = {
            'Phone Service': 1,
            'Internet Service': 2,
            'Online Security': 0.5,
            'Online Backup': 0.5,
            'Device Protection': 0.5,
            'Tech Support': 0.5,
            'Streaming TV': 1,
            'Streaming Movies': 1
        }
        
        df['Service_Dependency_Score'] = sum(
            (df[service] == 'Yes').astype(int) * weight
            for service, weight in service_weights.items()
        )
        
        # Loyalty-adjusted value
        df['Loyalty_Adjusted_Value'] = (
            df['Monthly Charges'] * np.log1p(df['Tenure Months'])
        )
        
        return df
    
    def encode_categorical_features(self, df):
        """Encode categorical variables with proper handling."""
        df = df.copy()
        
        # Features for label encoding
        label_encode_cols = ['Gender', 'Contract', 'Payment Method']
        
        # Features for one-hot encoding
        onehot_cols = ['Internet Service', 'Value_Segment']
        
        # Label encoding
        for col in label_encode_cols:
            if col in df.columns:
                self.label_encoders[col] = LabelEncoder()
                df[f'{col}_Encoded'] = self.label_encoders[col].fit_transform(df[col])
        
        # One-hot encoding
        df = pd.get_dummies(df, columns=onehot_cols, prefix=onehot_cols)
        
        return df
    
    def scale_numerical_features(self, df):
        """Scale numerical features with proper handling of skewness."""
        df = df.copy()
        
        # Basic numerical features
        basic_num_cols = ['Monthly Charges', 'Total Charges', 'Tenure Months']
        
        # Derived numerical features
        derived_num_cols = ['Revenue_per_Month', 'Average_Monthly_Charges',
                          'Charges_Evolution', 'Service_Dependency_Score',
                          'Loyalty_Adjusted_Value']
        
        all_num_cols = [col for col in basic_num_cols + derived_num_cols 
                       if col in df.columns]
        
        # Handle missing values
        df[all_num_cols] = self.imputer.fit_transform(df[all_num_cols])
        
        # Apply power transform for heavily skewed features
        df[all_num_cols] = self.power_transformer.fit_transform(df[all_num_cols])
        
        # Standard scaling
        df[all_num_cols] = self.scaler.fit_transform(df[all_num_cols])
        
        return df
    
    def prepare_data(self, file_path):
        """Complete data preparation pipeline."""
        # Load and validate
        df = self.load_data(file_path)
        validation_report = self.validate_data(df)
        
        if validation_report['duplicates'] > 0:
            logger.warning(f"Found {validation_report['duplicates']} duplicate rows")
            df = df.drop_duplicates()
        
        # Feature engineering
        df = self.engineer_basic_features(df)
        df = self.engineer_advanced_features(df)
        
        # Encoding and scaling
        df = self.encode_categorical_features(df)
        df = self.scale_numerical_features(df)
        
        logger.info("Data preparation completed successfully")
        return df, validation_report

if __name__ == "__main__":
    prep = DataPreparation()
    processed_df, validation_report = prep.prepare_data('./data/Telco_customer_churn_with_text.csv')
    processed_df.to_csv('./data/processed_telco_data.csv', index=False)

INFO:__main__:Successfully loaded dataset with shape: (7043, 35)
INFO:__main__:Data validation completed
INFO:__main__:Data preparation completed successfully


In [8]:
# feature_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif
import math
import os

class FeatureAnalysis:
    def __init__(self, df):
        self.df = df
        self.numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
        self.categorical_cols = df.select_dtypes(include=['object']).columns
        
    def plot_feature_distributions(self, save_path=None):
        """Plot distributions of numerical features."""
        num_features = len(self.numerical_cols)
        num_cols = 4  # Set columns to 4 for better spacing
        num_rows = math.ceil(num_features / num_cols)  # Adjust rows dynamically
        
        plt.figure(figsize=(15, 5 * num_rows))  # Adjust figure size based on rows
        for i, col in enumerate(self.numerical_cols, 1):
            plt.subplot(num_rows, num_cols, i)
            sns.histplot(data=self.df, x=col, hue='Churn Label', alpha=0.5)
            plt.title(f'{col} Distribution')
            plt.xticks(rotation=45)
        plt.tight_layout()
        
        if save_path:
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, 'feature_distributions.png'))

        plt.show()

    def plot_correlation_matrix(self, save_path=None):
        """Plot correlation matrix of numerical features."""
        corr_matrix = self.df[self.numerical_cols].corr()
        plt.figure(figsize=(12, 8))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlation Matrix')

        if save_path:
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, 'correlation_matrix.png'))

        plt.show()
        
    def plot_feature_importance(self, target_col='Churn Label', save_path=None):
        """Calculate and plot feature importance using mutual information."""
        X = self.df[self.numerical_cols]
        y = (self.df[target_col] == 'Yes').astype(int)
        
        # Calculate mutual information scores
        mi_scores = mutual_info_classif(X, y)
        importance_df = pd.DataFrame({
            'Feature': self.numerical_cols,
            'Importance': mi_scores
        }).sort_values('Importance', ascending=False)
        
        plt.figure(figsize=(10, 6))
        sns.barplot(data=importance_df, x='Importance', y='Feature')
        plt.title('Feature Importance (Mutual Information)')

        if save_path:
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, 'feature_importance.png'))
        
        return importance_df
    
    def plot_categorical_analysis(self, save_path=None):
        """Analyze categorical features relationship with churn."""
        num_features = len(self.categorical_cols)
        num_cols = 3  # Set to 3 columns for a cleaner view
        num_rows = math.ceil(num_features / num_cols)  # Dynamically adjust rows

        plt.figure(figsize=(15, 5 * num_rows))
        for i, col in enumerate(self.categorical_cols, 1):
            if col != 'Churn Label':
                plt.subplot(num_rows, num_cols, i)
                churn_props = self.df.groupby(col)['Churn Label'].value_counts(normalize=True).unstack()
                if 'Yes' in churn_props.columns:
                    churn_props['Yes'].sort_values().plot(kind='bar')
                else:
                    churn_props.plot(kind='bar', stacked=True)
                plt.title(f'Churn Rate by {col}')
                plt.xticks(rotation=45)
        plt.tight_layout()
        if save_path:
            os.makedirs(save_path, exist_ok=True)
            plt.savefig(os.path.join(save_path, 'categorical_analysis.png'))
        plt.show()
    
    def generate_feature_report(self, save_path=None):
        """Generate a comprehensive feature analysis report."""
        report = {
            'numerical_stats': self.df[self.numerical_cols].describe(),
            'categorical_stats': {
                col: self.df[col].value_counts(normalize=True)
                for col in self.categorical_cols
            },
            'missing_values': self.df.isnull().sum(),
            'correlation_analysis': self.df[self.numerical_cols].corr()
        }
        
        if save_path:
            with open(f'{save_path}/feature_report.txt', 'w') as f:
                for section, data in report.items():
                    f.write(f'\n{section.upper()}\n{"="*50}\n')
                    f.write(str(data))
                    f.write('\n\n')
                    
        return report

if __name__ == "__main__":
    # Load processed data
    df = pd.read_csv('./data/processed_telco_data.csv')
    #df = df.drop(columns=['conversation', 'customer_text'])
    
    # Create analysis object
    analyzer = FeatureAnalysis(df)
    
    # Generate visualizations
    analyzer.plot_feature_distributions('outputs')
    analyzer.plot_correlation_matrix('outputs')
    analyzer.plot_feature_importance(target_col='Churn Label', save_path='outputs')
    analyzer.plot_categorical_analysis('outputs')
    
    # Generate report
    report = analyzer.generate_feature_report('outputs')


ModuleNotFoundError: No module named 'seaborn'

In [9]:
# feature_selection.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from boruta import BorutaPy
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FeatureSelector:
    def __init__(self, df, target_col='Churn Label'):
        self.df = df.copy()
        self.target_col = target_col
        self.le = LabelEncoder()
        
        # Ensure only numerical columns are used
        self.numerical_cols = self.df.select_dtypes(include=['number']).columns.tolist()
        if self.target_col in self.numerical_cols:
            self.numerical_cols.remove(self.target_col)
    
    def prepare_target(self):
        """Prepare target variable for selection methods."""
        return self.le.fit_transform(self.df[self.target_col])
    
    def filter_method(self, k=10):
        """Select features using mutual information."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Select features
        selector = SelectKBest(score_func=mutual_info_classif, k=k)
        selector.fit(X, y)
        
        # Get selected features
        selected_features = X.columns[selector.get_support()].tolist()
        feature_scores = pd.DataFrame({
            'Feature': X.columns,
            'Score': selector.scores_
        }).sort_values('Score', ascending=False)
        
        logger.info(f"Selected {len(selected_features)} features using mutual information")
        return selected_features, feature_scores
    
    def wrapper_method(self, n_features=10):
        """Select features using Recursive Feature Elimination."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Initialize estimator
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        selector = RFE(estimator=estimator, n_features_to_select=n_features)
        
        # Fit selector
        selector.fit(X, y)
        
        # Get selected features
        selected_features = X.columns[selector.support_].tolist()
        feature_ranks = pd.DataFrame({
            'Feature': X.columns,
            'Rank': selector.ranking_
        }).sort_values('Rank')
        
        logger.info(f"Selected {len(selected_features)} features using RFE")
        return selected_features, feature_ranks
    
    def boruta_selection(self):
        """Select features using Boruta algorithm."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Initialize Random Forest classifier
        rf = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=42)
        
        # Initialize Boruta
        boruta = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
        
        # Fit Boruta
        boruta.fit(X.values, y)
        
        # Get selected features
        selected_features = X.columns[boruta.support_].tolist()
        feature_ranks = pd.DataFrame({
            'Feature': X.columns,
            'Boruta_Ranking': boruta.ranking_
        }).sort_values('Boruta_Ranking')
        
        logger.info(f"Selected {len(selected_features)} features using Boruta")
        return selected_features, feature_ranks

if __name__ == "__main__":
    df = pd.read_csv('./data/processed_telco_data.csv')
    selector = FeatureSelector(df)
    
    selected_kbest, kbest_scores = selector.filter_method(k=10)
    selected_rfe, rfe_ranks = selector.wrapper_method(n_features=10)
    selected_boruta, boruta_ranks = selector.boruta_selection()
    # store features
    joblib.dump(selected_boruta, "baseline_boruta_features.pkl")
    
    print("Top Features from SelectKBest:", selected_kbest)
    print("Top Features from RFE:", selected_rfe)
    print("Top Features from Boruta:", selected_boruta)


ModuleNotFoundError: No module named 'boruta'

In [10]:
# baseline_model.py

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging
from feature_selection import FeatureSelector

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load Data
df = pd.read_csv("./data/processed_telco_data.csv")

# Feature Selection
selector = FeatureSelector(df)
selected_features =  joblib.load("baseline_boruta_features.pkl")  # Use boruta

# Prepare Data
X = df[selected_features]
y = df["Churn Label"].apply(lambda x: 1 if x == "Yes" else 0)  # Convert target to binary

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save Scaler
joblib.dump(scaler, "scaler.pkl")

# Define Model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best Model
best_model = grid_search.best_estimator_
joblib.dump(best_model, "baseline_churn_model.pkl")

# Evaluate Model
y_pred = best_model.predict(X_test_scaled)
logger.info(f"Accuracy: {accuracy_score(y_test, y_pred)}")
logger.info(f"Classification Report:\n {classification_report(y_test, y_pred)}")


ModuleNotFoundError: No module named 'feature_selection'

In [11]:
# prediction_pipeline.py

import pandas as pd
import joblib
from feature_selection import FeatureSelector # Make sure this is importable

def predict_churn(data, model_path, scaler_path):
    """Predicts churn from a JSON-like data structure."""
    try:
        # 1. Convert JSON-like data to DataFrame
        try:  # Handle cases where data might already be a dataframe
            new_df = pd.DataFrame(data)
        except TypeError: # Data is probably a list of dictionaries.
            new_df = pd.DataFrame([data]) if isinstance(data, dict) else pd.DataFrame(data)
        
        # preprocess data
        prep = DataPreparation()
        new_df, validation_report = prep.prepare_data(new_df)

        # 2. Feature Selection (Crucial for consistency)
        selector = FeatureSelector(new_df)
        selected_features, _ = selector.boruta_selection()
        X_new = new_df[selected_features]

        # 3. Handle Missing Columns (Robustness)
        missing_cols = set(X_new.columns) - set(selected_features)
        if missing_cols:
            raise ValueError(f"Missing columns in input data: {missing_cols}")

        # 4. Scale Data
        scaler = joblib.load(scaler_path)
        X_new_scaled = scaler.transform(X_new)

        # 5. Load Model
        model = joblib.load(model_path)

        # 6. Make Predictions
        y_prob = model.predict_proba(X_new_scaled)[:, 1]
        y_pred = (y_prob >= 0.5).astype(int)  # Or adjust threshold

        # 7. Create Response (Dictionary for JSON)
        predictions = {
            "churn_probability": y_prob.tolist(),  # Convert to list for JSON
            "churn_prediction": y_pred.tolist()  # Convert to list for JSON
        }

        # Include original data if needed for debugging/analysis
        # predictions["input_data"] = data  # Convert to list for JSON
        return predictions

    except FileNotFoundError:
        return {"error": "Model or scaler file not found."}
    except ValueError as e:
        return {"error": str(e)}
    except Exception as e:
        return {"error": f"An unexpected error occurred: {str(e)}"}



ModuleNotFoundError: No module named 'feature_selection'

In [12]:
# FastAPI app (main.py)
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel, Field
from typing import List, Dict, Union
# Import predict_churn 
from prediction_pipeline import predict_churn
app = FastAPI()

# Define input data model (Pydantic) for validation
class CustomerData(BaseModel):
    # Define your expected features here:

    Gender: str = None  # Make all fields optional, or provide default values
    SeniorCitizen: str = Field(None, alias="Senior Citizen")
    Partner: str = None
    Dependents: str = None
    TenureMonths: float = Field(None, alias="Tenure Months")
    PhoneService: str = Field(None, alias="Phone Service")
    MultipleLines: str = Field(None, alias="Multiple Lines")
    InternetService: str = Field(None, alias="Internet Service")
    OnlineSecurity: str = Field(None, alias="Online Security")
    OnlineBackup: str = Field(None, alias="Online Backup")
    DeviceProtection: str = Field(None, alias="Device Protection")
    TechSupport: str = Field(None, alias="Tech Support")
    StreamingTV: str = Field(None, alias="Streaming TV")
    StreamingMovies: str = Field(None, alias="Streaming Movies")
    Contract: str = None
    PaperlessBilling: str = Field(None, alias="Paperless Billing")
    PaymentMethod: str = Field(None, alias="Payment Method")
    MonthlyCharges: float = Field(None, alias="Monthly Charges")
    TotalCharges: str = Field(None, alias="Total Charges")
class Config:
        allow_population_by_field_name = True
        
@app.post("/predict")
async def predict_endpoint(request: Request, data: Union[CustomerData, List[CustomerData], Dict, List[Dict]]): 
    print(data)  # Print the data
    import logging
    logging.info(f"Received data: {data}") # Log the data
    # Accept single or batch
    """Endpoint for making churn predictions."""

    model_path = "../models/baseline_churn_model.pkl"
    scaler_path = "../models/scaler.pkl"

    try:
        if isinstance(data, CustomerData):  # Single prediction
            data = data.dict()
        elif isinstance(data, list) and all(isinstance(item, CustomerData) for item in data): # Batch prediction of validated data
            data = [item.dict() for item in data]
        elif isinstance(data, dict): # Single prediction with a dictionary
            data = data
        elif isinstance(data, list) and all(isinstance(item, dict) for item in data): # Batch prediction with list of dictionaries
            data = data
        else:
            raise ValueError("Invalid data format. Please provide a dictionary or a list of dictionaries or Pydantic model instances.")

        predictions = predict_churn(data, model_path, scaler_path)
        return predictions

    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))  # Bad Request
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")  # Internal Server Error

ModuleNotFoundError: No module named 'fastapi'

In [None]:
# sentiment_extractor.py
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
from dotenv import load_dotenv


# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# Load the CSV file
df = pd.read_csv("./data/Telco_customer_churn_with_text.csv")  # Replace "your_file.csv" with the actual file name

def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=-1)
    predicted_class_id = logits.argmax().item()
    return model.config.id2label[predicted_class_id]

# Apply the sentiment analysis function to the 'customer_text' column
df['customer_sentiment'] = df['customer_text'].apply(get_sentiment)

# Save the updated DataFrame to a new CSV file (optional)
df.to_csv("./data/Telco_customer_churn_with_sentiment.csv", index=False)  # Replace with your desired file name

print(df.head())

DEBUG:filelock:Attempting to acquire lock 2056144632528 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\3ed34255a7cb8e6706a8bb21993836e99e7b959f.lock
DEBUG:filelock:Lock 2056144632528 acquired on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\3ed34255a7cb8e6706a8bb21993836e99e7b959f.lock


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
DEBUG:filelock:Attempting to release lock 2056144632528 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\3ed34255a7cb8e6706a8bb21993836e99e7b959f.lock
DEBUG:filelock:Lock 2056144632528 released on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\3ed34255a7cb8e6706a8bb21993836e99e7b959f.lock
DEBUG:filelock:Attempting to acquire lock 2056164699344 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\b57fe5dfcb8ec3f9bab35ed427c3434e3c7dd1ba.lock
DEBUG:filelock:Lock 2056164699344 acquired on C:\Users\Nitro5\.cache\huggingface\hub\.locks\mod

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 2056164699344 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\b57fe5dfcb8ec3f9bab35ed427c3434e3c7dd1ba.lock
DEBUG:filelock:Lock 2056164699344 released on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\b57fe5dfcb8ec3f9bab35ed427c3434e3c7dd1ba.lock
DEBUG:filelock:Attempting to acquire lock 2056173369488 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
DEBUG:filelock:Lock 2056173369488 acquired on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 2056173369488 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
DEBUG:filelock:Lock 2056173369488 released on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
DEBUG:filelock:Attempting to acquire lock 2056217435536 on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\7c3919835e442510166d267fe7cbe847e0c51cd26d9ba07b89a57b952b49b8aa.lock
DEBUG:filelock:Lock 2056217435536 acquired on C:\Users\Nitro5\.cache\huggingface\hub\.locks\models--distilbert--distilbert-base-uncased-finetuned-sst-2-english\7c3919835e442510166d267fe7cbe847e0c51cd26d9ba07b89a57b952b49b8aa.lock


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# text_features_extractor

from transformers import pipeline
import numpy as np
import pandas as pd
from typing import List, Union
import os

# Load the .env file
# load_dotenv()
# hf_token = os.getenv("HF_TOKEN")


class TextFeatureExtractor:
    def __init__(self, model_name: str = "bert-base-uncased" ): #api_key: str = hf_token
        self.feature_extractor = pipeline(
            "feature-extraction",
            framework="pt",
            model=model_name,
            # api_key=api_key
        )
        
    def get_features(self, texts: Union[str, List[str]], return_df: bool = True) -> Union[np.ndarray, pd.DataFrame]:
        """
        Extract features from one or more texts.
        
        Args:
            texts: Single text string or list of texts
            return_df: If True, returns pandas DataFrame; if False, returns numpy array
            
        Returns:
            Features as either numpy array or pandas DataFrame
        """
        # Convert single text to list
        if isinstance(texts, str):
            texts = [texts]
            
        # Extract features for all texts
        all_features = []
        for text in texts:
            # Extract features using your working approach
            features = self.feature_extractor(text, return_tensors="pt")[0]
            # Convert to numpy and take mean across tokens
            reduced_features = features.numpy().mean(axis=0)
            all_features.append(reduced_features)
            
        # Stack all features into a single array
        feature_array = np.stack(all_features)
            
        if return_df:
            # Convert to DataFrame with feature column names
            feature_columns = [f'feature_{i}' for i in range(feature_array.shape[1])]
            return pd.DataFrame(feature_array, columns=feature_columns)
            
        return feature_array

def process_csv_file(input_file: str, output_file: str):
    """
    Process a CSV file by extracting features from the customer_text column
    and saving results to a new CSV file.
    
    Args:
        input_file: Path to input CSV file
        output_file: Path to output CSV file
    """
    try:
        # Read the CSV file
        print(f"Reading input file: {input_file}")
        df = pd.read_csv(input_file)
        
        # Verify customer_text column exists
        if 'customer_text' not in df.columns:
            raise ValueError("customer_text column not found in CSV file")
        
        # Initialize feature extractor
        print("Initializing feature extractor...")
        extractor = TextFeatureExtractor()
        
        # Extract features from customer_text column
        print("Extracting features...")
        features_df = extractor.get_features(df['customer_text'].tolist())
        
        # Combine original data with features
        print("Combining features with original data...")
        result_df = pd.concat([df, features_df], axis=1)
        
        # Save to new CSV file
        print(f"Saving results to: {output_file}")
        result_df.to_csv(output_file, index=False)
        
        print("Processing completed successfully!")
        print(f"Number of rows processed: {len(df)}")
        print(f"Number of features extracted: {len(features_df.columns)}")
        
    except Exception as e:
        print(f"Error processing file: {str(e)}")

if __name__ == "__main__":
    # Example usage
    input_file = "./data/Telco_customer_churn_with_sentiment.csv"  # Your input CSV file
    output_file = "./data/Telco_customer_churn_with_features.csv"  # Output file name
    
    process_csv_file(input_file, output_file)

In [None]:
# textual_features.py
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from typing import List, Union
import os

# Load the .env file
load_dotenv()

class TextFeatureExtractor:
    def __init__(self, model_name: str = 'all-mpnet-base-v2'):
        """
        Initialize the feature extractor with a sentence transformer model.
        """
        self.model = SentenceTransformer(model_name)
        
    def get_features(self, texts: Union[str, List[str]], 
                    batch_size: int = 32,
                    return_df: bool = True) -> Union[np.ndarray, pd.DataFrame]:
        # Convert single text to list
        if isinstance(texts, str):
            texts = [texts]
        
        # Extract embeddings with batching
        feature_array = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True
        )
        
        if return_df:
            # Convert to DataFrame with feature column names
            feature_columns = [f'feature_{i}' for i in range(feature_array.shape[1])]
            return pd.DataFrame(feature_array, columns=feature_columns)
        
        return feature_array

def process_csv_file(input_file: str, output_file: str, batch_size: int = 32):
    try:
        # Read the CSV file
        print(f"Reading input file: {input_file}")
        df = pd.read_csv(input_file)
        
        # Verify customer_text column exists
        if 'customer_text' not in df.columns:
            raise ValueError("customer_text column not found in CSV file")
        
        # Initialize feature extractor
        print("Initializing feature extractor...")
        extractor = TextFeatureExtractor()
        
        # Extract features from customer_text column
        print("Extracting features...")
        features_df = extractor.get_features(
            df['customer_text'].tolist(),
            batch_size=batch_size
        )
        
        # Combine original data with features
        print("Combining features with original data...")
        result_df = pd.concat([df, features_df], axis=1)
        
        # Save to new CSV file
        print(f"Saving results to: {output_file}")
        result_df.to_csv(output_file, index=False)
        
        print("Processing completed successfully!")
        print(f"Number of rows processed: {len(df)}")
        print(f"Number of features extracted: {len(features_df.columns)}")
        
    except Exception as e:
        print(f"Error processing file: {str(e)}")

if __name__ == "__main__":
    input_file = "./data/Telco_customer_churn_with_sentiment.csv"
    output_file = "./data/Telco_customer_churn_with_features_.csv"
    
    process_csv_file(input_file, output_file)

In [None]:
# Example

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')
sentence = ["My internet service is so poor these days!"]
embeddings = model.encode(sentence)
print(embeddings)

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load your data
df = pd.read_csv('./data/Telco_customer_churn_with_features.csv')

# Extract embedding features
embedding_features = df.loc[:, 'feature_0':'feature_767'].values

# Apply PCA for dimensionality reduction
pca = PCA(n_components=10)  # Retain 95% of variance
reduced_features = pca.fit_transform(embedding_features)

# Label Encode 'customer_sentiment'
label_encoder = LabelEncoder()
df['sentiment_encoded'] = label_encoder.fit_transform(df['customer_sentiment'])

# Combine reduced features and encoded sentiment
combined_features = np.hstack((reduced_features, df[['sentiment_encoded']].values))

# Create a DataFrame with column names
# Generate names for reduced features
reduced_feature_names = [f'pca_{i}' for i in range(reduced_features.shape[1])]

# Combine all column names
column_names = reduced_feature_names + ['sentiment_encoded']

# Create the DataFrame
processed_df = pd.DataFrame(combined_features, columns=column_names)

# Display the first few rows
print(processed_df.head())


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

df['sentiment_encoded'] = label_encoder.fit_transform(df['customer_sentiment'])
text_features = pd.concat([ df.loc[:, 'feature_0':'feature_767'], df['sentiment_encoded'] ], axis=1)
pre_processed_df = pd.read_csv('./data/processed_telco_data.csv')
concatenated_df = pd.concat([pre_processed_df, processed_df.reset_index(drop=True)], axis=1)
concatenated_df.to_csv("./data/model_data.csv")
concatenated_df.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,...,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,sentiment_encoded
0,Male,No,No,No,-1.495444,Yes,No,Yes,Yes,No,...,2.607593,-0.021859,-0.055595,-0.023386,-0.181521,0.178755,-0.031445,0.0022,0.002916,0.0
1,Female,No,No,Yes,-1.495444,Yes,No,No,No,No,...,2.670311,-0.065715,-0.013699,-0.185605,0.117329,0.19571,0.159138,0.102147,0.059345,0.0
2,Female,No,No,Yes,-0.926287,Yes,Yes,No,No,Yes,...,2.564315,-0.045868,-0.006414,0.327256,-0.150967,-0.013058,-0.016928,-0.087543,0.042756,0.0
3,Female,No,Yes,Yes,0.080538,Yes,Yes,No,No,Yes,...,-2.869206,1.232879,0.221853,-0.000614,0.002044,0.000956,-0.000875,0.000321,-0.000104,0.0
4,Male,No,No,Yes,0.761599,Yes,Yes,No,Yes,Yes,...,-2.869206,1.232879,0.221853,-0.000614,0.002044,0.000956,-0.000875,0.000321,-0.000104,0.0


In [None]:
# feature_selection.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from boruta import BorutaPy
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FeatureSelector:
    def __init__(self, df, target_col='Churn Label'):
        self.df = df.copy()
        self.target_col = target_col
        self.le = LabelEncoder()
        
        # Ensure only numerical columns are used
        self.numerical_cols = self.df.select_dtypes(include=['number']).columns.tolist()
        if self.target_col in self.numerical_cols:
            self.numerical_cols.remove(self.target_col)
    
    def prepare_target(self):
        """Prepare target variable for selection methods."""
        return self.le.fit_transform(self.df[self.target_col])
    
    def filter_method(self, k=10):
        """Select features using mutual information."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Select features
        selector = SelectKBest(score_func=mutual_info_classif, k=k)
        selector.fit(X, y)
        
        # Get selected features
        selected_features = X.columns[selector.get_support()].tolist()
        feature_scores = pd.DataFrame({
            'Feature': X.columns,
            'Score': selector.scores_
        }).sort_values('Score', ascending=False)
        
        logger.info(f"Selected {len(selected_features)} features using mutual information")
        return selected_features, feature_scores
    
    def wrapper_method(self, n_features=10):
        """Select features using Recursive Feature Elimination."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Initialize estimator
        estimator = RandomForestClassifier(n_estimators=100, random_state=42)
        selector = RFE(estimator=estimator, n_features_to_select=n_features)
        
        # Fit selector
        selector.fit(X, y)
        
        # Get selected features
        selected_features = X.columns[selector.support_].tolist()
        feature_ranks = pd.DataFrame({
            'Feature': X.columns,
            'Rank': selector.ranking_
        }).sort_values('Rank')
        
        logger.info(f"Selected {len(selected_features)} features using RFE")
        return selected_features, feature_ranks
    
    def boruta_selection(self):
        """Select features using Boruta algorithm."""
        y = self.prepare_target()
        X = self.df[self.numerical_cols]
        
        # Initialize Random Forest classifier
        rf = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=42)
        
        # Initialize Boruta
        boruta = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
        
        # Fit Boruta
        boruta.fit(X.values, y)
        
        # Get selected features
        selected_features = X.columns[boruta.support_].tolist()
        feature_ranks = pd.DataFrame({
            'Feature': X.columns,
            'Boruta_Ranking': boruta.ranking_
        }).sort_values('Boruta_Ranking')
        
        logger.info(f"Selected {len(selected_features)} features using Boruta")
        return selected_features, feature_ranks

if __name__ == "__main__":
    df = concatenated_df
    selector = FeatureSelector(df)
    
    selected_kbest, kbest_scores = selector.filter_method(k=10)
    selected_rfe, rfe_ranks = selector.wrapper_method(n_features=10)
    selected_boruta, boruta_ranks = selector.boruta_selection()

    joblib.dump(selected_boruta, "boruta_features.pkl")
    
    print("Top Features from SelectKBest:", selected_kbest)
    print("Top Features from RFE:", selected_rfe)
    print("Top Features from Boruta:", selected_boruta)


INFO:__main__:Selected 10 features using mutual information
INFO:__main__:Selected 10 features using RFE


Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	25
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	23
Tentative: 	2
Rejected: 	0
Iteration: 	9 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	10 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	11 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	12 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	13 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	14 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	15 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Iteration: 	16 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
I

INFO:__main__:Selected 23 features using Boruta


Iteration: 	99 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1


BorutaPy finished running.

Iteration: 	100 / 100
Confirmed: 	23
Tentative: 	1
Rejected: 	1
Top Features from SelectKBest: ['Contract_Risk_Score', 'Contract_Encoded', 'pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7']
Top Features from RFE: ['Tenure Months', 'Monthly Charges', 'Total Charges', 'Revenue_per_Month', 'Average_Monthly_Charges', 'Charges_Evolution', 'Contract_Risk_Score', 'Loyalty_Adjusted_Value', 'Contract_Encoded', 'pca_0']
Top Features from Boruta: ['Tenure Months', 'Monthly Charges', 'Total Charges', 'Revenue_per_Month', 'Average_Monthly_Charges', 'Charges_Evolution', 'Total_Services', 'Contract_Risk_Score', 'Payment_Risk_Score', 'Service_Dependency_Score', 'Loyalty_Adjusted_Value', 'Contract_Encoded', 'Payment Method_Encoded', 'pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9']


In [None]:
# churn_model.py

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import logging
from feature_selection import FeatureSelector

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load Data
df = concatenated_df

# Feature Selection

selected_features = joblib.load('boruta_features.pkl')

# Prepare Data
X = df[selected_features]
y = df["Churn Label"].apply(lambda x: 1 if x == "Yes" else 0)  # Convert target to binary

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save Scaler
joblib.dump(scaler, "churn_scaler.pkl")

# Define Model
rf = RandomForestClassifier(random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best Model
best_model = grid_search.best_estimator_
joblib.dump(best_model, "./models/churn_model.pkl")

# Evaluate Model
y_pred = best_model.predict(X_test_scaled)
logger.info(f"Accuracy: {accuracy_score(y_test, y_pred)}")
logger.info(f"Classification Report:\n {classification_report(y_test, y_pred)}")


INFO:__main__:Accuracy: 0.7920511000709723
INFO:__main__:Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.51      0.56       374

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [None]:
# FastAPI app (main.py)
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel, Field
from typing import List, Dict, Union
# Import predict_churn 
from prediction_pipeline import predict_churn
app = FastAPI()

# Define input data model (Pydantic) for validation
class CustomerData(BaseModel):
    # Define your expected features here:

    Gender: str = None  # Make all fields optional, or provide default values
    SeniorCitizen: str = Field(None, alias="Senior Citizen")
    Partner: str = None
    Dependents: str = None
    TenureMonths: float = Field(None, alias="Tenure Months")
    PhoneService: str = Field(None, alias="Phone Service")
    MultipleLines: str = Field(None, alias="Multiple Lines")
    InternetService: str = Field(None, alias="Internet Service")
    OnlineSecurity: str = Field(None, alias="Online Security")
    OnlineBackup: str = Field(None, alias="Online Backup")
    DeviceProtection: str = Field(None, alias="Device Protection")
    TechSupport: str = Field(None, alias="Tech Support")
    StreamingTV: str = Field(None, alias="Streaming TV")
    StreamingMovies: str = Field(None, alias="Streaming Movies")
    Contract: str = None
    PaperlessBilling: str = Field(None, alias="Paperless Billing")
    PaymentMethod: str = Field(None, alias="Payment Method")
    MonthlyCharges: float = Field(None, alias="Monthly Charges")
    TotalCharges: str = Field(None, alias="Total Charges")
class Config:
        allow_population_by_field_name = True
        
@app.post("/predict")
async def predict_endpoint(request: Request, data: Union[CustomerData, List[CustomerData], Dict, List[Dict]]): 
    print(data)  # Print the data
    import logging
    logging.info(f"Received data: {data}") # Log the data
    # Accept single or batch
    """Endpoint for making churn predictions."""

    model_path = "churn_model.pkl"
    scaler_path = "churn_scaler.pkl"

    try:
        if isinstance(data, CustomerData):  # Single prediction
            data = data.dict()
        elif isinstance(data, list) and all(isinstance(item, CustomerData) for item in data): # Batch prediction of validated data
            data = [item.dict() for item in data]
        elif isinstance(data, dict): # Single prediction with a dictionary
            data = data
        elif isinstance(data, list) and all(isinstance(item, dict) for item in data): # Batch prediction with list of dictionaries
            data = data
        else:
            raise ValueError("Invalid data format. Please provide a dictionary or a list of dictionaries or Pydantic model instances.")

        predictions = predict_churn(data, model_path, scaler_path)
        return predictions

    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))  # Bad Request
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")  # Internal Server Error

In [62]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

def extract_sentiment(text_column, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
    """
    Extracts sentiment from a text column and returns the updated DataFrame.
    
    Args:
        text_column (pd.Series): Column containing text data
        model_name (str): Hugging Face model name for sentiment analysis
        
    Returns:
        pd.Series: Series containing sentiment labels
    """
    # Load model and tokenizer once
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    # Function to get sentiment for a single text
    def get_sentiment(text):
        # Handle NaN values gracefully
        if pd.isna(text) or text.strip() == "":
            return "Neutral"
        
        # Tokenize and get model outputs
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get predicted label
        logits = outputs.logits
        predicted_class_id = logits.argmax().item()
        return model.config.id2label[predicted_class_id]
    
    # Apply sentiment extraction to the text column
    return text_column.apply(get_sentiment)


In [63]:
df = pd.read_csv("./data/Telco_customer_churn_with_text.csv")
df['customer_sentiment'] = extract_sentiment(df['customer_text'])

In [64]:
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason,conversation,customer_text,customer_sentiment
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,53.85,108.15,Yes,1,86,3239,Competitor made better offer,Customer: I have a question about my services....,I have a question about my services. I'm a new...,NEGATIVE
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,70.7,151.65,Yes,1,67,2701,Moved,Customer: I have a question about my services....,I have a question about my services. I'm a new...,NEGATIVE
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,99.65,820.5,Yes,1,86,5372,Moved,Customer: I have a question about my services....,I have a question about my services. I'm a new...,NEGATIVE
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,104.8,3046.05,Yes,1,84,5003,Moved,Customer: I need help with my Fiber optic serv...,I need help with my Fiber optic service setup....,NEGATIVE
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,103.7,5036.3,Yes,1,89,5340,Competitor had better devices,Customer: I need help with my Fiber optic serv...,I need help with my Fiber optic service setup....,NEGATIVE


In [20]:
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.decomposition import PCA

def extract_and_reduce_features(text_column, model_name="bert-base-uncased", n_components=10):
    """
    Extracts text features using Hugging Face model, performs PCA to reduce dimensions,
    and returns a DataFrame with the reduced features.
    
    Args:
        text_column (pd.Series): Column containing text data
        model_name (str): Hugging Face model name for feature extraction
        n_components (int): Number of PCA components
        
    Returns:
        pd.DataFrame: Original DataFrame with new feature columns added
    """
    # Load feature extraction pipeline once
    feature_extractor = pipeline(
        "feature-extraction",
        model=model_name,
        framework="pt"
    )
    
    # Function to extract features for one text
    def get_features(text):
        # Handle NaN values gracefully
        if pd.isna(text) or text.strip() == "":
            return np.zeros((768,))  # Return zero vector for empty text
        
        # Extract features and take mean across tokens
        features = feature_extractor(text, return_tensors="pt")[0]
        reduced_features = features.numpy().mean(axis=0)
        return reduced_features
    
    # Apply feature extraction to the text column
    feature_matrix = np.stack(text_column.apply(get_features))
    
    # Perform PCA to reduce the dimensionality to n_components
    pca = PCA(n_components=1)
    reduced_features = pca.fit_transform(feature_matrix)
    
    # Create DataFrame for the reduced features
    feature_columns = [f'pca_feature_{i}' for i in range(n_components)]
    feature_df = pd.DataFrame(reduced_features, columns=feature_columns)
    
    return feature_df


In [16]:
customer_data = {
    'customer_text': [
        "My internet keeps disconnecting every 30 minutes. Very frustrating!"
        # "The service is excellent, never had any issues with connectivity.",
        # "WiFi signal is weak in my bedroom, need a solution urgently.",
        # "Installation was quick and the internet speed is great.",
        # "Cannot connect to the network during peak hours.",
        # "Best internet service I've had in years. Streaming works perfectly.",
        # "Router keeps restarting on its own. Please help!",
        # "Download speeds are much slower than what I'm paying for.",
        # "Great customer service, they fixed my connection issue quickly.",
        # "Having trouble connecting multiple devices at once."
    ]
}

# Create DataFrame
df = pd.DataFrame(customer_data)


In [21]:
# Step 1: Extract features and reduce them to 10 components
reduced_features_df = extract_and_reduce_features(df['customer_text'])

# Step 2: Concatenate the reduced features with the original DataFrame
df = pd.concat([df, reduced_features_df], axis=1)

# Step 3: Display the first few rows to check the result
print(df.head())


Device set to use cpu
  explained_variance_ = (S**2) / (n_samples - 1)


ValueError: Shape of passed values is (1, 1), indices imply (1, 10)

In [22]:
from transformers import pipeline
import numpy as np
import pandas as pd
from typing import List, Union
from sklearn.random_projection import GaussianRandomProjection

class TextFeatureExtractor:
    def __init__(self, 
                 model_name: str = "bert-base-uncased", 
                 api_key: str = 'hf_qcFlKgxJRkwTpPggNQnwTLLPpXV',
                 n_components: int = 10):
        self.feature_extractor = pipeline(
            "feature-extraction",
            framework="pt",
            model=model_name,
            api_key=api_key
        )
        # Initialize random projection
        self.random_projection = GaussianRandomProjection(n_components=n_components, random_state=42)
        
    def get_features(self, text: str, return_df: bool = True) -> Union[np.ndarray, pd.DataFrame]:
        """
        Extract features from text and reduce dimensions using Random Projection.
        
        Args:
            text: Input text string
            return_df: If True, returns pandas DataFrame; if False, returns numpy array
            
        Returns:
            Reduced features as either numpy array or pandas DataFrame
        """
        # Extract features
        features = self.feature_extractor(text, return_tensors="pt")[0]
        reduced_features = features.numpy().mean(axis=0)
        
        # Reshape for random projection (needs 2D array)
        features_2d = reduced_features.reshape(1, -1)
        
        # Apply random projection
        projected_features = self.random_projection.fit_transform(features_2d)
        
        if return_df:
            feature_columns = [f'projected_feature_{i+1}' for i in range(projected_features.shape[1])]
            return pd.DataFrame(projected_features, columns=feature_columns)
        
        return projected_features

# Example usage
if __name__ == "__main__":
    # Initialize extractor
    extractor = TextFeatureExtractor(n_components=10)
    
    # Single text example
    text = "My internet keeps going down. What's the problem!?"
    
    # Get reduced features
    features_df = extractor.get_features(text)
    print("\nFeature DataFrame shape:", features_df.shape)
    print("\nReduced features:")
    print(features_df)

Device set to use cpu



Feature DataFrame shape: (1, 10)

Reduced features:
   projected_feature_1  ...  projected_feature_10
0             0.040793  ...             -0.734421

[1 rows x 10 columns]


In [1]:
data = {
  "Gender": "Female",
        "Senior Citizen": "No",
        "Partner": "Yes",
        "Dependents": "No",
        "Tenure Months": 24,
        "Phone Service": "Yes",
        "Multiple Lines": "No",
        "Internet Service": "DSL",
        "Online Security": "Yes",
        "Online Backup": "No",
        "Device Protection": "Yes",
        "Tech Support": "No",
        "Streaming TV": "Yes",
        "Streaming Movies": "No",
        "Contract": "Month-to-month",
        "Paperless Billing": "Yes",
        "Payment Method": "Electronic check",
        "Monthly Charges": 65.6,
        "Total Charges": 1576.45,
  "customer_text": "My internet is always slow. What's happening with your service?"
}


In [2]:
import pandas as pd
from data_preparation3 import DataPreparation
prep = DataPreparation()
new_df = pd.DataFrame([data], index=[0]) # Add an index!
new_df, validation_report = prep.prepare_data(new_df)

  from .autonotebook import tqdm as notebook_tqdm
INFO:data_preparation3:Successfully loaded dataset with shape: (1, 20)
INFO:data_preparation3:Data validation completed
Device set to use cpu
INFO:data_preparation3:Data preparation completed successfully


In [3]:
new_df

Unnamed: 0,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,...,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9
0,Female,No,Yes,No,0.0,Yes,No,Yes,No,Yes,...,-0.243202,-0.283105,-2.904098,4.149575,-5.649469,-1.911078,-0.257134,-1.217405,1.038977,-0.025525


In [15]:
# Run the entire data preparation process
from data_preparation import DataPreparation
prep = DataPreparation()

df = pd.read_csv("./data/Telco_customer_churn_with_text.csv")
new_df, validation_report = prep.prepare_data(df)

INFO:data_preparation:Successfully loaded dataset with shape: (7043, 35)
INFO:data_preparation:Data validation completed
Device set to use cpu
INFO:data_preparation:Data preparation completed successfully
