# Import

In [30]:
import pandas as pd
import os

In [31]:
# Locate the project's data folder relative to the kernel's working directory.
# os.getcwd() is the working directory, not the notebook file's own location,
# so this presumably expects the kernel to start in a folder that sits next to
# `data/` -- TODO confirm.
current_dir = os.getcwd()

project_root = os.path.dirname(current_dir)

data_dir = os.path.join(project_root, 'data')
customers_file = os.path.join(data_dir, 'customers_raw.csv')
complaints_file = os.path.join(data_dir, 'complaints_raw.csv')

# Load the two raw inputs used by the rest of the notebook.
customers_df = pd.read_csv(customers_file)
complaints_df = pd.read_csv(complaints_file)

# Clean

In [32]:
# Inspect unique values and missing values for each column (disabled)

# for col in customers_df.columns:
#     print('----\n', customers_df[col].value_counts(), '\n\n', customers_df[col].isna().sum())

In [33]:
# New customers have positive monthly charges but no total charges: the raw
# field holds a single space. Use their monthly charge as the total instead.
blank_total_mask = customers_df['TotalCharges'].eq(' ')
customers_df.loc[blank_total_mask, 'TotalCharges'] = customers_df.loc[blank_total_mask, 'MonthlyCharges']

In [34]:
## Customer's data

# Column names and variable types
customers_df = customers_df.rename(
    columns={'customerID': 'CustomerId', 'tenure': 'Tenure', 'gender': 'Gender'}
)

# Plain casts: column -> target dtype.
customer_dtypes = {
    'CustomerId': str,
    'Gender': str,
    'SeniorCitizen': int,
    'Tenure': int,
    'MultipleLines': 'category',
    'InternetService': 'category',
    'OnlineSecurity': 'category',
    'OnlineBackup': 'category',
    'DeviceProtection': 'category',
    'TechSupport': 'category',
    'StreamingTV': 'category',
    'StreamingMovies': 'category',
    'Contract': 'category',
    'PaymentMethod': 'category',
    'MonthlyCharges': float,
    'TotalCharges': float,
}
for column_name, target_dtype in customer_dtypes.items():
    customers_df[column_name] = customers_df[column_name].astype(target_dtype)

# Yes/No flags become 0/1; any value other than 'No'/'Yes' maps to NaN.
# Caution: re-running this cell on already-converted data stringifies the
# 0/1 values to '0'/'1', which are not in the mapping, so every flag turns NaN.
# NOTE(review): only the columns listed here are converted; any other Yes/No
# column in the raw file (e.g. a 'Partner' column, if present) stays as text
# -- confirm that is intended.
yes_no_codes = {'No': 0, 'Yes': 1}
for column_name in ('Dependents', 'PhoneService', 'PaperlessBilling', 'Churn'):
    customers_df[column_name] = customers_df[column_name].astype(str).map(yes_no_codes)

In [35]:
## Complaints data

# Column names and variable types
complaints_df = complaints_df.rename(
    columns={'customerID': 'CustomerId', 'complaint': 'Complaint', 'complaint_number': 'ComplaintNumber'}
)
complaints_df['CustomerId'] = complaints_df['CustomerId'].astype(str)

# Cast complaint text to str but keep missing values missing. A plain
# astype(str) turns NaN into the literal text 'nan', which the fillna('') and
# dropna() calls in the feature-engineering cell can no longer detect, so
# 'nan' would be processed as if it were a real complaint.
complaint_text = complaints_df['Complaint']
complaints_df['Complaint'] = complaint_text.astype(str).where(complaint_text.notna())

complaints_df['ComplaintNumber'] = complaints_df['ComplaintNumber'].astype(int)

# Feature engineering

In [36]:
## Customer df

# Billed and expected totals are compared with a tolerance instead of exact
# float equality: Tenure * MonthlyCharges can differ from the parsed
# TotalCharges by floating-point rounding alone. Half a cent separates that
# noise from a real difference (assumes both charge columns carry at most two
# decimals -- TODO confirm).
CHARGE_TOLERANCE = 0.005

# What the customer would have paid in total had the current monthly charge
# applied for the whole tenure.
expected_total = customers_df['Tenure'] * customers_df['MonthlyCharges']
charge_gap = customers_df['TotalCharges'] - expected_total
within_tolerance = charge_gap.abs() <= CHARGE_TOLERANCE

# NOTE(review): rows with Tenure == 0 have expected_total == 0, so if their
# TotalCharges was back-filled with MonthlyCharges in the cleaning step they
# are flagged as ChangedPlan / ChangedPlanNegative -- confirm that is intended.

# 1. ChangedPlan
# A missing TotalCharges is not "within tolerance", so it is still flagged
# 'Yes', exactly as the previous `!=` comparison did.
customers_df['ChangedPlan'] = 'No'
customers_df.loc[~within_tolerance, 'ChangedPlan'] = 'Yes'

# 2. ChangedPlanPositive (billed total is below the expected total)
customers_df['ChangedPlanPositive'] = 'No'
customers_df.loc[charge_gap < -CHARGE_TOLERANCE, 'ChangedPlanPositive'] = 'Yes'

# 3. ChangedPlanNegative (billed total is above the expected total)
customers_df['ChangedPlanNegative'] = 'No'
customers_df.loc[charge_gap > CHARGE_TOLERANCE, 'ChangedPlanNegative'] = 'Yes'

# 4. InternetServicesDensity
# First, get all service columns
internet_services_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                             'TechSupport', 'StreamingTV', 'StreamingMovies']

# Share of the listed services whose value is neither 'No' nor
# 'No internet service' (count divided by the number of service columns).
# Vectorised with isin() instead of a row-wise apply.
no_service_values = ['No', 'No internet service']
has_service = ~customers_df[internet_services_columns].isin(no_service_values)
customers_df['InternetServicesDensity'] = has_service.sum(axis=1) / len(internet_services_columns)

# 5. ContractLengthMonths
# Contract labels outside this mapping become NaN.
customers_df['ContractLengthMonths'] = customers_df['Contract'].astype(str).map(
    {'Month-to-month': 1, 'One year': 12, 'Two year': 24}
)

# 6. ContractLifecycle
# Position inside the current contract period as a fraction in [0, 1):
# months elapsed since the last multiple of the contract length, divided by
# the contract length. Always 0 for month-to-month contracts (length 1).
customers_df['ContractLifecycle'] = (
    customers_df['Tenure'] % customers_df['ContractLengthMonths']
) / customers_df['ContractLengthMonths']

In [37]:
from textblob import TextBlob
import re
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

# Download required NLTK data (tokenizer, stop-word list, POS tagger)
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Text preprocessing function

# Holds the English stop-word set once it has been loaded, so it is built a
# single time instead of once per complaint.
_STOP_WORD_CACHE = {}

# Ordered (contraction, expansion) pairs. The irregular forms come first:
# the generic "n't" rule alone would turn "can't" into "ca not" and "won't"
# into "wo not", leaving the junk tokens "ca" / "wo" in the output.
_CONTRACTION_EXPANSIONS = (
    ("can't", "can not"),
    ("won't", "will not"),
    ("shan't", "shall not"),
    ("'m", " am"),
    ("n't", " not"),
    ("'re", " are"),
    ("'s", " is"),
    ("'ll", " will"),
    ("'ve", " have"),
    ("'d", " would"),
)


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Reduce a complaint to a space-joined string of lowercase non-stop-word tokens.

    Steps: lowercase, expand common English contractions, remove every
    character that is not an ASCII letter or whitespace, tokenize with
    ``word_tokenize`` and drop English stop words.

    Args:
        text: Raw complaint text. Any value that is not a ``str`` (for
            example NaN) yields an empty string.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Treat the typographic apostrophe like the ASCII one; otherwise "don’t"
    # skips contraction handling and collapses to "dont" in the next step.
    text = text.replace("\u2019", "'")

    # Handle contractions
    for contraction, expansion in _CONTRACTION_EXPANSIONS:
        text = text.replace(contraction, expansion)

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize and remove stopwords
    stop_words = _english_stop_words()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    return ' '.join(tokens)

# Function to get key terms by POS tag for churned customers

# Penn Treebank tags that make up each supported part-of-speech group.
_POS_TAG_GROUPS = {
    'NOUN': ('NN', 'NNS', 'NNP', 'NNPS'),
    'VERB': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
    'ADJ': ('JJ', 'JJR', 'JJS'),
}


def get_key_terms_by_pos(complaints_df, customers_df, pos_tag_type, top_n=10):
    """Return the most frequent words of one part of speech in churned customers' complaints.

    Args:
        complaints_df: Frame with 'CustomerId' and 'Complaint' columns.
        customers_df: Frame with 'CustomerId' and 'Churn' columns, where
            Churn is encoded 1 for churned and 0 for retained customers
            (the encoding applied in the cleaning step).
        pos_tag_type: One of 'NOUN', 'VERB' or 'ADJ'.
        top_n: Maximum number of terms to return.

    Returns:
        list[str]: Up to ``top_n`` lowercase terms, most frequent first.

    Raises:
        ValueError: If ``pos_tag_type`` is not a supported group. Previously
            an unknown type failed with an unbound ``pos_filter`` variable.
    """
    if pos_tag_type not in _POS_TAG_GROUPS:
        raise ValueError(
            f"pos_tag_type must be one of {sorted(_POS_TAG_GROUPS)}, got {pos_tag_type!r}"
        )
    # Resolved once here rather than on every loop iteration.
    pos_filter = set(_POS_TAG_GROUPS[pos_tag_type])

    # Merge complaints with customer data
    merged_df = pd.merge(complaints_df, customers_df[['CustomerId', 'Churn']], on='CustomerId', how='left')

    # Get complaints from churned customers. Churn == 1 means churned; the
    # previous `== 0` filter selected the customers who stayed. Complaints
    # with no matching customer have Churn NaN and are excluded.
    churned_complaints = merged_df.loc[merged_df['Churn'] == 1, 'Complaint'].dropna()

    # NOTE(review): these terms are derived from the Churn label itself; if
    # the resulting counts feed a churn model, computing them on the full
    # data set may leak the target -- confirm against the modelling setup.
    term_counts = Counter()
    for raw_text in churned_complaints:
        # POS tag the preprocessed text
        tagged = pos_tag(word_tokenize(preprocess_text(raw_text)))
        # Keep only words whose tag belongs to the requested group
        term_counts.update(word.lower() for word, tag in tagged if tag in pos_filter)

    # Get most common terms
    return [word for word, _ in term_counts.most_common(top_n)]

def count_terms(text, terms):
    """Count occurrences of the given terms among the whitespace-separated words of ``text``.

    Args:
        text: Space-separated string of tokens; a falsy value ('' or None)
            counts as zero occurrences.
        terms: Iterable of words to look for. Only exact whole-word matches
            count, and a term listed twice is counted twice.

    Returns:
        int: Total number of matches summed over all terms.
    """
    if not text:
        return 0

    tokens = text.split()
    total = 0
    for term in terms:
        total += tokens.count(term)
    return total


# Create the features

# Raw complaint text with missing values replaced by '' (shared by several features).
raw_complaints = complaints_df['Complaint'].fillna('')

# 0. ProcessedComplaint
complaints_df['ProcessedComplaint'] = raw_complaints.apply(preprocess_text)

# 1. ComplaintLength (number of characters in the raw complaint)
complaints_df['ComplaintLength'] = raw_complaints.str.len()

# Get key terms for churned customers
key_nouns = get_key_terms_by_pos(complaints_df, customers_df, 'NOUN')
key_verbs = get_key_terms_by_pos(complaints_df, customers_df, 'VERB')
key_adjectives = get_key_terms_by_pos(complaints_df, customers_df, 'ADJ')

# Process complaints
processed_complaints = complaints_df['ProcessedComplaint']


def _complaint_polarity(raw_text):
    """Return the TextBlob sentiment polarity of one raw complaint."""
    return TextBlob(str(raw_text)).sentiment.polarity


# 2. Sentiment (computed on the raw, unprocessed text)
complaints_df['Sentiment'] = raw_complaints.apply(_complaint_polarity)

# 3-5. KeyVerbsCount, KeyNounsCount, KeyAdjectivesCount:
# occurrences of each key-term list in the processed complaint.
key_term_features = (
    ('KeyVerbsCount', key_verbs),
    ('KeyNounsCount', key_nouns),
    ('KeyAdjectivesCount', key_adjectives),
)
for feature_name, term_list in key_term_features:
    complaints_df[feature_name] = processed_complaints.apply(count_terms, args=(term_list,))

[nltk_data] Downloading package punkt to /Users/marco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/marco/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Save CSV

In [9]:
# Write the processed tables into the same data folder the raw files were
# read from, instead of a machine-specific absolute path, so the notebook
# also runs on other checkouts.
# NOTE(review): presumably `data_dir` resolves to the folder the old absolute
# path pointed at -- confirm.
complaints_df.to_csv(os.path.join(data_dir, 'complaints_processed.csv'), index=False)
customers_df.to_csv(os.path.join(data_dir, 'customers_processed.csv'), index=False)