In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Read the complaints CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
csv_path = r'C:\Users\masha\Downloads\balanced_complaints.csv'
df = pd.read_csv(csv_path)

In [3]:
# Copy of the data without the free-text narrative column.
df_no_text = df.drop('Consumer complaint narrative', axis='columns')

In [4]:
# Preview the first rows of the frame that excludes the narrative column.
print(df_no_text.head())

  Date received          Product                                  Issue  \
0     3/25/2016  Debt collection     False statements or representation   
1      5/8/2015  Debt collection  Cont'd attempts collect debt not owed   
2     1/21/2016  Debt collection                  Communication tactics   
3      5/1/2015  Debt collection  Cont'd attempts collect debt not owed   
4     9/26/2015      Credit card                   Credit determination   

                             Company public response Date sent to company  \
0  Company believes it acted appropriately as aut...            3/26/2016   
1  Company can't verify or dispute the facts in t...            5/11/2015   
2  Company can't verify or dispute the facts in t...            1/21/2016   
3  Company believes it acted appropriately as aut...             5/1/2015   
4   Company chooses not to provide a public response            9/29/2015   

      Company response to consumer Timely response?  
0          Closed with explanati

In [5]:
# Number of distinct values in the 'Product' column.
# `value_counts().nunique()` counts distinct *frequencies*, so two categories
# that occur equally often would be counted once; `nunique()` counts the
# category labels themselves. Re-run the cell to refresh the saved output.
product_counts = df['Product'].nunique()
print(product_counts)

19


In [6]:
# Number of distinct values in the 'Issue' column.
# `value_counts().nunique()` counts distinct *frequencies*, so two categories
# that occur equally often would be counted once; `nunique()` counts the
# category labels themselves. Re-run the cell to refresh the saved output.
issue_counts = df['Issue'].nunique()
print(issue_counts)

64


In [7]:
# Number of distinct values in the 'Company public response' column.
# `value_counts().nunique()` counts distinct *frequencies*, so two categories
# that occur equally often would be counted once; `nunique()` counts the
# category labels themselves. Re-run the cell to refresh the saved output.
public_response_counts = df['Company public response'].nunique()
print(public_response_counts)

11


In [8]:
# Number of distinct values in the 'Company response to consumer' column.
# `value_counts().nunique()` counts distinct *frequencies*, so two categories
# that occur equally often would be counted once; `nunique()` counts the
# category labels themselves. Re-run the cell to refresh the saved output.
consumer_response_counts = df['Company response to consumer'].nunique()
print(consumer_response_counts)

4


In [9]:
from pandas.tseries.holiday import USFederalHolidayCalendar


In [10]:
# US federal holidays covering the span of the complaint data.
cal = USFederalHolidayCalendar()

# The date columns are read from CSV as 'M/D/YYYY' strings, so calling
# .min()/.max() on them directly compares text (e.g. '9/26/2015' sorts after
# '3/25/2016'). Parse to datetimes first so the window is chronological.
earliest_received = pd.to_datetime(df['Date received']).min()
latest_sent = pd.to_datetime(df['Date sent to company']).max()

holidays = cal.holidays(start=earliest_received, end=latest_sent)

In [11]:
def calculate_business_days(row):
    """Count the working days spanned by one complaint row.

    Builds the weekday range from ``row['Date received']`` to
    ``row['Date sent to company']`` (both endpoints included by
    ``pd.bdate_range``) and removes every date present in the module-level
    ``holidays`` index.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys 'Date received' and 'Date sent to company'.

    Returns
    -------
    int
        Number of remaining dates after holidays are excluded.
    """
    start, end = row['Date received'], row['Date sent to company']
    weekdays = pd.bdate_range(start, end)
    # `holidays` is defined in an earlier cell.
    return len(weekdays.difference(holidays))

In [12]:
# Row-wise: number of business days from receipt to forwarding to the company.
# Note that this adds the column to `df` in place.
df['Business Days'] = df.apply(calculate_business_days, axis='columns')


In [13]:
# Preview the first rows of `df` after the 'Business Days' column was added.
print(df.head())

  Date received          Product                                  Issue  \
0     3/25/2016  Debt collection     False statements or representation   
1      5/8/2015  Debt collection  Cont'd attempts collect debt not owed   
2     1/21/2016  Debt collection                  Communication tactics   
3      5/1/2015  Debt collection  Cont'd attempts collect debt not owed   
4     9/26/2015      Credit card                   Credit determination   

                        Consumer complaint narrative  \
0  Debt amounts are not accurate when attempting ...   
1  Focus Receivables Management, XXXX, GA XXXX Ph...   
2  My mother, my sister, and I have received mult...   
3  I started to receive correspondence from Natio...   
4                      XXXX ruined my credit rating.   

                             Company public response Date sent to company  \
0  Company believes it acted appropriately as aut...            3/26/2016   
1  Company can't verify or dispute the facts in t...      

In [15]:
# Remove outliers. TODO: still need to decide how outliers are handled.
# Current approach: fences at 1.5 * IQR beyond the quartiles of 'Business Days'.
Q1 = df['Business Days'].quantile(0.25)
Q3 = df['Business Days'].quantile(0.75)
# The interquartile range is Q3 - Q1 (non-negative). Computing Q1 - Q3 flips
# the sign and moves both fences inward, so the lower fence ends up at or
# above the upper one.
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [16]:
# Keep only rows whose 'Business Days' value lies within the fences (inclusive).
days = df['Business Days']
within_bounds = (days >= lower_bound) & (days <= upper_bound)
df_cleaned = df[within_bounds]


In [None]:
# Categorical columns to one-hot encode:
#   - Product
#   - Issue
#   - Company public response
#   - Company response to consumer
# (Kept as comments: bare column names are not valid Python and would stop
# a "Run All" with a SyntaxError.)


In [17]:
# Feature columns plus the 'Timely response?' target for the classifier.
# `.copy()` makes this an independent frame, so assigning to its columns later
# does not raise SettingWithCopyWarning.
df_timely_response = df_cleaned[
    ['Product', 'Issue', 'Company public response', 'Company response to consumer', 'Timely response?']
].copy()

In [19]:
# Print the modelling frame (pandas truncates the middle rows in the output).
print(df_timely_response)

                                                Product  \
0                                       Debt collection   
2                                       Debt collection   
3                                       Debt collection   
5             Payday loan, title loan, or personal loan   
8                           Credit card or prepaid card   
...                                                 ...   
7317                                    Debt collection   
7318                                    Debt collection   
7321                                      Consumer Loan   
7322                                    Debt collection   
7323  Credit reporting, credit repair services, or o...   

                                           Issue  \
0             False statements or representation   
2                          Communication tactics   
3          Cont'd attempts collect debt not owed   
5     Charged fees or interest you didn't expect   
8                              

In [22]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# Work on an explicit copy so the assignment does not raise
# SettingWithCopyWarning, and map from the untouched column in `df_cleaned`
# so re-running this cell does not turn the already-encoded values into NaN.
df_timely_response = df_timely_response.copy()
df_timely_response['Timely response?'] = df_cleaned['Timely response?'].map({'Yes': 1, 'No': 0})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_timely_response['Timely response?'] = df_timely_response['Timely response?'].map({'Yes': 1, 'No': 0})


In [23]:
# Categorical feature columns to expand into one-hot indicator columns.
columns_to_encode = [
    'Product',
    'Issue',
    'Company public response',
    'Company response to consumer',
]

In [24]:
# One-hot encode the categorical columns; 'Timely response?' passes through unchanged.
df_timely_response_encoded = pd.get_dummies(
    data=df_timely_response,
    columns=columns_to_encode,
)

In [26]:
# Preview the first rows of the one-hot encoded frame.
print(df_timely_response_encoded.head())

   Timely response?  Product_Bank account or service  \
0                 0                            False   
2                 1                            False   
3                 1                            False   
5                 1                            False   
8                 1                            False   

   Product_Checking or savings account  Product_Consumer Loan  \
0                                False                  False   
2                                False                  False   
3                                False                  False   
5                                False                  False   
8                                False                  False   

   Product_Credit card  Product_Credit card or prepaid card  \
0                False                                False   
2                False                                False   
3                False                                False   
5                Fal

In [30]:
# Separate the target (y) from the one-hot encoded features (X).
target_column = 'Timely response?'
y = df_timely_response_encoded[target_column]
X = df_timely_response_encoded.drop(columns=[target_column])

# First split: 60% train, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second split: divide the held-out 40% evenly into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


In [33]:
# Base estimator: gradient boosting classifier with a fixed seed.
gb = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid for the search (3 values for each of 4 parameters).
param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_samples_split=[2, 5, 10],
)

In [34]:
# Exhaustive search over `param_grid`: 3-fold CV scored by accuracy,
# with per-fit logging, using all available cores.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [35]:
# Run the grid search on the training split only; the validation and test
# splits are not passed to the search.
print("Tuning hyperparameters with GridSearchCV...")
grid_search.fit(X_train, y_train)

Tuning hyperparameters with GridSearchCV...
Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [37]:
# Show the parameter combination selected by the grid search.
print("Best hyperparameters:", grid_search.best_params_)

Best hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 50}


In [39]:
# Evaluate on validation set.
# `best_model` and `y_val_pred` were not defined anywhere in the notebook (the
# cell that created them is gone), so a fresh "Restart & Run All" raised
# NameError here. Take the refitted best estimator from the grid search and
# predict on the validation split.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.7510373443983402

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.78      0.76       473
           1       0.78      0.72      0.75       491

    accuracy                           0.75       964
   macro avg       0.75      0.75      0.75       964
weighted avg       0.75      0.75      0.75       964



In [40]:
# Score the selected model on the held-out test split.
y_test_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Test Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Test Accuracy: 0.7461139896373057

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.79      0.75       467
           1       0.78      0.71      0.74       498

    accuracy                           0.75       965
   macro avg       0.75      0.75      0.75       965
weighted avg       0.75      0.75      0.75       965



In [None]:
# TODO: repeat the modelling steps above with response time as the prediction target.