In [3]:
import pandas as pd

# Read the five input tables from CSV files in the working directory.
customers_df, reason_df, sentiment_df, test_df, call_logs_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'customers2afd6ea.csv',
        'reason18315ff.csv',
        'sentiment_statisticscc1e57a.csv',
        'testbc7185d.csv',
        'callsf0d4f5a.csv',
    )
)

# Build one wide table keyed on the call log: every left join keeps all rows of
# `call_logs_df` and attaches the matching reason, sentiment and customer columns.
merged_df = (
    call_logs_df
    .merge(reason_df, on='call_id', how='left')
    .merge(sentiment_df, on=['call_id', 'agent_id'], how='left')
    .merge(customers_df, on='customer_id', how='left')
)

In [5]:
# Parse the three call-timestamp columns into pandas datetimes
for timestamp_column in ('call_start_datetime',
                         'agent_assigned_datetime',
                         'call_end_datetime'):
    merged_df[timestamp_column] = pd.to_datetime(merged_df[timestamp_column])

# AHT: seconds from agent assignment to call end
handle_duration = merged_df['call_end_datetime'].sub(merged_df['agent_assigned_datetime'])
merged_df['AHT'] = handle_duration.dt.total_seconds()

# AST: seconds from call start to agent assignment
wait_duration = merged_df['agent_assigned_datetime'].sub(merged_df['call_start_datetime'])
merged_df['AST'] = wait_duration.dt.total_seconds()

# Only rows that carry a target label can be used for training
training_data = merged_df.dropna(subset=['primary_call_reason'])

In [11]:
# One-hot encode the two categorical tone columns. `drop_first=True` removes the
# first level of each column, which then acts as the implicit baseline category.
training_data_encoded = pd.get_dummies(
    training_data,
    columns=['agent_tone', 'customer_tone'],
    drop_first=True,
)

# Pick up exactly the dummy columns produced above. Matching on the full
# `<source column>_` prefix avoids accidentally selecting an unrelated column
# that merely contains the substring 'tone_'.
tone_dummy_columns = [
    col for col in training_data_encoded.columns
    if col.startswith(('agent_tone_', 'customer_tone_'))
]

# Select features and target. `.copy()` makes X an independent frame, so the
# imputation below is a plain column assignment rather than a write into a
# slice of `training_data_encoded` (the SettingWithCopyWarning pattern).
X = training_data_encoded[
    ['AHT', 'AST', 'average_sentiment', 'silence_percent_average'] + tone_dummy_columns
].copy()
y = training_data_encoded['primary_call_reason']

# Fill missing `average_sentiment` values with the column median
X['average_sentiment'] = X['average_sentiment'].fillna(X['average_sentiment'].median())

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hold out 20% of the labelled rows for validation; the seed fixes the split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random forest and fit it on the training split
# (`fit` returns the fitted estimator, so construction and training can be chained)
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)

# Measure accuracy on the held-out split
y_val_pred = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.319255869777211


In [15]:
# Prepare test data by attaching call-log, sentiment and customer columns to the
# calls listed in `test_df` (same joins as used for the training table).
test_merged = (
    test_df
    .merge(call_logs_df, on='call_id', how='left')
    .merge(sentiment_df, on=['call_id', 'agent_id'], how='left')
    .merge(customers_df, on='customer_id', how='left')
)

# A left join duplicates rows when the right-hand key is not unique; the
# predictions are written back to `test_df` positionally, so fail early with a
# clear message instead of a length-mismatch error at the final assignment.
if len(test_merged) != len(test_df):
    raise ValueError(
        f"Merging changed the number of test rows from {len(test_df)} to "
        f"{len(test_merged)}; check the join keys for duplicates."
    )

# Feature engineering on test data: parse the timestamp columns
for timestamp_column in ('call_start_datetime',
                         'agent_assigned_datetime',
                         'call_end_datetime'):
    test_merged[timestamp_column] = pd.to_datetime(test_merged[timestamp_column])

# Calculate AHT and AST (in seconds) for test data
test_merged['AHT'] = (test_merged['call_end_datetime'] - test_merged['agent_assigned_datetime']).dt.total_seconds()
test_merged['AST'] = (test_merged['agent_assigned_datetime'] - test_merged['call_start_datetime']).dt.total_seconds()

# Encode categorical features WITHOUT `drop_first`: the level to drop must be the
# one dropped during training, not whichever level happens to sort first in the
# test set. Keeping every level here and letting the reindex below select the
# training columns guarantees the same encoding as the model was fitted on.
test_encoded = pd.get_dummies(test_merged, columns=['agent_tone', 'customer_tone'])

# Align columns (set and order) with the training features; dummy columns for
# levels absent from the test set are created and filled with 0.
test_encoded = test_encoded.reindex(columns=X.columns, fill_value=0)

# Impute missing `average_sentiment` as in training. `X` was filled with its own
# median, which leaves the median unchanged, so this is the training median.
test_encoded['average_sentiment'] = test_encoded['average_sentiment'].fillna(
    X['average_sentiment'].median()
)

# Predict on test data
test_predictions = rf_model.predict(test_encoded)

# Prepare submission: one predicted reason per row of `test_df`
test_df['primary_call_reason'] = test_predictions
test_df.to_csv('test_yourname.csv', index=False)