In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Read the combined customer-interaction export into the working frame `ivr`.
# NOTE(review): the location is an absolute, machine-specific path.
csv_path = "/Users/navyabingi/Downloads/combined_customer_interactions.csv"
ivr = pd.read_csv(csv_path)


In [None]:
# --- Hourly traffic and CUSUM chart -----------------------------------------
# Count interactions per hour, then track one-sided cumulative sums of the
# deviation of each hourly count from the overall hourly mean.

ivr['start_time'] = pd.to_datetime(ivr['start_time'])

# 'h' is the hourly alias; the upper-case 'H' spelling is deprecated since
# pandas 2.2.
traffic_data = (
    ivr.set_index('start_time')
    .resample('h')
    .size()
    .reset_index(name='traffic_count')
)

mean_traffic = traffic_data['traffic_count'].mean()

# One-sided (tabular) CUSUM with the mean as target and no slack term:
#     S+_t = max(0, S+_{t-1} + (x_t - mean))
#     S-_t = max(0, S-_{t-1} + (mean - x_t))
# Clipping the plain running total at zero is NOT equivalent: the statistic
# would never restart after an excursion. With C_t the running total of
# (x_t - mean), the recursions have the closed forms
#     S+_t = C_t - min(0, min_{s<=t} C_s)
#     S-_t = max(0, max_{s<=t} C_s) - C_t
deviation_total = (traffic_data['traffic_count'] - mean_traffic).cumsum()
traffic_data['cusum_pos'] = deviation_total - deviation_total.cummin().clip(upper=0)
traffic_data['cusum_neg'] = deviation_total.cummax().clip(lower=0) - deviation_total

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(traffic_data['start_time'], traffic_data['traffic_count'],
        label='Traffic Count', color='blue')
ax.plot(traffic_data['start_time'], traffic_data['cusum_pos'],
        label='CUSUM Positive', color='red')
ax.plot(traffic_data['start_time'], traffic_data['cusum_neg'],
        label='CUSUM Negative', color='green')
ax.axhline(y=mean_traffic, color='gray', linestyle='--', label='Mean Traffic')
ax.set_title('CUSUM Chart')
ax.set_xlabel('Time')
ax.set_ylabel('Traffic')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Distinct raw labels present in the `dtl_2_char` column.
unique_values = ivr["dtl_2_char"].unique()

# Keyword lists per high-level category. Entry order is significant:
# categorize_value walks this dict from the top and returns the first
# category with a matching keyword.
categories = {
    'Billing & Payments': [
        'BILLING', 'PAYMENT', 'BALANCE', 'COLLECTIONS', 'FICO',
    ],
    'Technical Support': [
        'TECH', 'SUPPORT', 'TROUBLESHOOTING', 'OUTAGE',
    ],
    'Sales & Retention': [
        'SALES', 'RETENTION', 'MARKETING',
    ],
    'Service Changes': [
        'CHANGE', 'NEW', 'MOVE',
    ],
    'Appointment & Installation': [
        'APPOINTMENT', 'INSTALL', 'SELF INSTALL',
    ],
    'Promotions & Campaigns': [
        'CAMPAIGN', 'PROMOTION', 'OFFER',
    ],
    'Security & Abuse': [
        'DMCA', 'COMPROMISE', 'ABUSE',
    ],
    'Specialty Services': [
        'COX MOBILE', 'HOMELIFE', 'GIGABIT', 'WEB HOSTING', 'COX BUSINESS',
    ],
    'Unknown/Other': [
        'UNKNOWN',
    ],
}


def categorize_value(value, keyword_map=None):
    """Return the first category whose keyword list matches ``value``.

    Matching is a case-sensitive substring test: a category matches when any
    of its keywords occurs anywhere inside ``value``. Categories are tried in
    the mapping's iteration order, so earlier entries win ties.

    Parameters
    ----------
    value : object
        Raw label to classify. Anything that is not a ``str`` (for example
        the NaN / None that marks a missing label) is classified as
        ``'Unknown/Other'`` instead of raising ``TypeError``.
    keyword_map : dict[str, list[str]], optional
        Mapping of category name to keywords. Defaults to the notebook-level
        ``categories`` dict.

    Returns
    -------
    str
        The matching category name, or ``'Unknown/Other'`` when nothing
        matches.
    """
    if keyword_map is None:
        keyword_map = categories

    # `keyword in value` only works on strings; missing labels come through
    # as float NaN or None and would otherwise abort the whole mapping step.
    if not isinstance(value, str):
        return 'Unknown/Other'

    for category, keywords in keyword_map.items():
        if any(keyword in value for keyword in keywords):
            return category
    return 'Unknown/Other'


# Classify each distinct raw label once, then broadcast the result to all rows.
category_mapping = {raw_label: categorize_value(raw_label) for raw_label in unique_values}
ivr['Category'] = ivr['dtl_2_char'].map(category_mapping)

# Parse both interaction timestamps.
ivr['start_time'] = pd.to_datetime(ivr['start_time'])
ivr['end_time'] = pd.to_datetime(ivr['end_time'])

# Interaction length in minutes, and the hour of day in which it started.
ivr['duration'] = ivr['end_time'].sub(ivr['start_time']).dt.total_seconds() / 60
ivr['hour'] = ivr['start_time'].dt.hour

# Upper Tukey fence on duration: Q3 + 1.5 * IQR.
Q1, Q3 = ivr['duration'].quantile([0.25, 0.75])
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR

# Keep rows at or below the fence, then discard uncategorised interactions.
ivr = ivr.loc[ivr['duration'].le(upper_bound)]
ivr = ivr.loc[ivr['Category'].ne('Unknown/Other')]

# Cell output: the remaining rows whose `dtl_3_char` equals "TR".
ivr.loc[ivr['dtl_3_char'].eq("TR")]


In [None]:
# One-hot encode the categorical columns as 0/1 integer indicator columns.
ivr = pd.get_dummies(
    ivr,
    columns=[
        'category', 'product', 'lob',
        'dtl_3_char', 'dtl_4_char', 'Category',
    ],
    dtype=int,
)

# Discard the columns listed below; the later cells pick their features from
# what remains.
ivr = ivr.drop(
    columns=[
        'account_nbr', 'customer_key', 'last_update_dt',
        'start_time', 'end_time', 'transaction_id',
        'source_id_num', 'source_id_char',
        'dtl_1_num', 'dtl_1_char', 'dtl_2_char', 'dtl_3_num',
        'dtl_5_num', 'dtl_5_char', 'dtl_6_num', 'dtl_6_char',
        'dt', 'interaction_type_key', 'dtl_2_num',
    ]
)

# Cell output: the resulting column index.
ivr.columns

In [None]:
# --- K-Means clusters visualised with t-SNE ---------------------------------

# The encoding cell above has already removed every column in this list except
# 'site_id', so a strict drop raises KeyError on a clean top-to-bottom run.
# errors='ignore' drops whichever of these are still present.
df_clean = ivr.drop(
    columns=[
        'site_id', 'account_nbr', 'customer_key', 'last_update_dt',
        'start_time', 'end_time', 'transaction_id',
        'source_id_num', 'source_id_char',
        'dtl_1_num', 'dtl_1_char', 'dtl_2_char', 'dtl_3_num',
        'dtl_5_num', 'dtl_5_char', 'dtl_6_num', 'dtl_6_char', 'dt',
    ],
    errors='ignore',
)

# Reproducible subsample; capped so the cell also works when fewer than
# 50,000 rows survive the earlier filtering.
df_clean = df_clean.sample(n=min(50000, len(df_clean)), random_state=42)

# Standardise every remaining column before distance-based modelling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clean)

kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust n_clusters as needed
labels = kmeans.fit_predict(X_scaled)

# Only two components are plotted, so embed into two dimensions rather than
# showing a 2-D slice of a 3-D embedding.
# NOTE(review): scikit-learn 1.5+ renames `n_iter` to `max_iter`; switch the
# keyword when running on a newer release.
tsne = TSNE(n_components=2, perplexity=30, random_state=42, n_iter=300)
X_tsne = tsne.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels, cmap='viridis', s=10)
fig.colorbar(points, ax=ax, label='Cluster Label')
ax.set_title('t-SNE Visualization of Clusters')
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
plt.show()


In [None]:
# --- One-Class SVM anomaly detection ----------------------------------------
from sklearn.svm import OneClassSVM

# Numeric and one-hot columns fed to the detector.
features = [
    'dtl_4_num', 'duration', 'hour', 'category_AS', 'category_COL', 
    'category_OTH', 'category_RET', 'category_SLS', 'category_TS', 
    'product_C', 'product_D', 'product_H', 'product_T', 'product_U', 
    'product_W', 'lob_C', 'lob_R', 'lob_U', 'dtl_3_char_EN', 'dtl_3_char_TR', 
    'dtl_4_char_HangUp', 'dtl_4_char_TRANSFER', 'Category_Appointment & Installation', 
    'Category_Billing & Payments', 'Category_Promotions & Campaigns', 
    'Category_Sales & Retention', 'Category_Security & Abuse', 
    'Category_Service Changes', 'Category_Specialty Services', 
    'Category_Technical Support'
]

# Reproducible subsample, capped at the number of rows actually available
# (DataFrame.sample raises ValueError when n exceeds the frame length).
# NOTE(review): kernel SVM fit time grows faster than linearly with the row
# count; confirm that 500,000 rows is feasible on the target machine.
new = ivr.sample(n=min(500000, len(ivr)), random_state=42)

X = new[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = OneClassSVM(kernel='rbf', nu=0.05, gamma='auto')
model.fit(X_scaled)

# predict() returns +1 for inliers and -1 for outliers.
predictions = model.predict(X_scaled)
new['outage_prediction'] = predictions

# Draw the two groups separately so each gets its own legend entry; a single
# scatter with one label would tag every point as an anomaly.
flagged = new['outage_prediction'] == -1
normal_rows = new[~flagged]
anomaly_rows = new[flagged]

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(normal_rows.index, normal_rows['duration'],
           c='tab:red', s=10, label='Normal (+1)')
ax.scatter(anomaly_rows.index, anomaly_rows['duration'],
           c='tab:blue', s=10, label='Anomaly (-1)')
ax.set_title('One-Class SVM Outage Detection')
ax.set_xlabel('Index')
ax.set_ylabel('Duration')
ax.legend()
plt.show()




In [None]:
# Tally how many sampled rows carry each value of `outage_prediction`.
prediction_counts = new['outage_prediction'].value_counts()
prediction_counts

In [None]:
# --- Logistic regression trained on the One-Class SVM flags -----------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# 1 = flagged as an outlier by the SVM, 0 = inlier. Reuse the predictions
# already computed for X_scaled instead of running the kernel SVM over the
# whole sample a second time.
labels = (predictions == -1).astype(int)

# Stratify so the minority (flagged) class keeps the same share in both
# partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, labels, test_size=0.3, random_state=42, stratify=labels
)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)[:, 1]  # probability of the flagged class
roc_auc = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc)

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Logistic Regression (AUC = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

In [None]:
# --- Isolation Forest outlier detection -------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Reproducible subsample of the encoded frame, restricted to the model features.
new = ivr.sample(n=500000, random_state=42)
X = new[
    [
        'dtl_4_num', 'duration', 'hour',
        'category_AS', 'category_COL', 'category_OTH',
        'category_RET', 'category_SLS', 'category_TS',
        'product_C', 'product_D', 'product_H',
        'product_T', 'product_U', 'product_W',
        'lob_C', 'lob_R', 'lob_U',
        'dtl_3_char_EN', 'dtl_3_char_TR',
        'dtl_4_char_HangUp', 'dtl_4_char_TRANSFER',
        'Category_Appointment & Installation',
        'Category_Billing & Payments',
        'Category_Promotions & Campaigns',
        'Category_Sales & Retention',
        'Category_Security & Abuse',
        'Category_Service Changes',
        'Category_Specialty Services',
        'Category_Technical Support',
    ]
]

# fit_predict() yields +1 for inliers and -1 for outliers.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outlier_labels = iso_forest.fit_predict(X)

inliers = X.loc[outlier_labels == 1]
outliers = X.loc[outlier_labels == -1]

# Side-by-side hour/duration scatter: outliers on the left, inliers on the right.
fig, (ax_outliers, ax_inliers) = plt.subplots(1, 2, figsize=(12, 6))

ax_outliers.scatter(outliers['hour'], outliers['duration'], c='orange', edgecolor='k', s=50)
ax_outliers.set_title(f"Outliers\n{len(outliers)} points")
ax_outliers.set_xlabel('Hour')
ax_outliers.set_ylabel('Duration')

ax_inliers.scatter(inliers['hour'], inliers['duration'], c='blue', edgecolor='k', s=50)
ax_inliers.set_title(f"Inliers\n{len(inliers)} points")
ax_inliers.set_xlabel('Hour')
ax_inliers.set_ylabel('Duration')

fig.suptitle("Outlier Method: Isolation Forest")
fig.tight_layout()
plt.show()


In [None]:
# --- Cluster the rows flagged by the Isolation Forest -----------------------

# Rows the forest labelled -1 (outliers).
anomalies = X[outlier_labels == -1]

# Standardise so no single feature dominates the K-Means distance.
scaler = StandardScaler()
anomalies_scaled = scaler.fit_transform(anomalies)

kmeans = KMeans(n_clusters=3, random_state=42)
anomaly_clusters = kmeans.fit_predict(anomalies_scaled)

fig, ax = plt.subplots(figsize=(10, 6))
points = ax.scatter(anomalies['hour'], anomalies['duration'],
                    c=anomaly_clusters, cmap='viridis')
# The x-axis shows `hour`; label it as such (it was labelled 'dtl_4_num').
ax.set_xlabel('hour')
ax.set_ylabel('duration')
ax.set_title('Clusters of Anomalies Detected by Isolation Forest')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()

In [None]:
# --- Logistic regression trained on the Isolation Forest flags --------------
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# 1 = flagged as an outlier by the forest, 0 = inlier. The forest was fitted
# on the unscaled frame X, so scoring the standardised matrix X_scaled (built
# for the One-Class SVM) would produce labels for the wrong feature scale.
# Reuse the fit_predict output for X instead.
labels = (outlier_labels == -1).astype(int)

# Stratify so the flagged minority keeps the same share in both partitions.
# NOTE(review): X is passed unscaled here, unlike the SVM-based experiment,
# which split X_scaled.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, random_state=42, stratify=labels
)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_prob = logreg.predict_proba(X_test)[:, 1]  # probability of the flagged class
roc_auc = roc_auc_score(y_test, y_prob)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc)

fpr, tpr, thresholds = roc_curve(y_test, y_prob)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='Logistic Regression (AUC = {:.2f})'.format(roc_auc))
ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()