In [1]:
import pandas as pd

In [87]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Initialize Faker
faker = Faker()

# Generate dummy data
num_records = 5000

# Profitability Dataset
profitability_data = pd.DataFrame({
    "temp_key": [faker.uuid4() for _ in range(num_records)],
    "total_expense_amt": np.random.uniform(1000, 50000, num_records),
    "total_credit_limit": np.random.uniform(500, 50000, num_records),
    "risk_fixed_bal_amt": np.random.uniform(0, 10000, num_records),
    "revenue_amortized_amt": np.random.uniform(0, 5000, num_records)
})

# Complaints Dataset
complaints_data = pd.DataFrame({
    "temp_key": [faker.uuid4() for _ in range(num_records)],
    "complaint_create_date": [faker.date_between('-1y', 'today') for _ in range(num_records)],
    "complaint_status": random.choices(["Open", "Closed", "Pending"], k=num_records),
    "agent_functional_group": random.choices(["Banking", "Loans", "Credit Cards"], k=num_records),
    "regulatory_classification_1": random.choices(["TCPA", "GDPR", "Other"], k=num_records),
    "Complaints": random.choices(["I am frustated", "I received multiple spam calls about services I did not request.", "robocalls"], k=num_records)
})

# Transactions Dataset
transactions_data = pd.DataFrame({
    "temp_key": [faker.uuid4() for _ in range(num_records)],
    "transaction_amount": np.random.uniform(1, 5000, num_records),
    "auth_amt": np.random.uniform(1, 5000, num_records),
    "transaction_type": random.choices(["Purchase", "Cash Advance", "Payment"], k=num_records),
    "foreign_currency_code": random.choices(["USD", "EUR", "INR"], k=num_records)
})

# Combine into one dataset
combined_data = pd.merge(profitability_data, complaints_data, on="temp_key", how="outer")
combined_data = pd.merge(combined_data, transactions_data, on="temp_key", how="outer")

# Save dataset
combined_data.to_csv("dummy_dataset.csv", index=False)
combined_data.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,


In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from mlxtend.frequent_patterns import fpgrowth
from gensim.models import Word2Vec
from scipy.stats import entropy
from datetime import datetime
from faker import Faker

In [94]:
# Load dataset
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

df = pd.read_csv("dummy_dataset.csv")

# Check dataset structure
df.head()


Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,


In [95]:
from gensim.models import Word2Vec

def create_word2vec_embeddings(df, categorical_columns):
    # Prepare data for Word2Vec
    sequences = df[categorical_columns].fillna("").values.tolist()
    model = Word2Vec(sequences, vector_size=10, window=3, min_count=1, sg=1, workers=4)
    # Generate embeddings for each row
    embeddings = np.array([np.mean([model.wv[word] for word in row if word in model.wv] or [np.zeros(10)], axis=0) for row in sequences])
    embedding_df = pd.DataFrame(embeddings, columns=[f"w2v_dim_{i}" for i in range(10)])
    return embedding_df

# Example usage
categorical_columns = ["agent_functional_group",'Complaints',  "regulatory_classification_1"]
w2v_features = create_word2vec_embeddings(df, categorical_columns)
df = pd.concat([df, w2v_features], axis=1)


In [96]:
w2v_features.head()

Unnamed: 0,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9
0,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134
1,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134
2,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134
3,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134
4,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134


In [91]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,-0.041525,0.456721,0.507937,0.4708,-0.348707,-0.618431,0.61509,0.714084,-0.765792,-0.186422
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,-0.041525,0.456721,0.507937,0.4708,-0.348707,-0.618431,0.61509,0.714084,-0.765792,-0.186422
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,-0.041525,0.456721,0.507937,0.4708,-0.348707,-0.618431,0.61509,0.714084,-0.765792,-0.186422
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,-0.041525,0.456721,0.507937,0.4708,-0.348707,-0.618431,0.61509,0.714084,-0.765792,-0.186422
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,-0.041525,0.456721,0.507937,0.4708,-0.348707,-0.618431,0.61509,0.714084,-0.765792,-0.186422


In [100]:
from textblob import TextBlob

def add_sentiment_features(df, text_columns):
    for col in text_columns:
        df[f"{col}_sentiment_polarity"] = df[col].fillna("").apply(lambda x: TextBlob(str(x)).sentiment.polarity)
        df[f"{col}_sentiment_subjectivity"] = df[col].fillna("").apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    return df

# Example usage
text_columns = ["Complaints", "agent_functional_group"]
df = add_sentiment_features(df, text_columns)


In [102]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0


In [103]:
def add_temporal_features(df, date_columns):
    for col in date_columns:
        df[f"{col}_year"] = pd.to_datetime(df[col]).dt.year
        df[f"{col}_month"] = pd.to_datetime(df[col]).dt.month
        df[f"{col}_day_diff"] = (datetime.now() - pd.to_datetime(df[col])).dt.days
    return df

# Example usage
date_columns = ["complaint_create_date"  ]
df = add_temporal_features(df, date_columns)


In [104]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,


In [None]:
from sklearn.cluster import DBSCAN
complaint_times = data['complaint_create_date'].values.reshape(-1, 1)
clustering = DBSCAN(eps=0.1, min_samples=3).fit(complaint_times)
data['complaint_time_cluster'] = clustering.labels_


In [105]:
def add_dbscan_features(df, numeric_columns):
    clustering = DBSCAN(eps=0.5, min_samples=5).fit(df[numeric_columns].fillna(0))
    df["dbscan_cluster"] = clustering.labels_
    return df

# Example usage
numeric_columns = ["transaction_amount", "auth_amt"]
df = add_dbscan_features(df, numeric_columns)


In [106]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0


In [107]:
def add_aggregated_features(df, group_by_col, agg_columns):
    for col in agg_columns:
        df[f"{col}_mean"] = df.groupby(group_by_col)[col].transform("mean")
        df[f"{col}_std"] = df.groupby(group_by_col)[col].transform("std")
    return df

# Example usage
agg_columns = ["transaction_amount", "auth_amt"]
df = add_aggregated_features(df, "temp_key", agg_columns)


In [108]:
from sklearn.preprocessing import PolynomialFeatures

def add_polynomial_features(df, columns):
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    poly_features = pd.DataFrame(poly.fit_transform(df[columns].fillna(0)), columns=poly.get_feature_names_out(columns))
    return pd.concat([df, poly_features], axis=1)

# Example usage
columns = ["transaction_amount", "auth_amt"]
df = add_polynomial_features(df, columns)


In [109]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0


In [68]:
import networkx as nx

def create_interaction_graph(df, node1_col, node2_col, weight_col=None):
    """
    Creates a directed graph based on interactions between two columns.
    """
    G = nx.DiGraph()
    for _, row in df.iterrows():
        node1 = row[node1_col]
        node2 = row[node2_col]
        weight = row[weight_col] if weight_col else 1
        if G.has_edge(node1, node2):
            G[node1][node2]["weight"] += weight
        else:
            G.add_edge(node1, node2, weight=weight)
    return G

# Example usage: Customer-Agent interaction graph
#interaction_graph = create_interaction_graph(df, "temp_key", "agent_id", "transaction_amount")
interaction_graph = create_interaction_graph(df, "temp_key", "total_credit_limit", "transaction_amount")


In [69]:
from igraph import Graph
import pandas as pd

In [70]:
def create_igraph(df, node1_col, node2_col, weight_col=None):
    """
    Creates an igraph Graph from a pandas DataFrame.
    """
    edges = list(zip(df[node1_col], df[node2_col]))
    weights = df[weight_col].values if weight_col else None
    G = Graph.TupleList(edges, directed=True, edge_attrs={"weight": weights})
    return G

# Example: Create a graph from the dataset
G = create_igraph(df, "temp_key", "total_credit_limit", "transaction_amount")

In [71]:
def add_degree_centrality_igraph(G, df, node_col):
    """
    Adds degree centrality to the DataFrame by mapping it based on the node column.
    """
    # Get degree centrality for all nodes in the graph
    degrees = G.degree()
    nodes = G.vs["name"]  # Node names from the graph
    degree_dict = dict(zip(nodes, degrees))  # Map node names to degree values

    # Map the degree centrality to the DataFrame based on node_col
    df[f"{node_col}_degree_centrality"] = df[node_col].map(degree_dict).fillna(0)
    return df

# Example usage
G.vs["name"] = df["temp_key"].unique()  # Ensure graph nodes have names matching the DataFrame
df = add_degree_centrality_igraph(G, df, "temp_key")


In [72]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt,temp_key_degree_centrality
0,c875fceb-807b-4c0e-8dc5-08ceb8f0e885,23351.717991,28017.981029,3645.730465,347.402524,,,,,,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
1,e30d053b-da74-431c-a676-343f958bcbb9,10790.179318,27487.766369,7282.768042,2036.115113,,,,,,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
2,0f93a1d7-c367-4a97-8e87-9e01aeea4447,10801.106407,4981.077272,485.491137,3804.789862,,,,,,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
3,57c7d87b-97e7-44a4-b07c-a1eb8305a9bd,41240.229636,3554.338322,9679.110123,2743.042816,,,,,,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
4,4e31b419-e07f-4fe1-bc65-34f9fd99009d,20868.426289,33174.654458,3993.221778,4977.855227,,,,,,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1


In [45]:
print(df.index.duplicated().sum())


0


In [77]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt,temp_key_degree_centrality
0,c875fceb-807b-4c0e-8dc5-08ceb8f0e885,23351.717991,28017.981029,3645.730465,347.402524,,,,,0.0,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
1,e30d053b-da74-431c-a676-343f958bcbb9,10790.179318,27487.766369,7282.768042,2036.115113,,,,,0.0,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
2,0f93a1d7-c367-4a97-8e87-9e01aeea4447,10801.106407,4981.077272,485.491137,3804.789862,,,,,0.0,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
3,57c7d87b-97e7-44a4-b07c-a1eb8305a9bd,41240.229636,3554.338322,9679.110123,2743.042816,,,,,0.0,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1
4,4e31b419-e07f-4fe1-bc65-34f9fd99009d,20868.426289,33174.654458,3993.221778,4977.855227,,,,,0.0,,,,0.142361,-0.155524,0.201783,0.415911,-0.310276,-0.065099,0.08582,0.106143,0.11398,-0.090547,,,,0,,,,,0.0,0.0,0.0,1


In [110]:
# Step 2: Convert numerical features to categorical (example: binning)
# Example: Binning 'total_expense_amt' into categories like 'Low', 'Medium', 'High'
df['expense_bin'] = pd.cut(df['total_expense_amt'], bins=[0, 150, 300, 500], labels=["Low", "Medium", "High"])
df['credit_limit_bin'] = pd.cut(df['total_credit_limit'], bins=[0, 6000, 8000, 10000], labels=["Low", "Medium", "High"])

# Step 3: One-hot encode categorical columns for FP-Growth
df_encoded = pd.get_dummies(df[['expense_bin', 'credit_limit_bin']], drop_first=True)

# Step 4: Apply FP-Growth to find frequent itemsets
frequent_itemsets = fpgrowth(df_encoded, min_support=0.5, use_colnames=True)

# Step 5: Create new features based on frequent itemsets
for idx, row in frequent_itemsets.iterrows():
    itemset = row['itemsets']
    feature_name = "_".join(sorted(itemset))
    df[feature_name] = df_encoded[list(itemset)].sum(axis=1)



In [111]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt,expense_bin,credit_limit_bin
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,High
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,Low
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,,,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,


In [112]:
all_features = features + list(frequent_itemsets['itemsets'].apply(lambda x: "_".join(sorted(x))))
all_features

['total_expense_amt',
 'total_credit_limit',
 'risk_fixed_bal_amt',
 'revenue_amortized_amt',
 'transaction_amount',
 'auth_amt']

In [113]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import numpy as np

features = [
    "total_expense_amt",
    "total_credit_limit",
    "risk_fixed_bal_amt",
    "revenue_amortized_amt",
   
    "transaction_amount",
    "auth_amt"
]


# Step 1: Replace NaN values with 0 (in features subset)
df[features] = df[features].fillna(0)

# Step 2: Ensure there are no NaN values left
if df[features].isna().any().any():
    print("There are still NaN values in the dataset after replacement!")
else:
    print("NaN values successfully replaced.")

# Step 2: Apply Isolation Forest
def apply_isolation_forest(df, features, contamination=0.1, random_state=42):
    """
    Applies Isolation Forest to detect anomalies.
    :param df: DataFrame containing the dataset.
    :param features: List of features to use for anomaly detection.
    :param contamination: Proportion of anomalies in the dataset.
    :param random_state: Random state for reproducibility.
    :return: DataFrame with anomaly scores and flags.
    """
    # Initialize the Isolation Forest model
    model = IsolationForest(contamination=contamination, random_state=random_state)

    # Fit the model to the selected features
    model.fit(df[features])

    # Predict anomalies (-1 means anomaly, 1 means normal)
    df["anomaly_flag"] = model.predict(df[features])

    # Add anomaly scores to the DataFrame
    df["anomaly_score"] = model.decision_function(df[features])

    return df

# Step 3: Call the function
df = apply_isolation_forest(df, features)

# Display results
df.head()


NaN values successfully replaced.




Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt,expense_bin,credit_limit_bin,anomaly_flag,anomaly_score
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,High,-1,-0.018972
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,Low,1,0.076468
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.051002
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.066024
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.028098


In [None]:
data['complaint_create_date'] = pd.to_datetime(data['complaint_create_date'])
data['complaint_hour'] = data['complaint_create_date'].dt.hour
data['days_since_last_complaint'] = data.groupby('tempkey')['complaint_create_date'].diff().dt.days
data['complaint_cluster_week'] = data['complaint_create_date'].dt.isocalendar().week


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
complaint_embeddings = model.wv[data['complaint_description'].apply(lambda x: np.mean([model.wv[word] for word in x.split() if word in model.wv], axis=0))]
regulatory_embeddings = model.wv[data['regulatory_classification_1'].apply(lambda x: np.mean([model.wv[word] for word in x.split() if word in model.wv], axis=0))]
data['complaint_regulatory_similarity'] = cosine_similarity(complaint_embeddings, regulatory_embeddings)


In [None]:
from textblob import TextBlob
data['sentiment_score'] = data['complaint_description'].apply(lambda x: TextBlob(x).sentiment.polarity)
data['sentiment_variability'] = data.groupby('tempkey')['sentiment_score'].transform(lambda x: x.std())


In [117]:
df.head()

Unnamed: 0,temp_key,total_expense_amt,total_credit_limit,risk_fixed_bal_amt,revenue_amortized_amt,complaint_create_date,complaint_status,agent_functional_group,regulatory_classification_1,Complaints,transaction_amount,auth_amt,transaction_type,foreign_currency_code,w2v_dim_0,w2v_dim_1,w2v_dim_2,w2v_dim_3,w2v_dim_4,w2v_dim_5,w2v_dim_6,w2v_dim_7,w2v_dim_8,w2v_dim_9,Complaints_sentiment_polarity,Complaints_sentiment_subjectivity,agent_functional_group_sentiment_polarity,agent_functional_group_sentiment_subjectivity,complaint_create_date_year,complaint_create_date_month,complaint_create_date_day_diff,dbscan_cluster,transaction_amount_mean,transaction_amount_std,auth_amt_mean,auth_amt_std,transaction_amount.1,auth_amt.1,transaction_amount auth_amt,expense_bin,credit_limit_bin,anomaly_flag,anomaly_score
0,20f34cc7-8ef4-4d45-b018-049fd19a2f52,27240.025484,9533.059467,9996.949906,125.225157,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,High,-1,-0.018972
1,246adbe0-31a5-441a-9aad-7c20703e26df,15095.450235,5533.146488,263.910542,2089.62817,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,Low,1,0.076468
2,e17104ad-4b52-45ba-90fb-b578e4febd5c,32375.874035,37441.202863,1685.850721,2226.32022,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.051002
3,3c077c54-0800-4c02-83e8-815e02baac4e,24986.502008,27757.654583,2138.210208,414.190724,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.066024
4,85d21bb1-8c52-4b53-95a1-bef8f6e8b989,14471.942746,35826.253352,1200.230508,3988.820115,,,,,,0.0,0.0,,,0.175093,-0.166008,0.224333,0.463473,-0.3175,-0.076551,0.091725,0.086776,0.112885,-0.136134,0.0,0.0,0.0,0.0,,,,0,,,,,0.0,0.0,0.0,,,1,0.028098


In [120]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from mlxtend.preprocessing import TransactionEncoder
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Example DataFrame (Dummy Dataset)
data = {
    "customer_complaint": [
        "I keep getting spam calls even after opting out.",
        "Your team calls me too frequently.",
        "No issues here.",
        "I received multiple robocalls despite opting out.",
        "Too many frequent calls about services I never requested."
    ]
}

df = pd.DataFrame(data)

# Step 1: Preprocess complaints (Tokenization, Stopword Removal, Lowercasing)
stop_words = set(stopwords.words('english'))  # Load English stopwords
def preprocess_text(text):
    # Tokenize and remove stopwords, punctuation
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

df['processed_complaint'] = df['customer_complaint'].apply(preprocess_text)

# Step 2: Convert processed complaints into transactions
transactions = df['processed_complaint'].tolist()

# Step 3: Convert transactions into one-hot encoded DataFrame
te = TransactionEncoder()
te_data = te.fit(transactions).transform(transactions)
transaction_df = pd.DataFrame(te_data, columns=te.columns_)

# Step 4: Apply FP-Growth
frequent_itemsets = fpgrowth(transaction_df, min_support=0.5, use_colnames=True)

# Step 5: Add features to the original DataFrame
for idx, row in frequent_itemsets.iterrows():
    itemset = list(row['itemsets'])
    feature_name = "_".join(itemset)  # Create feature name by combining items
    # Generate the feature: Presence of itemset in each row
    df[feature_name] = transaction_df[itemset].all(axis=1).astype(int)

# Display the DataFrame with new features
print("Original Data with FP-Growth-based Features:\n")
print(df)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Data with FP-Growth-based Features:

                                  customer_complaint                                processed_complaint  calls
0   I keep getting spam calls even after opting out.         [keep, getting, spam, calls, even, opting]      1
1                 Your team calls me too frequently.                          [team, calls, frequently]      1
2                                    No issues here.                                           [issues]      0
3  I received multiple robocalls despite opting out.   [received, multiple, robocalls, despite, opting]      0
4  Too many frequent calls about services I never...  [many, frequent, calls, services, never, reque...      1


In [119]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [121]:
import pandas as pd
import re
from mlxtend.frequent_patterns import fpgrowth, apriori
from mlxtend.preprocessing import TransactionEncoder

# Sample complaint data (replace with your dataset)
data = {'complaint_description': ["I received an unsolicited call about a new credit card offer.",
                                  "The automated call was about marketing, and I did not opt in.",
                                  "I got repeated robocalls from a telemarketer.",
                                  "They called me despite being on the do-not-call list."]}
df = pd.DataFrame(data)

# List of 200 TCPA-related keywords (you can use the previous list)
tcp_keywords = [
    "robocall", "telemarketing", "automated call", "unsolicited call", "do-not-call", "spam", 
    "call blocking", "customer consent", "marketing call", "opt-out", "telemarketing list", "call center", 
    "caller ID", "illegal calls", "call volume", "call recording", "repeated calls", "call tracking", 
    "telephone solicitation", "unwanted message", "ringless voicemail", "artificial voice", "pre-recorded message", 
    "contact center", "privacy violation", "unwanted solicitation", "repeated messages", "text messages", "text spam", 
    "telemarketing campaign", "opt-in", "robocall violation", "tcpa fine", "telemarketing regulations", "consumer rights", 
    "marketing database", "cell phone spam", "call harassment", "do-not-call list violation", "tcpa compliance", 
    "call harassment", "call recording laws", "call abandonment", "message spam", "legal call", "outbound call", 
    "automated system", "no-call list", "consumer protection", "pre-recorded voicemail", "ringless voicemail campaign", 
    "spam calls", "voice message spam", "violent call behavior", "do-not-call registry", "prerecorded marketing", 
    "inbound marketing", "call interruption", "commercial call", "cold call", "legal obligations", "communication consent", 
    "call frequency", "consumer complaint", "call tracking tool", "caller harassment", "junk text", "unsolicited fax", 
    "caller script", "telemarketing harassment", "predictive dialing", "call suppression", "marketing compliance", 
    "unsolicited marketing", "call monitoring", "contact violation", "calling system", "number portability", 
    "spam filtering", "do-not-call registry list", "privacy breach", "phone bill scams", "voice message interception", 
    "consumer notification", "repetitive dialing", "contact management", "spam report", "marketing automation", 
    "data protection", "telemarketing sales", "scam calls", "consumer fraud", "do-not-disturb violation", "marketing policy", 
    "call security", "privacy law", "tcpa complaint", "regulatory compliance", "marketing scam", "call automation", 
    "harassing calls", "privacy rights violation", "telemarketing text", "recorded message", "outbound telemarketer", 
    "automated outbound call", "call center monitoring", "call scheduling", "sales calls", "voice spam", 
    "do-not-call protection", "telemarketing fraud", "tcpa penalty", "call harassment regulation", "marketing laws", 
    "phone marketing", "predictive dialing system", "data security", "consumer privacy", "autodialing system", 
    "call pattern analysis", "spam filtering system", "no-call list violation", "no-consent marketing", "robocall blocker", 
    "voice call regulation", "automated dialing system", "internet-based calling", "sms spam", "phone scam", 
    "blocking robocalls", "tcpa lawsuit", "call not answered", "messaging system", "calling patterns", 
    "automated message delivery", "consent violation", "caller information", "fraudulent calls", "digital marketing", 
    "call center regulation", "unsolicited advertisement", "contact information", "call overload", "consumer alerts", 
    "phone solicitation", "outbound marketing", "consumer complaint resolution", "marketing strategy", "call interruption", 
    "call rejection", "data privacy", "marketing guidelines", "call recording compliance", "tcpa fine structure", 
    "no-contact policy", "invalid call", "caller database", "digital solicitation", "marketing violations", "voice-based marketing", 
    "regulatory violation", "compliance violations", "call rights", "repeated messages", "illegal message", 
    "unsolicited marketing text", "automated marketing", "robocall legislation", "violating calls", "phone-based scams", 
    "blocking unsolicited calls", "marketing fraud", "automated telemarketing", "outbound dialing", "call duration", 
    "unsolicited communication", "telemarketing disclosure", "do-not-call regulation", "compliance tracking", "repeat solicitation", 
    "telecommunication violation", "persistent solicitation", "regulatory fines", "phone solicitation compliance", "spam messaging", 
    "call violations", "automated sales calls", "voicemail spam", "call misrepresentation", "robocall prevention", 
    "telemarketer harassment", "message misdirection", "non-consent solicitation", "message overload", "phone solicitation law", 
    "text communication harassment", "call center regulation compliance", "marketing excess", "automated marketing violation"
]

# Function to extract TCPA-related keywords from complaints
def find_tcp_keywords(complaint, keywords):
    complaint_lower = complaint.lower()
    matches = [keyword for keyword in keywords if re.search(r'\b' + re.escape(keyword) + r'\b', complaint_lower)]
    return matches

# Apply function to extract keywords from complaints
df['tcp_keywords_found'] = df['complaint_description'].apply(lambda x: find_tcp_keywords(x, tcp_keywords))

# Prepare the data for FPGrowth
# Convert complaints into transactions where each complaint is a set of TCPA-related keywords
transactions = df['tcp_keywords_found'].apply(lambda x: x if x else None).dropna().tolist()

# Transaction Encoding for FPGrowth
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

# Apply FPGrowth to find frequent itemsets
frequent_itemsets = fpgrowth(df_transactions, min_support=0.1, use_colnames=True)

# Display frequent itemsets
print(frequent_itemsets)


    support            itemsets
0  0.333333  (unsolicited call)
1  0.333333    (automated call)
2  0.333333       (do-not-call)


In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Tokenize the complaint descriptions
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['complaint_description'])
X = tokenizer.texts_to_sequences(data['complaint_description'])
X_pad = pad_sequences(X, maxlen=100)

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=64, input_length=100))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
model.fit(X_pad, labels, epochs=10, batch_size=64)

# Anomaly detection (predictions) on complaint data
predictions = model.predict(X_pad)
data['lstm_anomaly'] = predictions > 0.5


In [122]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from sklearn.ensemble import IsolationForest
import pandas as pd

# Sample customer complaints data
data = {'complaint_description': ["I received an unsolicited call about a new credit card offer.",
                                  "The automated call was about marketing, and I did not opt in.",
                                  "I got repeated robocalls from a telemarketer.",
                                  "They called me despite being on the do-not-call list."]}
df = pd.DataFrame(data)

# List of TCPA-related keywords
tcp_keywords = [
    "robocall", "telemarketing", "automated call", "unsolicited call", "do-not-call", "spam", 
    "call blocking", "customer consent", "marketing call", "opt-out", "telemarketing list", "call center", 
    "caller ID", "illegal calls", "call volume", "call recording", "repeated calls", "call tracking", 
    "telephone solicitation", "unwanted message", "ringless voicemail", "artificial voice", "pre-recorded message", 
    "contact center", "privacy violation", "unwanted solicitation", "repeated messages", "text messages", "text spam", 
    "telemarketing campaign", "opt-in", "robocall violation", "tcpa fine", "telemarketing regulations", "consumer rights", 
    "marketing database", "cell phone spam", "call harassment", "do-not-call list violation", "tcpa compliance", 
    "call harassment", "call recording laws", "call abandonment", "message spam", "legal call", "outbound call", 
    "automated system", "no-call list", "consumer protection", "pre-recorded voicemail", "ringless voicemail campaign", 
    "spam calls", "voice message spam", "violent call behavior", "do-not-call registry", "prerecorded marketing", 
    "inbound marketing", "call interruption", "commercial call", "cold call", "legal obligations", "communication consent", 
    "call frequency", "consumer complaint", "call tracking tool", "caller harassment", "junk text", "unsolicited fax", 
    "caller script", "telemarketing harassment", "predictive dialing", "call suppression", "marketing compliance", 
    "unsolicited marketing", "call monitoring", "contact violation", "calling system", "number portability", 
    "spam filtering", "do-not-call registry list", "privacy breach", "phone bill scams", "voice message interception", 
    "consumer notification", "repetitive dialing", "contact management", "spam report", "marketing automation", 
    "data protection", "telemarketing sales", "scam calls", "consumer fraud", "do-not-disturb violation", "marketing policy", 
    "call security", "privacy law", "tcpa complaint", "regulatory compliance", "marketing scam", "call automation", 
    "harassing calls", "privacy rights violation", "telemarketing text", "recorded message", "outbound telemarketer", 
    "automated outbound call", "call center monitoring", "call scheduling", "sales calls", "voice spam", 
    "do-not-call protection", "telemarketing fraud", "tcpa penalty", "call harassment regulation", "marketing laws", 
    "phone marketing", "predictive dialing system", "data security", "consumer privacy", "autodialing system", 
    "call pattern analysis", "spam filtering system", "no-call list violation", "no-consent marketing", "robocall blocker", 
    "voice call regulation", "automated dialing system", "internet-based calling", "sms spam", "phone scam", 
    "blocking robocalls", "tcpa lawsuit", "call not answered", "messaging system", "calling patterns", 
    "automated message delivery", "consent violation", "caller information", "fraudulent calls", "digital marketing", 
    "call center regulation", "unsolicited advertisement", "contact information", "call overload", "consumer alerts", 
    "phone solicitation", "outbound marketing", "consumer complaint resolution", "marketing strategy", "call interruption", 
    "call rejection", "data privacy", "marketing guidelines", "call recording compliance", "tcpa fine structure", 
    "no-contact policy", "invalid call", "caller database", "digital solicitation", "marketing violations", "voice-based marketing", 
    "regulatory violation", "compliance violations", "call rights", "repeated messages", "illegal message", 
    "unsolicited marketing text", "automated marketing", "robocall legislation", "violating calls", "phone-based scams", 
    "blocking unsolicited calls", "marketing fraud", "automated telemarketing", "outbound dialing", "call duration", 
    "unsolicited communication", "telemarketing disclosure", "do-not-call regulation", "compliance tracking", "repeat solicitation", 
    "telecommunication violation", "persistent solicitation", "regulatory fines", "phone solicitation compliance", "spam messaging", 
    "call violations", "automated sales calls", "voicemail spam", "call misrepresentation", "robocall prevention", 
    "telemarketer harassment", "message misdirection", "non-consent solicitation", "message overload", "phone solicitation law", 
    "text communication harassment", "call center regulation compliance", "marketing excess", "automated marketing violation"
]

# Function to extract TCPA-related keywords from complaints
def find_tcp_keywords(complaint, keywords):
    complaint_lower = complaint.lower()
    matches = [keyword for keyword in keywords if re.search(r'\b' + re.escape(keyword) + r'\b', complaint_lower)]
    return matches

# Apply function to extract keywords from complaints
df['tcp_keywords_found'] = df['complaint_description'].apply(lambda x: find_tcp_keywords(x, tcp_keywords))

# Prepare the data for FPGrowth
transactions = df['tcp_keywords_found'].apply(lambda x: x if x else None).dropna().tolist()

# Transaction Encoding for FPGrowth
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_transactions = pd.DataFrame(te_ary, columns=te.columns_)

# Apply FPGrowth to find frequent itemsets
frequent_itemsets = fpgrowth(df_transactions, min_support=0.1, use_colnames=True)

# Extract itemsets for use as features
frequent_itemsets_list = frequent_itemsets['itemsets'].apply(lambda x: list(x)).tolist()

# Create a feature matrix based on frequent itemsets
def create_feature_matrix(df, frequent_itemsets_list, te_columns):
    # Initialize binary feature columns for frequent itemsets
    for idx, itemset in enumerate(frequent_itemsets_list):
        feature_name = f"pattern_{idx}"
        df[feature_name] = df['tcp_keywords_found'].apply(lambda x: 1 if any(keyword in x for keyword in itemset) else 0)
    return df

# Create the feature matrix
df = create_feature_matrix(df, frequent_itemsets_list, te.columns_)

# Now, df contains the binary features for the frequent itemsets
df_features = df.drop(columns=['complaint_description', 'tcp_keywords_found'])

print(df_features)


   pattern_0  pattern_1  pattern_2
0          1          0          0
1          0          1          0
2          0          0          0
3          0          0          1


In [123]:
# Install the pandas_profiling library if not already installed
# !pip install pandas-profiling

import pandas as pd
from pandas_profiling import ProfileReport

# Assuming your dataframe is 'df', which contains both int and categorical variables

# Example: Creating a sample dataframe (Replace with your actual dataframe)
data = {
    'Complaint': ['Robocall', 'Spam Call', 'Do Not Call', 'Repeated Calls', 'Unsolicited'],
    'Complaint_Length': [100, 80, 120, 200, 150],  # Numerical Feature
    'Call_Type': ['Robocall', 'Manual', 'Robocall', 'Spam', 'Robocall'],  # Categorical Feature
    'Customer_Rating': [1, 2, 1, 3, 2],  # Numerical Feature
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)

# Create the ProfileReport for the DataFrame
profile = ProfileReport(df, title="TCPA Violation Complaint Profiling", explorative=True)

# Save the report to an HTML file
profile.to_file("tcp_violations_complaint_report.html")

# To view the report inline (in Jupyter Notebook or IPython)
profile.to_notebook_iframe()


ModuleNotFoundError: No module named 'pandas_profiling'