In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf

# NOTE(review): this cell repeats, in one piece, the same steps that the
# individual cells further down perform one at a time.

# Load the complaints and drop rows with a missing narrative or label:
# TfidfVectorizer raises on NaN documents, so such rows cannot be used anyway.
df = pd.read_csv("complaints.csv").dropna(subset=["text", "issue"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["issue"], test_size=0.2, random_state=42)

# ---------------------------------------------------------------- TF-IDF + logistic regression
# Fit the vocabulary / IDF weights on the training texts only, then reuse them for the test texts
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# max_iter is raised from the default of 100 because the default solver
# routinely stops before converging on high-dimensional sparse text features.
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

print("TF-IDF Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr_tfidf))
print(classification_report(y_test, y_pred_lr_tfidf))

# ---------------------------------------------------------------- Word2Vec + random forest
# Whitespace-tokenise the training texts once and reuse the result
train_sentences = X_train.str.split().tolist()

# gensim 4 renamed `size` to `vector_size`; fall back for older installations
try:
    w2v_model = Word2Vec(vector_size=100, min_count=1)
except TypeError:
    w2v_model = Word2Vec(size=100, min_count=1)
w2v_model.build_vocab(train_sentences)
w2v_model.train(train_sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)


def _mean_word_vectors(texts, wv):
    """Return one fixed-length row per document: the mean of its known word vectors.

    Tokens missing from the Word2Vec vocabulary are skipped; a document with
    no known token (or a non-string value) becomes an all-zero row.
    """
    dim = wv.vector_size
    rows = []
    for tokens in texts.str.split():
        known = [tok for tok in tokens if tok in wv] if isinstance(tokens, list) else []
        rows.append(wv[known].mean(axis=0) if known else [0.0] * dim)
    return pd.DataFrame(rows, index=texts.index)


X_train_w2v = _mean_word_vectors(X_train, w2v_model.wv)
X_test_w2v = _mean_word_vectors(X_test, w2v_model.wv)

# Seeded so that repeated runs build the same forest
rf_w2v = RandomForestClassifier(random_state=42)
rf_w2v.fit(X_train_w2v, y_train)
y_pred_rf_w2v = rf_w2v.predict(X_test_w2v)

print("Word2Vec Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf_w2v))
print(classification_report(y_test, y_pred_rf_w2v))

# ---------------------------------------------------------------- DistilBERT inputs
# Load the pretrained tokenizer and the TensorFlow encoder
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenise both splits, truncating and padding the sequences
X_train_bert = tokenizer(X_train.tolist(), truncation=True, padding=True)
X_test_bert = tokenizer(X_test.tolist(), truncation=True, padding=True)

# Turn every tokenizer output field into a TensorFlow tensor
X_train_bert = {key: tf.convert_to_tensor(val) for key, val in X_train_bert.items()}
X_test_bert = {key: tf.convert_to_tensor(val) for key, val in X_test_bert.items()}

# NOTE: this cell stops here; the DistilBERT forward pass is not performed in it.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf

In [None]:
# Load the complaints and drop rows with a missing narrative or label:
# TfidfVectorizer raises on NaN documents, so such rows cannot be used anyway.
df = pd.read_csv("complaints.csv").dropna(subset=["text", "issue"])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["issue"],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build the TF-IDF feature extractor, filtering scikit-learn's built-in English stop words
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
)

In [None]:
# Learn the vocabulary and IDF weights from the training texts only,
# then project the held-out texts with those same weights.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)


In [None]:
# Train a logistic regression model on the TF-IDF data.
# max_iter is raised from the default of 100 because the default solver
# routinely stops before converging on high-dimensional sparse text features.
lr_tfidf = LogisticRegression(max_iter=1000)
lr_tfidf.fit(X_train_tfidf, y_train)

# Predicted issue label for every held-out complaint
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)

In [None]:
# Report overall accuracy, then the per-class precision / recall / F1 table
lr_tfidf_accuracy = accuracy_score(y_test, y_pred_lr_tfidf)
print("TF-IDF Logistic Regression Accuracy:", lr_tfidf_accuracy)

lr_tfidf_report = classification_report(y_test, y_pred_lr_tfidf)
print(lr_tfidf_report)

In [None]:
# Train 100-dimensional Word2Vec embeddings on the training narratives.
# The texts are whitespace-tokenised once and the result is reused for both
# the vocabulary scan and the training pass.
train_sentences = X_train.str.split().tolist()

# gensim 4 renamed `size` to `vector_size`; fall back for older installations
try:
    w2v_model = Word2Vec(vector_size=100, min_count=1)
except TypeError:
    w2v_model = Word2Vec(size=100, min_count=1)

w2v_model.build_vocab(train_sentences)
w2v_model.train(train_sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

In [None]:
# Vectorize the training and testing data using Word2Vec.
# Each document becomes ONE fixed-length row (the mean of its word vectors) so
# that the result is a regular numeric matrix a classifier can be fitted on.
def _mean_word_vectors(texts, wv):
    """Return one fixed-length row per document: the mean of its known word vectors.

    Tokens missing from the Word2Vec vocabulary are skipped; a document with
    no known token (or a non-string value) becomes an all-zero row.
    """
    dim = wv.vector_size
    rows = []
    for tokens in texts.str.split():
        known = [tok for tok in tokens if tok in wv] if isinstance(tokens, list) else []
        rows.append(wv[known].mean(axis=0) if known else [0.0] * dim)
    return pd.DataFrame(rows, index=texts.index)


X_train_w2v = _mean_word_vectors(X_train, w2v_model.wv)
X_test_w2v = _mean_word_vectors(X_test, w2v_model.wv)

In [None]:
# Train a random forest model on the Word2Vec data.
# The forest is seeded (same seed as the train/test split) so that repeated
# runs of the notebook produce the same trees and the same scores.
rf_w2v = RandomForestClassifier(random_state=42)
rf_w2v.fit(X_train_w2v, y_train)

# Predicted issue label for every held-out complaint
y_pred_rf_w2v = rf_w2v.predict(X_test_w2v)

In [None]:
# Report overall accuracy, then the per-class precision / recall / F1 table
rf_w2v_accuracy = accuracy_score(y_test, y_pred_rf_w2v)
print("Word2Vec Random Forest Accuracy:", rf_w2v_accuracy)

rf_w2v_report = classification_report(y_test, y_pred_rf_w2v)
print(rf_w2v_report)

In [None]:
# Load the pretrained tokenizer and the TensorFlow encoder from the same checkpoint
BERT_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFDistilBertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Run the DistilBERT tokenizer over each split (training first, then testing),
# truncating and padding the sequences.
X_train_bert, X_test_bert = (
    tokenizer(texts.tolist(), truncation=True, padding=True)
    for texts in (X_train, X_test)
)

In [None]:
# Turn every field of the tokenizer output into a TensorFlow tensor
def _to_tensors(encoded):
    """Map each key of a tokenizer output to a TensorFlow tensor of its values."""
    return {name: tf.convert_to_tensor(values) for name, values in encoded.items()}


X_train_bert = _to_tensors(X_train_bert)
X_test_bert = _to_tensors(X_test_bert)

In [None]:
# Pass the tokenized data through the DistilBERT model.
# The forward pass runs in mini-batches: sending the whole dataset through the
# encoder in a single call needs memory proportional to the full corpus.
def _cls_embeddings(encoded, bert, batch_size=32):
    """Return the first-token hidden state of every sequence as a 2-D NumPy array.

    encoded    -- dict of equally long tensors produced by the tokenizer
    bert       -- the DistilBERT encoder to call
    batch_size -- number of sequences sent through the encoder at once
    """
    n_rows = next(iter(encoded.values())).shape[0]
    chunks = []
    for start in range(0, n_rows, batch_size):
        batch = {name: tensor[start:start + batch_size] for name, tensor in encoded.items()}
        # [0] is the last hidden state; position 0 along the sequence axis is kept
        chunks.append(bert(batch)[0][:, 0, :])
    return tf.concat(chunks, axis=0).numpy()


# NOTE: these assignments replace the tokenizer dicts with embedding arrays,
# so this cell cannot be re-run without first re-running the tokenization cells.
X_train_bert = _cls_embeddings(X_train_bert, model)
X_test_bert = _cls_embeddings(X_test_bert, model)

In [None]:
# Fit a linear SVM on the DistilBERT embeddings (fit() returns the estimator itself)
svm_bert = LinearSVC().fit(X_train_bert, y_train)

# Predicted issue label for every held-out complaint
y_pred_svm_bert = svm_bert.predict(X_test_bert)

In [None]:
# Report overall accuracy, then the per-class precision / recall / F1 table
svm_bert_accuracy = accuracy_score(y_test, y_pred_svm_bert)
print("DistilBERT Linear SVM Accuracy:", svm_bert_accuracy)

svm_bert_report = classification_report(y_test, y_pred_svm_bert)
print(svm_bert_report)

In [None]:
# Train a small feed-forward network on padded token-id sequences, with its
# embedding layer initialised from the Word2Vec vectors.
#
# A Keras network needs integer inputs and integer class ids, so the texts and
# labels are encoded here before fitting. The network is named `custom_model`
# so that it does not overwrite the DistilBERT `model` defined earlier.
tf.random.set_seed(42)  # repeatable weight initialisation

wv = w2v_model.wv
embedding_dim = wv.vector_size

# Token -> integer id. Id 0 is reserved for padding and for tokens without a
# Word2Vec vector. The index is built from the training tokens themselves so
# that the embedding matrix rows and the ids are guaranteed to line up.
train_tokens = [text.split() for text in X_train]
vocabulary = sorted({tok for toks in train_tokens for tok in toks if tok in wv})
word_index = {tok: idx for idx, tok in enumerate(vocabulary, start=1)}
max_len = max(len(toks) for toks in train_tokens)

# Row 0 (padding / unknown) is all zeros; row i holds the vector of vocabulary[i - 1]
embedding_matrix = tf.concat(
    [
        tf.zeros((1, embedding_dim), dtype=tf.float32),
        tf.constant(wv[vocabulary], dtype=tf.float32),
    ],
    axis=0,
).numpy()


def _encode_texts(texts):
    """Convert texts to an int32 tensor of token ids, truncated/zero-padded to max_len."""
    rows = []
    for text in texts:
        ids = [word_index.get(tok, 0) for tok in text.split()[:max_len]]
        rows.append(ids + [0] * (max_len - len(ids)))
    return tf.constant(rows, dtype=tf.int32)


# Class label -> integer id, in the order of df["issue"].unique(); the same
# order is used below to map predicted ids back to labels.
classes = df["issue"].unique()
class_index = {label: idx for idx, label in enumerate(classes)}
y_train_ids = y_train.map(class_index).to_numpy()

embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
)
custom_model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(max_len,), dtype="int32"),
    embedding_layer,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=128, activation="relu"),
    tf.keras.layers.Dense(units=64, activation="relu"),
    tf.keras.layers.Dense(units=len(classes), activation="softmax")
])
custom_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
custom_model.fit(_encode_texts(X_train), y_train_ids, epochs=10, batch_size=32)

# Most probable class id per test document, mapped back to its label
y_pred_custom = custom_model.predict(_encode_texts(X_test))
y_pred_custom = [classes[i] for i in y_pred_custom.argmax(axis=1)]

In [None]:
# Report overall accuracy, then the per-class precision / recall / F1 table
custom_accuracy = accuracy_score(y_test, y_pred_custom)
print("Custom Embedding Layer Accuracy:", custom_accuracy)

custom_report = classification_report(y_test, y_pred_custom)
print(custom_report)

In [None]:
#This code creates four different machine learning models for predicting the issue based on consumer 
#complaint narratives using different vectorization methods. 

#First, the code uses a TF-IDF vectorizer to convert the training and testing data into TF-IDF vectors. 
#It then trains a logistic regression model on the TF-IDF data and prints the accuracy and classification report 
#for the model.

#Next, the code uses Word2Vec to create embeddings for the training and testing data. It then trains a random 
#forest model on the Word2Vec data and prints the accuracy and classification report for the model.

#The code also uses DistilBERT to tokenize the training and testing data and passes the tokenized data through 
#the model to obtain embeddings. It then trains a linear SVM model on the DistilBERT embeddings and prints 
#the accuracy and classification report for the model.

#Finally, the code uses a custom embedding layer in a TensorFlow model to convert the text data into embeddings. 
#It then trains the model on the training data and prints the accuracy and classification report for the model.