In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw complaints table from the project-relative data folder.
complaints_csv = './data/complaints.csv'
data = pd.read_csv(complaints_csv)

# Show the first rows as the cell output for a quick sanity check.
data.head()

In [None]:
# Replace every run of capital X's (optionally broken up by whitespace, e.g.
# "XXXX XXXX") with a single space.
# NOTE(review): these runs are presumably redaction placeholders in the
# source data -- confirm against the dataset description.
narrative_column = 'Consumer complaint narrative'
x_run_pattern = r'\bX+[\sX]*\b'
data[narrative_column] = data[narrative_column].str.replace(
    x_run_pattern,
    ' ',
    regex=True,
)

In [None]:
# Encode the 'Issue' label as an integer: convert the column to a pandas
# categorical and store each row's category code in 'Issue_Code'.
issue_as_category = data['Issue'].astype('category')
data['Issue_Code'] = issue_as_category.cat.codes

In [None]:
# Give the narrative column the shorter name 'text' used by the later cells.
column_renames = {'Consumer complaint narrative': 'text'}
data = data.rename(columns=column_renames)

In [None]:
# Target: the integer-encoded issue. Features: a one-column frame with the text.
y = data['Issue_Code']
X = data[['text']]

# Stratify on the target so both partitions keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=321,
)

In [None]:
from collections import Counter
from scipy import sparse
from scipy.sparse.linalg import svds
import gensim

In [None]:
# Build the training vocabulary and the windowed co-occurrence counts.
#
# Both counters are filled in one pass over the training text so every review
# is tokenized only once.

# Number of neighbouring tokens considered on EACH side of the centre word.
window_size = 2

# token -> number of occurrences in the training text
review_counter = Counter()
# (word, other_word) -> number of times other_word appears in word's window
cooccurrence_counter = Counter()

for review in X_train['text']:
    # First, tokenize the sentence
    sentence = gensim.utils.simple_preprocess(review)
    review_counter.update(sentence)

    # Then, we'll build the window around each word: up to `window_size`
    # tokens before it and up to `window_size` tokens after it (the word
    # itself is excluded). max(0, ...) stops the left edge from wrapping
    # around to the end of the list via a negative slice index.
    for i, word in enumerate(sentence):
        left_context = sentence[max(0, i - window_size): i]
        right_context = sentence[i + 1: i + 1 + window_size]
        window = left_context + right_context

        # Then, we'll up the counter value for that pair
        for other_word in window:
            cooccurrence_counter[(word, other_word)] += 1

# Counter preserves first-seen order, so each token gets a stable integer id;
# index_word is the exact inverse mapping of word_index.
word_index = {word: i for i, word in enumerate(review_counter.keys())}
index_word = {i: word for word, i in word_index.items()}

# NOTE(review): the lines below would add each word's own frequency on the
# diagonal; they are left disabled as in the original -- confirm whether the
# diagonal should be populated.
# for word in review_counter.keys():
#     cooccurrence_counter[(word, word)] += review_counter[word]

In [None]:
# Turn the pair counts into a sparse matrix and factorize it with a truncated
# SVD to obtain dense word vectors.

# COO-style triplets: counts[k] belongs at (row_idx[k], col_idx[k]).
row_idx = []
col_idx = []
counts = []

for (word1, word2), pair_count in cooccurrence_counter.items():
    row_idx.append(word_index[word1])
    col_idx.append(word_index[word2])
    counts.append(pair_count)

# Pass the shape explicitly. Without it the matrix is sized from the largest
# index that actually occurs in a pair, so vocabulary words that never
# co-occur with anything (and sit at the end of word_index) would be dropped
# and the rows of word_vectors would no longer line up with word_index.
vocab_size = len(word_index)
cooccurrence_matrix = sparse.csc_matrix(
    (counts, (row_idx, col_idx)),
    shape=(vocab_size, vocab_size),
    dtype='float',
)

# Number of singular values/vectors to keep, i.e. the word-vector length.
dimension = 50

# svds returns (u, s, vh): V therefore holds the right singular vectors as
# rows (already transposed), and D is a 1-D array of singular values.
U, D, V = svds(cooccurrence_matrix, k=dimension)

# Scale each left singular vector (column of U) by its singular value;
# row r of word_vectors is the vector for the word with index r.
word_vectors = U * D