In [None]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Build the English stop-word set that `preprocess` filters n-grams with.
# The result keeps the name `stopwords` because `preprocess` reads that global,
# but the NLTK corpus reader is imported under its own alias here: the original
# statement overwrote the imported reader with a set, so re-running the cell
# failed with "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as stopwords_corpus
stopwords = set(stopwords_corpus.words('english'))

In [None]:
# Load the e-commerce review dataset from a CSV file in the working directory.
df=pd.read_csv('EcomReviews_8k.csv')

In [None]:
# Show the column labels of the loaded DataFrame.
df.columns

In [None]:
# Display one randomly chosen row to see what a record looks like.
df.sample(1)

In [None]:
# Count how many rows carry each distinct value of the 'labels' column.
df['labels'].value_counts()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Stemmer and lemmatizer instances kept available for optional normalisation
# experiments; the active code path of `preprocess` does not call either of them.
stemmer    = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess(text, n=3, sep=' ', stop_words=None):
    """Turn raw text into stop-word-free word n-grams.

    The text is lower-cased, split into word tokens with the regex ``\\w+``,
    and every run of ``n`` consecutive tokens is joined with ``sep``. An
    n-gram is dropped as soon as any one of its tokens is a stop word.

    Parameters
    ----------
    text : str
        Raw document. Anything that is not a string (e.g. ``None`` or the
        ``NaN`` pandas uses for a missing cell) yields an empty list instead
        of raising ``AttributeError``.
    n : int, default 3
        Number of tokens per n-gram.
    sep : str, default ' '
        Separator placed between the tokens of one n-gram.
    stop_words : collection of str, optional
        Tokens that disqualify an n-gram. Defaults to the notebook-level
        ``stopwords`` set, which is what the function always used before.

    Returns
    -------
    list of str
        The surviving n-grams in document order; empty when the text has
        fewer than ``n`` tokens.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    if not isinstance(text, str):
        return []
    if stop_words is None:
        # Resolved at call time so the default keeps following the global set.
        stop_words = stopwords

    tokens = re.findall(r'\w+', text.lower())
    # Zipping n progressively shifted views of the token list yields every
    # window of n consecutive tokens; zip stops at the shortest view, so no
    # partial window is produced at the end.
    windows = zip(*(tokens[offset:] for offset in range(n)))
    return [sep.join(window) for window in windows
            if not any(token in stop_words for token in window)]

In [None]:
# Tokenise every review into stop-word-free n-grams (see `preprocess`).
df['tokens'] = df['text'].apply(preprocess)

In [None]:
# Tokenise each review with gensim's built-in cleaner and keep the resulting
# token list in a new column.
df['text_clean'] = df['text'].apply(gensim.utils.simple_preprocess)
df.head()

In [None]:
# Encode the label column as integers: '__label__2' -> 1 and '__label__1' -> 0.
# `replace` leaves values that are already encoded untouched, so the cell can be re-run.
df['labels']=df['labels'].replace({'__label__2':1,'__label__1':0})
# Split data into train and test sets (80% / 20%).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts; without it each run evaluated a different split.
X_train, X_test, y_train, y_test = train_test_split(
    df['text_clean'], df['labels'], test_size=0.2, random_state=42)

In [None]:
# Inspect the 'labels' column after the encoding step.
df['labels']

In [None]:
# Train the word2vec model from scratch on the training reviews only.
# seed + workers=1 make training repeatable: with several worker threads the
# order in which examples are processed varies between runs, so the learned
# vectors (and every score computed from them) would differ on each execution.
# NOTE: repeatability across interpreter launches additionally requires a fixed
# PYTHONHASHSEED environment variable (see the gensim Word2Vec documentation).
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,  # dimensionality of each word vector
                                   window=5,         # context window size
                                   min_count=2,      # ignore words seen fewer than 2 times
                                   sg=0,             # 0 selects CBOW
                                   seed=42,
                                   workers=1)

In [None]:
# Vocabulary size of the trained model: index_to_key holds the vocabulary
# entries of w2v_model.wv, so its length is the number of words with a vector.
len(w2v_model.wv.index_to_key)

In [None]:
# w2v_model.wv['light']

In [None]:
# Vocabulary of the trained model as a set, for fast membership tests.
words = set(w2v_model.wv.index_to_key)
from tensorflow.keras.preprocessing.sequence import pad_sequences

# X_train and X_test hold one token list per review.
# Each in-vocabulary token is replaced by its word vector (unknown tokens are
# skipped), then pad_sequences appends zeros after the real vectors so that
# every review in the resulting array has the same number of rows.
X_train_vect = pad_sequences(
    [[w2v_model.wv[token] for token in review if token in words] for review in X_train],
    dtype='float32',
    padding='post',
)
X_test_vect = pad_sequences(
    [[w2v_model.wv[token] for token in review if token in words] for review in X_test],
    dtype='float32',
    padding='post',
)


In [None]:
# Shape of the first padded test review
# (presumably (padded sequence length, embedding size) - confirm from the output).
X_test_vect[0].shape

In [None]:
# Why is the length of the sentence different than the length of the sentence vector?
# Tokens missing from the model vocabulary were skipped during the lookup, and
# pad_sequences then padded every review to one common length.
# Only the first few rows are printed: one line per training review would flood
# the notebook output without adding information.
for i, v in enumerate(X_train_vect[:5]):
    print(len(X_train.iloc[i]), len(v))

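With word embeddings such as Word2Vec or GloVe, each word in a sentence is represented by a fixed-length vector. In the cell above, two things make `len(v)` differ from the number of tokens in the original sentence: tokens that are not in the model's vocabulary are dropped before the lookup, and `pad_sequences(..., padding='post')` then pads every sentence with zero vectors up to the length of the longest one, so every entry of `X_train_vect` has the same number of rows. Later, when these word vectors are combined to represent a whole sentence, they might be aggregated in various ways (e.g., averaging, summing, or concatenating), resulting in a single vector for the entire sentence; the length of that sentence vector depends on the embedding dimensionality and, again, will not be the same as the number of words in the sentence.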

In [None]:
# Relationship between the original sentences and their vector representations.
# Preview only the first few reviews: the original loop printed every training
# review together with its full padded matrix, and unpacked two zip variables
# it never used.
for i, vector in enumerate(X_train_vect[:3]):
    print("Original Sentence:", X_train.iloc[i])
    print("Sentence Vector:", vector)
    print("Lengths:", len(X_train.iloc[i]), len(vector))

In [None]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence.
#
# X_train_vect / X_test_vect come out of pad_sequences(..., padding='post'), so every
# review is followed by all-zero padding rows up to the longest review. Averaging
# over all rows would divide by the padded length instead of the number of words,
# shrinking the vectors of short reviews towards zero. Only rows with at least one
# non-zero component (the real word vectors) are therefore averaged.
#
# A review with no in-vocabulary word consists of padding only; it gets a zero
# vector whose length is read from the data (v.shape[-1]) rather than hard-coded.
X_train_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_train_vect
]

X_test_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_test_vect
]

##### Computing sentence vectors by averaging the word vectors of the words contained in each sentence
- X_train_vect_avg and X_test_vect_avg are initialized as empty lists to store the averaged sentence vectors for the training and test datasets, respectively.
- The code iterates over each sentence vector in X_train_vect and X_test_vect. For each sentence vector v, it checks if v is not empty (i.e., it contains at least one word vector). If v is not empty, it computes the mean (average) of the word vectors along the first axis (axis 0), which corresponds to averaging the word vectors for each dimension. The resulting mean vector represents the averaged sentence vector for the current sentence. If v is empty (i.e., all word vectors are zero vectors), it appends a zero vector of the same dimensionality (100 in this case) to the list as a placeholder.
- After iterating over all sentence vectors, X_train_vect_avg and X_test_vect_avg contain the averaged sentence vectors for the training and test datasets, respectively.
- This approach of averaging word vectors to obtain sentence vectors is a common technique in natural language processing (NLP) tasks. It allows you to capture the overall semantic meaning of a sentence based on the meanings of its constituent words.

- The np.zeros(100, dtype=float) part in the code creates a zero vector of length 100, which matches the dimensionality of the word vectors. This zero vector is used as a placeholder for sentences with no words or out-of-vocabulary words, ensuring that all sentence vectors have the same dimensionality.

In [None]:
# Are our sentence vector lengths consistent?
# Show a few (sentence length, vector length) pairs instead of one line per
# training review, then list every distinct vector length that occurs:
# a single value means all sentence vectors have the same dimensionality.
for i, v in enumerate(X_train_vect_avg[:5]):
    print(len(X_train.iloc[i]), len(v))
print(sorted({len(v) for v in X_train_vect_avg}))

In [None]:
# Labelled version of the length comparison, limited to the first few reviews
# so the output stays readable (the original printed two lines per training
# review and carried an unused enumerate index).
for sentence, vector in zip(X_train.iloc[:5], X_train_vect_avg[:5]):
    print("Original Sentence Length:", len(sentence))
    print("Averaged Sentence Vector Length:", len(vector))


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the averaged sentence vectors;
# fit() returns the estimator itself, so construction and training are chained.
regressor = LogisticRegression(max_iter=400).fit(X_train_vect_avg, y_train.values)

In [None]:
# Training labels as a NumPy array, the form in which they were passed to fit().
y_train.values

In [None]:
# Use the trained model to make predictions on the test data
# (one predicted label per averaged test sentence vector).
y_pred = regressor.predict(X_test_vect_avg)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Score the predictions against the held-out labels.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test reviews whose prediction matches the true label.
accuracy = (y_pred==y_test).sum()/len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round(accuracy, 3),
))

In [None]:
# Show the held-out labels the predictions were scored against.
y_test

In [None]:
# Ask the trained embeddings which of the four words fits least with the others.
# NOTE(review): the model was trained only on the review text, so some of these
# words may not be in its vocabulary - confirm the result is meaningful.
w2v_model.wv.doesnt_match(['breakfast','cereal','dinner','lunch'])

In [None]:
# Analogy probe: the 10 words most similar to 'woman' + 'king' - 'man'.
# NOTE(review): the model was trained only on the review text with min_count=2,
# so these words may be missing from its vocabulary, in which case the lookup
# would presumably fail - confirm before relying on this cell in a Run All.
w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=10)

### GloVe (pretrained vectors)

In [None]:
# Load pretrained GloVe vectors from a plain-text file into a KeyedVectors object.
# no_header=True is passed because the file presumably lacks the word2vec-style
# first line (vocabulary size and dimensionality) - confirm against glove.txt.
model = gensim.models.KeyedVectors.load_word2vec_format('glove.txt', binary=False,no_header=True)

In [None]:
# Sanity check: look up the embedding stored for a single word.
model.get_vector('light')

In [None]:
# Number of words in the loaded GloVe vocabulary.
len(model.index_to_key)

In [None]:
# Vocabulary of the loaded GloVe vectors as a set, for fast membership tests.
words = set(model.index_to_key)
# Replace every in-vocabulary token of each review by its GloVe vector (tokens
# without a vector are skipped), then zero-pad after the real vectors so all
# reviews share one length.
X_train_vect = pad_sequences(
    [[model[token] for token in review if token in words] for review in X_train],
    dtype='float32',
    padding='post',
)
X_test_vect = pad_sequences(
    [[model[token] for token in review if token in words] for review in X_test],
    dtype='float32',
    padding='post',
)

In [None]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence.
#
# The inputs were zero-padded by pad_sequences(..., padding='post'), so averaging
# over all rows would divide by the padded length instead of the number of words.
# Only rows with at least one non-zero component (the real word vectors) are
# averaged. A review made of padding only gets a zero vector whose length is
# read from the data (v.shape[-1]) rather than hard-coded.
X_train_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_train_vect
]

X_test_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_test_vect
]

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the GloVe-based sentence vectors;
# fit() returns the estimator itself, so construction and training are chained.
regressor = LogisticRegression(max_iter=10000).fit(X_train_vect_avg, y_train.values)

In [None]:
# Use the trained model to make predictions on the test data
# (one predicted label per averaged test sentence vector).
y_pred = regressor.predict(X_test_vect_avg)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Score the predictions against the held-out labels.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test reviews whose prediction matches the true label.
accuracy = (y_pred==y_test).sum()/len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round(accuracy, 3),
))

#### Word2Vec (pretrained Google News vectors)

In [None]:
# Location of the pretrained vectors file, relative to the working directory.
file_path = "GoogleNews-vectors-negative300.bin"

# Load into gensim (binary=True: the file is read in the binary word2vec format)
w2vec = gensim.models.KeyedVectors.load_word2vec_format(file_path, binary=True)

In [None]:
# Vocabulary of the pretrained vectors as a set, for fast membership tests.
words = set(w2vec.index_to_key)
# Replace every in-vocabulary token of each review by its pretrained vector
# (tokens without a vector are skipped), then zero-pad after the real vectors
# so all reviews share one length.
X_train_vect = pad_sequences(
    [[w2vec[token] for token in review if token in words] for review in X_train],
    dtype='float32',
    padding='post',
)
X_test_vect = pad_sequences(
    [[w2vec[token] for token in review if token in words] for review in X_test],
    dtype='float32',
    padding='post',
)

In [None]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence.
#
# The inputs were zero-padded by pad_sequences(..., padding='post'), so averaging
# over all rows would divide by the padded length instead of the number of words.
# Only rows with at least one non-zero component (the real word vectors) are
# averaged. A review made of padding only gets a zero vector whose length is
# read from the data (v.shape[-1]); the previous hard-coded 100 was copied from
# the 100-dimensional model trained earlier and is not tied to these vectors.
X_train_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_train_vect
]

X_test_vect_avg = [
    v[np.any(v != 0, axis=-1)].mean(axis=0) if np.any(v) else np.zeros(v.shape[-1], dtype=float)
    for v in X_test_vect
]

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the sentence vectors built from the
# pretrained embeddings; fit() returns the estimator itself, so construction
# and training are chained.
regressor = LogisticRegression(max_iter=10000).fit(X_train_vect_avg, y_train.values)

In [None]:
# Use the trained model to make predictions on the test data
# (one predicted label per averaged test sentence vector).
y_pred = regressor.predict(X_test_vect_avg)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Score the predictions against the held-out labels.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test reviews whose prediction matches the true label.
accuracy = (y_pred==y_test).sum()/len(y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3),
    round(recall, 3),
    round(accuracy, 3),
))