In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
import torch

In [3]:
from sklearn.decomposition import PCA
import umap

In [4]:
df = pd.read_csv(
    "/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/processed/processed_reviews.csv"
)

In [5]:
df.head()

Unnamed: 0,rating,review_length_text,review_length_title,polarity_text,polarity_title,subjectivity_text,subjectivity_title,lemmatized_text,lemmatized_title,day_of_week,month,year
0,1,408,2,0.018448,-1.0,0.512241,1.0,"['travel', 'lot', 'travel', 'often', 'last', '...","['bad', 'airline']",Wednesday,11,2024
1,1,157,4,-0.060897,-1.0,0.592949,1.0,"['review', 'regard', 'flight', 'af', 'book', '...","['terrible', 'experience', 'airfrance']",Wednesday,11,2024
2,1,259,11,-0.094163,-0.6,0.488287,0.7,"['recently', 'fly', 'air', 'france', 'flight',...","['extremely', 'disappointing', 'experience', '...",Tuesday,11,2024
3,1,274,1,-0.109373,-1.0,0.494012,1.0,"['wow', 'horrible', 'experience', 'I', 've', '...",['horrible'],Monday,11,2024
4,1,311,7,-0.126476,-1.0,0.485192,1.0,"['spend', 'fantastic', 'day', 'vacation', 'hon...","['bad', 'flight', 'experience', 'I', 've', 'ev...",Monday,11,2024


In [6]:
bow_vectorizer = CountVectorizer(max_features=10000, min_df=5, stop_words="english")
X_bow = bow_vectorizer.fit_transform(df["lemmatized_text"].astype(str))

df_bow = pd.DataFrame(X_bow.toarray(), columns=bow_vectorizer.get_feature_names_out())

with open('/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/bow_vectorized.pkl', 'wb') as f:
    pickle.dump(df_bow, f)

df_bow.head()

Unnamed: 0,ability,able,aboard,absolute,absolutely,ac,accent,accept,acceptable,access,...,yes,yesterday,yogurt,york,young,yr,yvr,zero,zone,zurich
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000, 
    stop_words="english", 
    ngram_range=(1,2), 
    min_df=5, 
)

# Fit and transform lemmatized text
X_tfidf = tfidf_vectorizer.fit_transform(df['lemmatized_text'].astype(str))
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Save TF-IDF matrix
with open('/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/tfidf_vectorized.pkl', 'wb') as f:
    pickle.dump(df_tfidf, f)


df_tfidf.head()

Unnamed: 0,ability,able,able check,able fly,able sleep,able stretch,aboard,absolute,absolutely,ac,...,yesterday,yogurt,york,young,yr,yr old,yvr,zero,zone,zurich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
tokenized_reviews = [text.split() for text in df["lemmatized_text"].astype(str)]

word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4
)

def get_word2vec_embeddings(text):
    words = text.split()
    vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

df['word2vec_embedding'] = df['lemmatized_text'].apply(get_word2vec_embeddings)

# Convert embeddings to a list (needed for pickle)
word2vec_embeddings = df['word2vec_embedding'].tolist()

# Save Word2Vec embeddings
word2vec_path = '/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/word2vec_vectorized.pkl'
with open(word2vec_path, 'wb') as f:
    pickle.dump(word2vec_embeddings, f)

print(f"Word2Vec embeddings successfully saved to {word2vec_path}")

df[['word2vec_embedding']].head()

Word2Vec embeddings successfully saved to /home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/word2vec_vectorized.pkl


Unnamed: 0,word2vec_embedding
0,"[-0.26565692, 0.15300937, 0.023394726, -0.0995..."
1,"[-0.14890291, 0.28530204, 0.0012725089, -0.055..."
2,"[-0.1506756, 0.27368864, 0.015880648, 0.044413..."
3,"[-0.1706126, 0.2824239, 0.013168015, -0.061199..."
4,"[-0.17064783, 0.16075645, 0.0037159673, -0.000..."


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)

def get_bert_embeddings(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        output = bert_model(**tokens)
    return output.last_hidden_state[:,0,:].cpu().numpy().flatten()

df['bert_embedding'] = df['lemmatized_text'].apply(get_bert_embeddings)

with open('/home/azureuser/cloudfiles/code/Users/oskar.wolf/nlp_air_france_reviews/data/interim/bert_vectorized.pkl', 'wb') as f:
    pickle.dump(df['bert_embedding'].tolist(), f)
    
print("BERT embeddings successfully saved to bert_vectorized.pkl")

df[['bert_embedding']].head()

BERT embeddings successfully saved to bert_vectorized.pkl


Unnamed: 0,bert_embedding
0,"[-0.059372894, 0.87842745, 0.42169085, 0.00433..."
1,"[-0.22069171, 0.65713084, 0.33688197, -0.20024..."
2,"[-0.28149357, 0.7224497, 0.23585524, -0.108056..."
3,"[-0.10413745, 0.7803177, 0.3849645, -0.0113108..."
4,"[-0.17919672, 0.6800731, 0.50333893, -0.045008..."
