#### Import all dependencies

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/suonieo1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/suonieo1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Read the data from the file. Rename the columns to "Text" and "Sentiment".

In [97]:
df = pd.read_csv('data.csv', delimiter=',', encoding='latin-1', header=None)
df = df.rename(columns=lambda x: ['Sentiment', 'Text'][x])
df = df[['Text', 'Sentiment']]

#### Create the preprocessing function. Here we tokenize the text, remove all punctuations and convert them to lowercase. Then we remove the stopwords and stem the words.
#### Finally we join the words back into a single string and return the processed text.

In [98]:
def preprocess_text(text):
    # Tokenization (split the text into words)
    words = nltk.word_tokenize(text)
    
    # Remove punctuation and convert to lowercase
    words = [word.lower() for word in words if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Stemming (reducing words to their root form)
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    # Join the cleaned words back into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

#### We apply the preprocessing to each text in the data, so the 'Text' column contains preprocessed text

In [99]:
df['Text'] = df['Text'].apply(preprocess_text)
print(df.head())

                                                Text Sentiment
0  accord gran compani plan move product russia a...   neutral
1  technopoli plan develop stage area less squar ...   neutral
2  intern electron industri compani elcoteq laid ...  negative
3  new product plant compani would increas capac ...  positive
4  accord compani updat strategi year baswar targ...  positive


#### We split the data into training, validation and test sets.
#### Then we further split the training data into training and validation sets using k-fold cross-validation.
#### We are using method StratifiedKFold over the standard KFold. StratifiedKFold is often preferred over KFold in classification tasks, especially when you have imbalanced class distributions.

In [100]:
X = df['Text']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=42)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


#### Then we extract features using feature engineering.

#### First we initialize the TfidfVectorizer, Bag of Words (BoW) and Word2Vec vectorizers

In [101]:
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # TF-IDF with 10,000 features
bow_vectorizer = CountVectorizer(max_features=10000)    # BoW with 10,000 features
word2vec_model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, sg=0)

#### The we create the lists to store the Tfidf, BoW and the Word2Vec vectors

In [102]:
X_train_tfidf = []
X_val_tfidf = []
X_test_tfidf = []
X_train_bow = []
X_val_bow = []
X_test_bow = []
X_train_w2v = []
X_val_w2v = []
X_test_w2v = []

#### Then we train

In [103]:
for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Apply Tfidf vectorization
    X_train_tfidf_fold = tfidf_vectorizer.fit_transform(X_train)
    X_val_tfidf_fold = tfidf_vectorizer.transform(X_val)
    X_test_tfidf_fold = tfidf_vectorizer.transform(X_test)

    X_train_tfidf.append(X_train_tfidf_fold)
    X_val_tfidf.append(X_val_tfidf_fold)
    X_test_tfidf.append(X_test_tfidf_fold)

    # Apply BoW vectorization
    X_train_bow_fold = bow_vectorizer.fit_transform(X_train)
    X_val_bow_fold = bow_vectorizer.transform(X_val)
    X_test_bow_fold = bow_vectorizer.transform(X_test)

    X_train_bow.append(X_train_bow_fold)
    X_val_bow.append(X_val_bow_fold)
    X_test_bow.append(X_test_bow_fold)

    # Initialize Word2Vec vectors
    X_train_w2v_fold = []
    X_val_w2v_fold = []
    X_test_w2v_fold = []

    for word in X_train:
        if word in word2vec_model.wv:
            X_train_w2v_fold.append(word2vec_model.wv[word])
        else:
            X_train_w2v_fold.append([0.0] * word2vec_model.vector_size)

    for word in X_val:
        if word in word2vec_model.wv:
            X_val_w2v_fold.append(word2vec_model.wv[word])
        else:
            X_val_w2v_fold.append([0.0] * word2vec_model.vector_size)

    for word in X_test:
        if word in word2vec_model.wv:
            X_test_w2v_fold.append(word2vec_model.wv[word])
        else:
            X_test_w2v_fold.append([0.0] * word2vec_model.vector_size)

    X_train_w2v.append(X_train_w2v_fold)
    X_val_w2v.append(X_val_w2v_fold)
    X_test_w2v.append(X_test_w2v_fold)

In [104]:
# Train and evaluate models for Tfidf, BoW, and Word2Vec vectors
for i in range(10):
    # Train and evaluate a model for Tfidf
    lr_model_tfidf = LogisticRegression()
    lr_model_tfidf.fit(X_train_tfidf[i], y_train)
    y_val_pred_lr_tfidf = lr_model_tfidf.predict(X_val_tfidf[i])
    y_test_pred_lr_tfidf = lr_model_tfidf.predict(X_test_tfidf[i])

    # Train and evaluate a model for BoW
    lr_model_bow = LogisticRegression()
    lr_model_bow.fit(X_train_bow[i], y_train)
    y_val_pred_lr_bow = lr_model_bow.predict(X_val_bow[i])
    y_test_pred_lr_bow = lr_model_bow.predict(X_test_bow[i])

    # Train and evaluate a model for Word2Vec
    lr_model_w2v = LogisticRegression()
    lr_model_w2v.fit(X_train_w2v[i], y_train)
    y_val_pred_lr_w2v = lr_model_w2v.predict(X_val_w2v[i])
    y_test_pred_lr_w2v = lr_model_w2v.predict(X_test_w2v[i])

    # Print evaluation metrics for Tfidf, BoW, and Word2Vec
    print(f"Fold {i+1} - Tfidf Validation Results:")
    print("Accuracy:", accuracy_score(y_val, y_val_pred_lr_tfidf))
    print(classification_report(y_val, y_val_pred_lr_tfidf))
    print(f"Fold {i+1} - Tfidf Test Results:")
    print("Accuracy:", accuracy_score(y_test, y_test_pred_lr_tfidf))
    print(classification_report(y_test, y_test_pred_lr_tfidf))

    print(f"Fold {i+1} - BoW Validation Results:")
    print("Accuracy:", accuracy_score(y_val, y_val_pred_lr_bow))
    print(classification_report(y_val, y_val_pred_lr_bow))
    print(f"Fold {i+1} - BoW Test Results:")
    print("Accuracy:", accuracy_score(y_test, y_test_pred_lr_bow))
    print(classification_report(y_test, y_test_pred_lr_bow))

    print(f"Fold {i+1} - Word2Vec Validation Results:")
    print("Accuracy:", accuracy_score(y_val, y_val_pred_lr_w2v))
    print(classification_report(y_val, y_val_pred_lr_w2v))
    print(f"Fold {i+1} - Word2Vec Test Results:")
    print("Accuracy:", accuracy_score(y_test, y_test_pred_lr_w2v))
    print(classification_report(y_test, y_test_pred_lr_w2v))

ValueError: Found input variables with inconsistent numbers of samples: [4361, 4362]