In [1]:
import pandas as pd
import re
import sklearn.metrics
import nltk
import nlpaug.augmenter.word as naw
import numpy as np

import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [74]:
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from scipy.stats import uniform
from scipy import interp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

In [4]:

# load the data from the csv file
train_data = pd.read_csv("reviews.csv")
# train_label = train_data["Sentiment"]
original_data = pd.DataFrame(train_data["Text"])


In [5]:
# define a function to preprocess the text data
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # remove non-alphabetic characters
    text = re.sub(r'[^a-z]', ' ', text)
    # tokenize the text into words
    tokens = word_tokenize(text)
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens= ' '.join(tokens)
    return tokens


# remove the html symbol
def remove_html(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, " ", text)
    return text_new

# apply the preprocessing function to the text data
train_data['Text'] = train_data['Text'].apply(remove_html)
train_data['Text'] = train_data['Text'].apply(preprocess_text)


In [6]:

# define an NLPAug data augmentation function
def augment_text(text):
    # define an augmentation method
    aug = naw.SynonymAug(aug_src='wordnet', lang='eng')
    # apply the augmentation method to the text
    augmented_text = aug.augment(text)
    return augmented_text

# apply the augmentation function to the preprocessed text data
train_data['Text'] = train_data['Text'].apply(augment_text)

In [7]:
# over sampling
ros = RandomOverSampler(sampling_strategy='minority')

X = train_data['Text'].values.reshape(-1, 1)
y = train_data['Sentiment']
X_resampled, y_resampled = ros.fit_resample(X, y)
train_data = pd.DataFrame({'Text': X_resampled.ravel(), 'Sentiment': y_resampled})



In [8]:
# save the data to a new csv file
train_data.to_csv("oversampling_reviews.csv", index=False)

## Features

In [9]:
# train a Word2Vec model on the preprocessed text data
word2vec_model = Word2Vec(train_data['Text'], min_count=1)

# create a function to generate the word embedding vectors for each sentence
def generate_word_embedding(sentence):
    # initialize an empty array for the sentence vector
    sentence_vector = []
    # loop through each word in the sentence
    for word in sentence:
        try:
            # add the vector representation of the word to the sentence vector
            word_vector = word2vec_model.wv[word]
            sentence_vector.append(word_vector)
        except KeyError:
            # ignore words that are not in the vocabulary
            pass
    # take the mean of the word vectors to get the sentence vector
    sentence_vector = np.mean(sentence_vector, axis=0)
    return sentence_vector

In [10]:
# apply the generate_word_embedding() function to the preprocessed text data
train_data['embedding'] = train_data['Text'].apply(generate_word_embedding)

In [11]:
# create a new DataFrame for the feature matrix
embedding_size = word2vec_model.vector_size
features_df = pd.DataFrame(train_data['embedding'].tolist(), columns=[f'embedding_{i}' for i in range(embedding_size)])

In [12]:
# perform PCA with n_components set to retain 98% of variance
pca_emb = PCA(n_components=0.98)
features_emb_pca = pca_emb.fit_transform(features_df)

# create a new DataFrame for the PCA features
pca_emb_cols = [f"PC_emb{i+1}" for i in range(features_emb_pca.shape[1])]
pca_df_emb = pd.DataFrame(features_emb_pca, columns=pca_emb_cols)

In [13]:
# create a TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer()

# fit and transform the vectorizer on the preprocessed text data
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['Text'].apply(lambda x: ' '.join(x)))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_features_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_feature_names)

In [14]:

## standardize the features
#scaler = StandardScaler()
#features_std = scaler.fit_transform(features)

# perform PCA with n_components set to retain 95% of variance
pca = PCA(n_components=0.95)
features_tfidf_pca = pca.fit_transform(tfidf_features_df)

# create a new DataFrame for the PCA features
pca_tfidf_cols = [f"PC_tfidf{i+1}" for i in range(features_tfidf_pca.shape[1])]
pca_df_tfidf = pd.DataFrame(features_tfidf_pca, columns=pca_tfidf_cols)

In [15]:
# add the TF-IDF features to the feature matrix DataFrame
features_df = pd.concat([pca_df_tfidf, pca_df_emb], axis=1)

In [16]:
# add the number of characters, number of words, and number of capital characters as features
features_df['num_characters'] = train_data['Text'].apply(lambda x: len(' '.join(x)))
features_df['num_words'] = train_data['Text'].apply(lambda x: len(x))

In [17]:
# add the common features from the features.csv file
features_df['num_sentences'] = original_data["Text"].apply(lambda s: s.count('.'))
features_df['num_question_marks'] = original_data["Text"].apply(lambda s: s.count('?'))
features_df['num_exclamation_marks'] = original_data["Text"].apply(lambda s: s.count('!'))
features_df['num_unique_words'] = train_data["Text"].apply(lambda x: len(set(x)))

In [18]:
# add the label column to the feature matrix DataFrame
label = features_df.columns
features_df['Sentiment'] = train_data['Sentiment']

In [19]:

# weight the negative sentiment samples by 1.5
features_df.loc[features_df['Sentiment'] == 'negative',label] *= 2


In [29]:
features_df.head()


Unnamed: 0,PC_tfidf1,PC_tfidf2,PC_tfidf3,PC_tfidf4,PC_tfidf5,PC_tfidf6,PC_tfidf7,PC_tfidf8,PC_tfidf9,PC_tfidf10,...,PC_emb96,PC_emb97,PC_emb98,num_characters,num_words,num_sentences,num_question_marks,num_exclamation_marks,num_unique_words,Sentiment
0,-0.071812,0.036136,0.076162,0.058094,0.015693,-0.06898,-0.009392,0.050062,-0.086675,-0.005748,...,-0.007832,-0.004934,0.000907,98,1,4.0,0.0,0.0,1,positive
1,-0.130083,0.064077,0.186264,0.178121,0.043476,-0.063714,0.012203,0.060614,-0.090439,0.010428,...,0.008681,-0.003294,-0.003709,232,1,4.0,0.0,0.0,1,positive
2,-0.090828,0.038738,0.08659,0.030063,0.018296,0.020041,2e-06,0.236875,-0.103662,-0.137266,...,-0.002308,0.007702,-0.007528,541,1,9.0,0.0,0.0,1,positive
3,-0.032244,0.003467,0.016679,-0.011835,-0.005652,-0.009966,0.012565,0.031322,-0.033928,-0.066652,...,-0.001196,-0.003016,0.003852,311,1,3.0,0.0,2.0,1,positive
4,0.051115,-0.117562,0.050717,-0.013965,-0.013435,-0.031462,-0.059974,0.022282,0.005212,0.054587,...,0.007418,0.000506,0.007445,216,1,4.0,0.0,1.0,1,positive


In [None]:
# save the feature matrix to a CSV file
# pca_df_emb.to_csv("pca_df_emb.csv", index=False)
# pca_df_tfidf.to_csv("pca_df_tfidf.csv", index=False)
#features_df.to_csv("features.csv", index=False)

## XGBOOST























In [20]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [60]:
features_df['tag'] = features_df['Sentiment'].map(dict(positive=1, negative=0))
y=features_df['tag']

(8060,)

In [61]:
X=features_df.iloc[: , :-8]

(8060, 3437)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [65]:
#RandomSearchCV
# define the parameters to tune
param_dist = {"learning_rate": uniform(0, 2),
              "gamma": uniform(1, 0.000001),
              "max_depth": range(1,50),
              "n_estimators": range(1,300),
              "min_child_weight": range(1,10),
              'n_jobs': range(1,5)}
#instance of RandomSearchCV
rs = RandomizedSearchCV(XGBClassifier(), param_distributions=param_dist, n_iter=3)

In [70]:
rs.fit(X_train, y_train)

RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           callbacks=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=None,
                                           gpu_id=None, grow_policy=None,
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None, max_bin=None,
                                           max_c...
                                           predictor=None, random_state=None,
                                    

In [80]:
y_preds=rs.predict(X_test)

(1612,)

In [84]:
print(f1_score(y_test, y_preds))

Model Accuracy:  98.76 %
0.9876998769987699


## Ramdon Forest