# Sentiment Analysis Project

## Mohammad Javad Maheronnaghsh - 99105691

# Libraries

In [None]:
# install library
! pip install fasttext



In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from nltk.corpus import stopwords
from google.colab import drive
from tensorflow import keras
import tensorflow as tf
from io import StringIO
import pandas as pd
import numpy as np
import fasttext
import requests
import nltk
import re

# Data

In [None]:
link = 'https://drive.google.com/u/0/uc?id=1BRhlDz-JASvfEljf_XxrL-ZH4gUvOg0v&export=download'
r = requests.get(link)

In [None]:
df = pd.read_csv(StringIO(r.content.decode('utf-8')))

In [None]:
df

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
...,...,...,...,...
155043,155044,8485,Santa Clause 2 ' is wondrously creative .,4
155044,155045,8485,' is wondrously creative .,2
155045,155046,8485,is wondrously creative .,3
155046,155047,8485,is wondrously creative,4


# Data Split

In [None]:
y = df['Sentiment']
x = df.drop('Sentiment', axis = 1)

In [None]:
print(x.shape)
print(y.shape)

(155048, 3)
(155048,)


In [None]:
# Train/Validation/Test = 0.8/0.1/0.1
# stratify or not?
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.2)
x_valid, x_test, y_valid, y_test = train_test_split(x_, y_, test_size=0.5)

# Preprocessing
Here we are going to preprocess the data
and extract the features

In [None]:
# Download resources from NLTK
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    if len(tokens) <= 1:
      return "random text"
    return ' '.join(tokens)

x_train['Phrase'] = x_train['Phrase'].apply(preprocess_text)
x_valid['Phrase'] = x_valid['Phrase'].apply(preprocess_text)
x_test['Phrase'] = x_test['Phrase'].apply(preprocess_text)

[nltk_data] Error loading stopwords: HTTP Error 503: Backend is
[nltk_data]     unhealthy
[nltk_data] Error loading punkt: HTTP Error 503: Backend is unhealthy


## First Approach: Basic Tools

What to do?


1.   Convert Capital Letters to Little Ones (English words)
2.   Delete Punctuations, etc.
3.   Delete Stop Words
4.   Find Origination of words
5.   Provide a list of remaind words
6.   Find TF-IDF vector

**Note: Weight of each token should be determined using train data**


### Vector Creation

In [None]:
vectorizer = TfidfVectorizer(max_features=100)


train_tfidf = vectorizer.fit_transform(x_train['Phrase'])
test_tfidf = vectorizer.fit_transform(x_test['Phrase'])
valid_tfidf = vectorizer.fit_transform(x_valid['Phrase'])

x_train_scaled_tfidf = train_tfidf.toarray()
x_test_scaled_tfidf = test_tfidf.toarray()
x_valid_scaled_tfidf = valid_tfidf.toarray()

## Second Approach: Using Ready Tools

What to do?
We want to use one of the following models:
1.  Word2Vec
2.  FastText
3.  GloVe

These are the models that are frequently used in NLP areas for deature extraction from words and statements.

### Vector Creation

In [None]:
# Create a file containing training data
train_file_unsupervised = 'train_unsupervised.txt'
with open(train_file_unsupervised, 'w', encoding='utf-8') as f:
    for row in x_train.iterrows():
        f.write(row[1][2] + '\n')

In [None]:
# Train FastText model
model_unsuervised = fasttext.train_unsupervised(train_file_unsupervised)

In [None]:
# Function to extract FastText vectors for a given text
def extract_vectors(text):
    return model_unsuervised.get_sentence_vector(text)

# Extract vectors for training data
train_vectors = [extract_vectors(row[1][2]) for row in x_train.iterrows()]

# Extract vectors for test data
test_vectors = [extract_vectors(row[1][2]) for row in x_test.iterrows()]

# Extract vectors for validation data
valid_vectors = [extract_vectors(row[1][2]) for row in x_valid.iterrows()]


In [None]:
print(len(train_vectors))
print(len(test_vectors))
print(len(valid_vectors))

124038
15505
15505


### Normalize Vectors

In [None]:
scaler = preprocessing.StandardScaler().fit(train_vectors)
x_train_scaled = scaler.transform(train_vectors)

x_test_scaled = scaler.transform(test_vectors)

x_valid_scaled = scaler.transform(valid_vectors)

# Train the Models - First Approach

## Logistic Regression


In [None]:
x_train_scaled_tfidf.shape

(124038, 100)

In [None]:
best_accuracy_lg_model_tfidf = 0.0
best_lg_model_tfidf = None


for c in [0.1, 1, 2]:
  for penalty in ['l1', 'l2']:
    for max_iter in [1000, 2000]:
      print('*')
      # Define Logistic Regression model and set hyperparameters
      lg_model = LogisticRegression(C=1.0, penalty='l2', max_iter=2000)

      # Train the Logistic Regression model
      lg_model.fit(x_train_scaled_tfidf, y_train)

      # Predict using the trained model
      predictions = lg_model.predict(x_valid_scaled_tfidf)
      accuracy = accuracy_score(y_valid, predictions)

      if accuracy > best_accuracy_lg_model_tfidf:
        best_accuracy_lg_model_tfidf = accuracy
        best_lg_model_tfidf = lg_model





*
*
*
*
*
*
*
*
*
*
*
*


## SVM

In [None]:
'''This code takes long time to RUN!'''


# from sklearn import svm
# from sklearn.metrics import accuracy_score

# # Normalize or scale FastText vectors if necessary

# # Define SVM model and set hyperparameters
# model = svm.SVC(kernel='linear', C=10.0)

# # Train the SVM model
# model.fit(train_vectors, y_train)

# # Predict using the trained model
# predictions = model.predict(test_vectors)

# # Evaluate the model
# accuracy = accuracy_score(y_test, predictions)

'This code takes long time to RUN!'

## Decision Tree

In [61]:


best_accuracy_ldt_model_tfidf = 0.0
best_dt_model_tfidf = None
for criterion in ['entropy', 'gini']:
    for max_d in [3, 30]:
      for min_s in [2, 4, 6]:
        print('*')

        dt_model = DecisionTreeClassifier(max_depth=max_d, min_samples_split=min_s, criterion=criterion)

        dt_model.fit(x_train_scaled_tfidf, y_train)

        predictions = dt_model.predict(x_valid_scaled_tfidf)

        accuracy = accuracy_score(y_valid, predictions)

        if accuracy > best_accuracy_ldt_model_tfidf:
          best_accuracy_ldt_model_tfidf = accuracy
          best_dt_model_tfidf = dt_model


*
*
*
*
*
*
*
*
*
*
*
*


In [None]:
best_accuracy_ldt_model_tfidf

0.5036439858110286

## AdaBoost

In [None]:


base_classifier = DecisionTreeClassifier(max_depth=3)

adaboost_model_tfidf = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=50)

adaboost_model_tfidf.fit(x_train_scaled_tfidf, y_train)





## Simple Neural Network Model

In [None]:


nn_model_tfidf = keras.Sequential()
nn_model_tfidf.add(keras.layers.Dense(64, activation='relu', input_shape=(x_train_scaled_tfidf.shape[1],)))
nn_model_tfidf.add(keras.layers.Dense(32, activation='relu'))
nn_model_tfidf.add(keras.layers.Dense(5, activation='softmax'))

# Compile the model
nn_model_tfidf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
nn_model_tfidf.fit(x_train_scaled_tfidf, y_train, validation_data=(x_valid_scaled_tfidf, y_valid), epochs=30, batch_size=32)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f40061ab070>

## Deep Neural Network

In [None]:
# Define NN model architecture
deep_tfidf = keras.Sequential()
deep_tfidf.add(keras.layers.Dense(512, activation='relu', input_shape=(x_train_scaled_tfidf.shape[1],)))
deep_tfidf.add(keras.layers.Dropout(0.2))  # reduce overfitting
deep_tfidf.add(keras.layers.Dense(256, activation='relu'))
deep_tfidf.add(keras.layers.Dropout(0.2))
deep_tfidf.add(keras.layers.Dense(128, activation='relu'))
deep_tfidf.add(keras.layers.BatchNormalization())  # improved training stability
deep_tfidf.add(keras.layers.Dense(64, activation='relu'))
deep_tfidf.add(keras.layers.Dropout(0.2))
deep_tfidf.add(keras.layers.Dense(32, activation='relu'))
deep_tfidf.add(keras.layers.Dropout(0.2))
deep_tfidf.add(keras.layers.Dense(5, activation='softmax'))  # softmax activation


deep_tfidf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

deep_tfidf.fit(x_train_scaled_tfidf, y_train, validation_data=(x_valid_scaled_tfidf, y_valid), epochs=30, batch_size=64)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f4112f13550>

# Evaluate the Models - First Approach

In [78]:
# Logistic Regression
print('- - - - - - - \nLogistic Regression Model')
lg_preds = best_lg_model_tfidf.predict(x_test_scaled_tfidf)
# test_accuracy = accuracy_score(y_test, lg_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, lg_preds)
print(report)

- - - - - - - 
Logistic Regression Model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       709
           1       0.29      0.06      0.09      2715
           2       0.54      0.91      0.68      7905
           3       0.29      0.15      0.20      3250
           4       0.33      0.01      0.01       926

    accuracy                           0.50     15505
   macro avg       0.29      0.22      0.20     15505
weighted avg       0.41      0.50      0.41     15505



In [None]:
# SVM
'''It took long time to execute and didn't finish (crashed colab)'''

In [76]:
# Decision Tree
print('- - - - - - - \nDecision Tree Model')
dt_preds = best_dt_model_tfidf.predict(x_test_scaled_tfidf)
# test_accuracy = accuracy_score(y_test, dt_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, dt_preds)
print(report)

- - - - - - - 
Decision Tree Model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       709
           1       0.34      0.05      0.09      2715
           2       0.52      0.98      0.68      7905
           3       0.00      0.00      0.00      3250
           4       0.00      0.00      0.00       926

    accuracy                           0.51     15505
   macro avg       0.17      0.21      0.15     15505
weighted avg       0.32      0.51      0.36     15505



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
# Adaboost
print('- - - - - - - \nAdaBoost Model')
ab_preds = adaboost_model_tfidf.predict(x_test_scaled_tfidf)
# test_accuracy = accuracy_score(y_test, ab_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, ab_preds)
print(report)

- - - - - - - 
AdaBoost Model
              precision    recall  f1-score   support

           0       0.04      0.00      0.01       709
           1       0.12      0.19      0.15      2715
           2       0.46      0.57      0.51      7905
           3       0.30      0.13      0.19      3250
           4       0.18      0.02      0.03       926

    accuracy                           0.36     15505
   macro avg       0.22      0.18      0.18     15505
weighted avg       0.33      0.36      0.33     15505



In [72]:
# Simple Neural Networks

print('- - - - - - - \nNeural Network Model')
predictions = nn_model_tfidf.predict(x_test_scaled_tfidf)
nn_preds = [np.argmax(predictions[i]) for i in range(predictions.shape[0])]
# test_accuracy = accuracy_score(y_test, nn_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, nn_preds)
print(report)

- - - - - - - 
Neural Network Model
              precision    recall  f1-score   support

           0       0.04      0.01      0.01       709
           1       0.13      0.20      0.16      2715
           2       0.47      0.58      0.52      7905
           3       0.31      0.11      0.17      3250
           4       0.22      0.05      0.09       926

    accuracy                           0.36     15505
   macro avg       0.23      0.19      0.19     15505
weighted avg       0.34      0.36      0.33     15505



In [71]:
# Deep
print('- - - - - - - \nDeep Model')
predictions = deep_tfidf.predict(x_test_scaled_tfidf)
deep_preds = [np.argmax(predictions[i]) for i in range(predictions.shape[0])]
# test_accuracy = accuracy_score(y_test, deep_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, deep_preds)
print(report)

- - - - - - - 
Deep Model
Test Accuracy: 0.5039019671073847
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       709
           1       0.27      0.07      0.11      2715
           2       0.55      0.89      0.68      7905
           3       0.30      0.17      0.22      3250
           4       0.14      0.01      0.02       926

    accuracy                           0.50     15505
   macro avg       0.25      0.23      0.21     15505
weighted avg       0.40      0.50      0.41     15505



In [None]:
# RNN

In [None]:

# Create the ensemble model by combining predictions
ensemble_predictions = np.vstack((lg_preds, dt_preds, ab_preds)).T

# Create the voting classifier for ensemble
voting_classifier_tfidf = VotingClassifier(estimators=[
    ('lg', best_lg_model_tfidf),
    ('dt', best_dt_model_tfidf),
    ('ab', adaboost_model_tfidf),
    # ('nn', nn_model),
    # ('deep', deep)
], voting='hard')

voting_classifier_tfidf.fit(x_train_scaled_tfidf, y_train)





In [70]:
ensemble_predictions = voting_classifier_tfidf.predict(x_test_scaled_tfidf)
# accuracy_score(y_test, ensemble_predictions)
report = classification_report(y_test, ensemble_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.04      0.00      0.00       709
           1       0.31      0.07      0.11      2715
           2       0.54      0.92      0.68      7905
           3       0.31      0.12      0.17      3250
           4       0.20      0.00      0.00       926

    accuracy                           0.51     15505
   macro avg       0.28      0.22      0.19     15505
weighted avg       0.41      0.51      0.40     15505



# Train the Models - Second Approach

## Logistic Regression


In [None]:
best_accuracy_lg_model = 0.0
best_lg_model = None
for c in [0.1, 1, 2]:
  for penalty in ['l1', 'l2']:
    for max_iter in [1000, 2000]:
      # Define Logistic Regression model and set hyperparameters
      lg_model = LogisticRegression(C=1.0, penalty='l2', max_iter=2000)

      # Train the Logistic Regression model
      lg_model.fit(x_train_scaled, y_train)

      # Predict using the trained model
      predictions = lg_model.predict(x_valid_scaled)
      accuracy = accuracy_score(y_valid, predictions)

      if accuracy > best_accuracy_lg_model:
        best_accuracy_lg_model = accuracy
        best_lg_model = lg_model





## SVM

In [None]:
'''This code takes long time to RUN!'''


# from sklearn import svm
# from sklearn.metrics import accuracy_score

# # Normalize or scale FastText vectors if necessary

# # Define SVM model and set hyperparameters
# model = svm.SVC(kernel='linear', C=10.0)

# # Train the SVM model
# model.fit(train_vectors, y_train)

# # Predict using the trained model
# predictions = model.predict(test_vectors)

# # Evaluate the model
# accuracy = accuracy_score(y_test, predictions)

'This code takes long time to RUN!'

## Decision Tree

In [63]:


best_accuracy_ldt_model = 0.0
best_dt_model = None
for criterion in ['entropy', 'gini']:
    for max_d in [3, 30]:
      for min_s in [2, 4, 6]:

        dt_model = DecisionTreeClassifier(max_depth=max_d, min_samples_split=min_s, criterion=criterion)

        dt_model.fit(x_train_scaled, y_train)

        predictions = dt_model.predict(x_valid_scaled)

        accuracy = accuracy_score(y_valid, predictions)

        if accuracy > best_accuracy_ldt_model:
          best_accuracy_ldt_model = accuracy
          best_dt_model = dt_model


In [64]:
best_accuracy_ldt_model

0.5759432441148017

## AdaBoost

In [None]:


base_classifier = DecisionTreeClassifier(max_depth=3)

adaboost_model = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=50)

adaboost_model.fit(x_train_scaled, y_train)





## Simple Neural Network Model

In [None]:


nn_model = keras.Sequential()
nn_model.add(keras.layers.Dense(64, activation='relu', input_shape=(x_train_scaled.shape[1],)))
nn_model.add(keras.layers.Dense(32, activation='relu'))
nn_model.add(keras.layers.Dense(5, activation='softmax'))

# Compile the model
nn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
nn_model.fit(x_train_scaled, y_train, validation_data=(x_valid_scaled, y_valid), epochs=30, batch_size=32)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f411c5fb0d0>

## Deep Neural Network

In [None]:
# Define NN model architecture
deep = keras.Sequential()
deep.add(keras.layers.Dense(512, activation='relu', input_shape=(x_train_scaled.shape[1],)))
deep.add(keras.layers.Dropout(0.2))  # reduce overfitting
deep.add(keras.layers.Dense(256, activation='relu'))
deep.add(keras.layers.Dropout(0.2))
deep.add(keras.layers.Dense(128, activation='relu'))
deep.add(keras.layers.BatchNormalization())  # improved training stability
deep.add(keras.layers.Dense(64, activation='relu'))
deep.add(keras.layers.Dropout(0.2))
deep.add(keras.layers.Dense(32, activation='relu'))
deep.add(keras.layers.Dropout(0.2))
deep.add(keras.layers.Dense(5, activation='softmax'))  # softmax activation


deep.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

deep.fit(x_train_scaled, y_train, validation_data=(x_valid_scaled, y_valid), epochs=30, batch_size=64)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f4106a3a440>

## Fasttext Model Supervised
Here we are going to define a fasttext model.

In [None]:
# Preparation
train = np.hstack((x_train, y_train.values.reshape(-1, 1)))
df = pd.DataFrame(train, columns=['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'])

# Training data
train_sentences = x_train['Phrase'].tolist()
train_labels = y_train.tolist()

# Validation data
val_sentences = x_valid['Phrase'].tolist()
val_labels = y_valid.tolist()

# Test data
test_sentences = x_test['Phrase'].tolist()
test_labels = y_test.tolist()


df.to_csv('preprocessed_train_dataset.csv', index=False, header=False, columns=['Phrase'])

# FastText format
train_data = ['__label__' + str(label) + ' ' + sentence for label, sentence in zip(train_labels, train_sentences)]

# Save to a text file
train_data_path = 'preprocessed_train_data.txt'
with open(train_data_path, 'w') as f:
    for item in train_data:
        f.write("%s\n" % item)


val_labels = ['__label__' + str(x) for x in val_labels]
test_labels = ['__label__' + str(x) for x in test_labels]

In [None]:
best_accuracy = 0
fasttext_model_supervised = None

for lr in [0.05, 0.1]:
  for epoch in [10, 30]:
    for dim in [10, 100]:
      # Train FastText model
      ft_model = fasttext.train_supervised(
          input=train_data_path,
          epoch=epoch,
          lr=lr,
          wordNgrams=2,
          bucket=200000,
          dim=dim
      )

      # Predict on validation data
      val_predictions = [ft_model.predict(text)[0][0] for text in val_sentences]

      # Calculate accuracy on validation data

      val_accuracy = sum([val_labels[i] == val_predictions[i] for i in range(len(val_labels))]) / len(val_labels)
      if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        fasttext_model_supervised = ft_model



# Evaluate the Models - Second Approach

In [79]:
# Logistic Regression
print('- - - - - - - \nLogistic Regression Model')
lg_preds = best_lg_model.predict(x_test_scaled)
# test_accuracy = accuracy_score(y_test, lg_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, lg_preds)
print(report)

- - - - - - - 
Logistic Regression Model
              precision    recall  f1-score   support

           0       0.41      0.03      0.05       709
           1       0.40      0.13      0.20      2715
           2       0.57      0.89      0.69      7905
           3       0.43      0.27      0.33      3250
           4       0.42      0.06      0.10       926

    accuracy                           0.54     15505
   macro avg       0.45      0.27      0.27     15505
weighted avg       0.49      0.54      0.46     15505



In [None]:
# SVM

In [81]:
# Decision Tree
print('- - - - - - - \nDecision Tree Model')
dt_preds = best_dt_model.predict(x_test_scaled)
# test_accuracy = accuracy_score(y_test, dt_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, dt_preds)
print(report)

- - - - - - - 
Decision Tree Model
              precision    recall  f1-score   support

           0       0.32      0.38      0.35       709
           1       0.42      0.40      0.41      2715
           2       0.68      0.74      0.71      7905
           3       0.48      0.40      0.43      3250
           4       0.39      0.32      0.35       926

    accuracy                           0.57     15505
   macro avg       0.46      0.45      0.45     15505
weighted avg       0.56      0.57      0.56     15505



In [None]:
# Adaboost
print('- - - - - - - \nAdaBoost Model')
ab_preds = adaboost_model.predict(x_test_scaled)
test_accuracy = accuracy_score(y_test, ab_preds)
print("Test Accuracy:", test_accuracy)

- - - - - - - 
Decision Tree Model
Test Accuracy: 0.5295711060948082


In [82]:
# Simple Neural Networks

print('- - - - - - - \nNeural Network Model')
predictions = nn_model.predict(x_test_scaled)
nn_preds = [np.argmax(predictions[i]) for i in range(predictions.shape[0])]
# test_accuracy = accuracy_score(y_test, nn_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, nn_preds)
print(report)

- - - - - - - 
Neural Network Model
              precision    recall  f1-score   support

           0       0.46      0.10      0.16       709
           1       0.43      0.36      0.39      2715
           2       0.65      0.83      0.73      7905
           3       0.47      0.42      0.44      3250
           4       0.55      0.10      0.17       926

    accuracy                           0.58     15505
   macro avg       0.51      0.36      0.38     15505
weighted avg       0.56      0.58      0.55     15505



In [83]:
# Deep
print('- - - - - - - \nDeep Model')
predictions = deep.predict(x_test_scaled)
deep_preds = [np.argmax(predictions[i]) for i in range(predictions.shape[0])]
# test_accuracy = accuracy_score(y_test, deep_preds)
# print("Test Accuracy:", test_accuracy)
report = classification_report(y_test, deep_preds)
print(report)

- - - - - - - 
Deep Model
              precision    recall  f1-score   support

           0       0.48      0.24      0.32       709
           1       0.49      0.47      0.48      2715
           2       0.70      0.81      0.75      7905
           3       0.54      0.49      0.52      3250
           4       0.54      0.32      0.40       926

    accuracy                           0.63     15505
   macro avg       0.55      0.47      0.49     15505
weighted avg       0.61      0.63      0.61     15505



In [85]:
# FastText
print('- - - - - - - \nFastText Model Supervised')
test_predictions = [fasttext_model_supervised.predict(text)[0][0] for text in test_sentences]
# test_accuracy = sum([test_labels[i] == test_predictions[i] for i in range(len(test_labels))]) / len(test_labels)
# print("Test Accuracy:", test_accuracy)
report = classification_report(test_labels, test_predictions)
print(report)

- - - - - - - 
FastText Model Supervised
              precision    recall  f1-score   support

  __label__0       0.49      0.37      0.42       709
  __label__1       0.56      0.48      0.51      2715
  __label__2       0.72      0.83      0.77      7905
  __label__3       0.58      0.50      0.53      3250
  __label__4       0.53      0.41      0.46       926

    accuracy                           0.65     15505
   macro avg       0.58      0.52      0.54     15505
weighted avg       0.64      0.65      0.64     15505



In [None]:
# RNN

In [None]:

# Create the ensemble model by combining predictions
ensemble_predictions = np.vstack((lg_preds, dt_preds, ab_preds)).T

# Create the voting classifier for ensemble
voting_classifier = VotingClassifier(estimators=[
    ('lg', best_lg_model),
    ('dt', best_dt_model),
    ('ab', adaboost_model),
    # ('nn', nn_model),
    # ('deep', deep)
], voting='hard')

voting_classifier.fit(x_train_scaled, y_train)





In [86]:
ensemble_predictions = voting_classifier.predict(x_test_scaled)
# accuracy_score(y_test, ensemble_predictions)
report = classification_report(y_test, ensemble_predictions)
print(report)

              precision    recall  f1-score   support

           0       0.29      0.07      0.11       709
           1       0.41      0.08      0.13      2715
           2       0.54      0.95      0.69      7905
           3       0.46      0.13      0.20      3250
           4       0.55      0.03      0.06       926

    accuracy                           0.53     15505
   macro avg       0.45      0.25      0.24     15505
weighted avg       0.49      0.53      0.43     15505



# Conclusion
We can conclude some notes according to the project.

# Kaggle Competition

In [88]:
drive.mount('/content/drive')
csv_path = '/content/drive/MyDrive/kaggle_test.csv'

kaggle_df = pd.read_csv(csv_path)


def save(preds, name):
  indexes = [i+1 for i in range(len(preds))]
  data = {'Sentiment': preds, 'ID': indexes}
  df_csv = pd.DataFrame(data)
  csv_file_path = name + '.csv'
  df_csv.to_csv(csv_file_path, index=False)

Mounted at /content/drive


In [89]:
# Preprocessing
kaggle_df['Phrase'] = kaggle_df['Phrase'].apply(preprocess_text)
kaggle_vectors = [extract_vectors(row[1][2]) for row in df.iterrows()]
scaler = preprocessing.StandardScaler().fit(kaggle_vectors)
kaggle_scaled = scaler.transform(kaggle_vectors)

## FastText

In [90]:
kaggle_sentences = df['Phrase'].tolist()
kaggle_preds_fasttext = [fasttext_model_supervised.predict(text)[0][0] for text in kaggle_sentences]
kaggle_preds_fasttext = [x[-1] for x in kaggle_preds_fasttext]
save(kaggle_preds_fasttext, 'fasttext')

## Adaboost

In [92]:
kaggle_preds_ab = adaboost_model.predict(kaggle_scaled)
save(kaggle_preds_ab, 'adaboost')

## Logistic Regression

In [93]:
kaggle_preds_lr = best_lg_model.predict(kaggle_scaled)
save(kaggle_preds_lr, 'logistic_regression')

## Decision Tree

In [94]:
kaggle_preds_dt = best_dt_model.predict(kaggle_scaled)
save(kaggle_preds_dt, 'decision_tree')

## Simple Neural Network

In [95]:
kaggle_preds_nn = nn_model.predict(kaggle_scaled)
kaggle_preds_nn = [np.argmax(kaggle_preds_nn[i]) for i in range(kaggle_preds_nn.shape[0])]
save(kaggle_preds_nn, 'neural_networks')



## Deep Neural Network

In [96]:
kaggle_preds_deep = deep.predict(kaggle_scaled)
kaggle_preds_deep = [np.argmax(kaggle_preds_deep[i]) for i in range(kaggle_preds_deep.shape[0])]
save(kaggle_preds_deep, 'deep')



## Ensemble

In [97]:
kaggle_preds_deep = voting_classifier.predict(kaggle_scaled)
kaggle_preds_ens = [np.argmax(kaggle_preds_deep[i]) for i in range(kaggle_preds_deep.shape[0])]
save(kaggle_preds_ens, 'ensemble')