Import all dependencies

In [5]:
from functools import lru_cache

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources the preprocessing step relies on:
# the English stop-word list and the Punkt tokenizer models used by
# nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt models from the 'punkt_tab' package
# instead of the pickled 'punkt' one; download both so tokenization works
# regardless of the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/suonieo1/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

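Read the data from `data.csv`. The file has no header row: the first column holds the sentiment label and the second holds the text. Name the columns "Sentiment" and "Text", then reorder them to "Text", "Sentiment".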

In [6]:
# The CSV has no header row, so pandas labels the columns 0, 1, ...;
# map each positional label onto its real name.
column_names = ['Sentiment', 'Text']
raw_df = pd.read_csv('data.csv', sep=',', header=None, encoding='latin-1')
named_df = raw_df.rename(columns=lambda position: column_names[position])

# Put the text first and the label second.
df = named_df[['Text', 'Sentiment']]

In [7]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _porter_stemmer():
    """Return a single PorterStemmer instance shared by all calls."""
    return PorterStemmer()


def preprocess_text(text):
    """Normalize one document for bag-of-words style feature extraction.

    The text is tokenized, tokens that are not purely alphabetic are
    dropped, the rest are lowercased, English stop words are removed and
    the remaining words are Porter-stemmed.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            ``None`` or a NaN produced by an empty CSV cell) is treated as
            an empty document.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing
        survives the filtering or the input is not a string.
    """
    # Missing values would make the tokenizer raise; treat them as empty text.
    if not isinstance(text, str):
        return ''

    # Loading the stop-word list and building the stemmer are loop-invariant,
    # so they are cached instead of being redone for every row.
    stop_words = _english_stop_words()
    stem = _porter_stemmer().stem

    # Tokenization (split the text into words)
    tokens = nltk.word_tokenize(text)

    # Keep alphabetic tokens only (drops punctuation and numbers), lowercased
    lowered = (token.lower() for token in tokens if token.isalpha())

    # Remove stop words, stem what is left, and join back into one string
    return ' '.join(stem(word) for word in lowered if word not in stop_words)

In [8]:
# Clean every document with preprocess_text and store the result back in 'Text'.
# NOTE(review): this overwrites the raw text, so re-running the cell would
# preprocess already-cleaned strings; reload the data first if re-running.
cleaned_text = df['Text'].apply(preprocess_text)
df['Text'] = cleaned_text

# Preview the first rows to confirm the column now holds the cleaned text
print(df.head())

                                                Text Sentiment
0  accord gran compani plan move product russia a...   neutral
1  technopoli plan develop stage area less squar ...   neutral
2  intern electron industri compani elcoteq laid ...  negative
3  new product plant compani would increas capac ...  positive
4  accord compani updat strategi year baswar targ...  positive


In [6]:



def report_predictions(heading, y_true, y_pred):
    """Print the accuracy and per-class classification report for one prediction set.

    Args:
        heading: Line printed above the metrics.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))


# Split the data into train, validation, and test sets.
# 20% is held out as the test set, then 20% of the remainder as the
# validation set (about 64% / 16% / 20% overall). The fixed random_state
# keeps both splits reproducible.
X = df['Text']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Feature extraction using TF-IDF.
# The vectorizer is fitted on the training split only, so no vocabulary or
# IDF statistics leak from the validation/test splits.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)  # cap on vocabulary size; tune as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Modeling
# TODO: a CNN baseline is planned but not implemented here; it would need a
# deep learning library such as TensorFlow or PyTorch.

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(X_train_tfidf, y_train)

# Multinomial Naive Bayes (MultinomialNB is imported from sklearn.naive_bayes
# in the imports cell)
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Evaluate both models on the validation set
y_val_pred_lr = lr_model.predict(X_val_tfidf)
y_val_pred_nb = nb_model.predict(X_val_tfidf)

report_predictions("Logistic Regression Validation Results:", y_val, y_val_pred_lr)
report_predictions("Multinomial Naive Bayes Validation Results:", y_val, y_val_pred_nb)

# The test set is only scored for the model selected on validation
# (Logistic Regression here), so it stays an unbiased final estimate.
y_test_pred_lr = lr_model.predict(X_test_tfidf)
report_predictions("Logistic Regression Test Results:", y_test, y_test_pred_lr)


Logistic Regression Validation Results:
Accuracy: 0.7628865979381443
              precision    recall  f1-score   support

    negative       0.86      0.37      0.52        99
     neutral       0.75      0.96      0.84       466
    positive       0.80      0.52      0.63       211

    accuracy                           0.76       776
   macro avg       0.80      0.62      0.66       776
weighted avg       0.78      0.76      0.74       776

Multinomial Naive Bayes Validation Results:
Accuracy: 0.6597938144329897
              precision    recall  f1-score   support

    negative       1.00      0.04      0.08        99
     neutral       0.66      0.98      0.79       466
    positive       0.62      0.25      0.35       211

    accuracy                           0.66       776
   macro avg       0.76      0.42      0.41       776
weighted avg       0.69      0.66      0.58       776

Logistic Regression Test Results:
Accuracy: 0.7618556701030927
              precision    recall