## (1)BASE With corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load the CSV file into a pandas DataFrame
df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text
df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

# Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

# Print the number of correctly classified samples
num_correct = (y_test == y_pred_mapped).sum()
print('Number of correctly classified samples:', num_correct)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 26108
Number of testing samples: 6528
Accuracy: 0.3866421568627451
Classification report:
               precision    recall  f1-score   support

   happiness       0.45      0.34      0.38      1061
        hate       0.29      0.01      0.02       254
        love       0.55      0.28      0.37       785
     neutral       0.41      0.36      0.38      1708
     sadness       0.35      0.13      0.19      1007
       worry       0.35      0.70      0.46      1713

    accuracy                           0.39      6528
   macro avg       0.40      0.30      0.30      6528
weighted avg       0.40      0.39      0.36      6528

Number of correctly classified samples: 2524


## (2)Stemming with Corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer class
stemmer = PorterStemmer()


# Load the CSV file into a pandas DataFrame
df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    text = ' '.join(words)
    
    return text

df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

# Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

# Print the number of correctly classified samples
num_correct = (y_test == y_pred_mapped).sum()
print('Number of correctly classified samples:', num_correct)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 26108
Number of testing samples: 6528
Accuracy: 0.3874080882352941
Classification report:
               precision    recall  f1-score   support

   happiness       0.44      0.33      0.38      1061
        hate       0.33      0.01      0.02       254
        love       0.56      0.28      0.37       785
     neutral       0.40      0.37      0.39      1708
     sadness       0.37      0.14      0.20      1007
       worry       0.35      0.69      0.46      1713

    accuracy                           0.39      6528
   macro avg       0.41      0.30      0.30      6528
weighted avg       0.41      0.39      0.36      6528

Number of correctly classified samples: 2529


## (3) Negation with corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer class
stemmer = PorterStemmer()


# Load the CSV file into a pandas DataFrame
df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])
# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)
# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming handling negations
    words = text.split()
    processed_words = []
    negate = False
    for i in range(len(words)):
        word = words[i]
        if word in stop_words:
            continue
        if negate:
            processed_words.append('NOT_' + stemmer.stem(word))
            negate = False
        elif i > 0 and words[i-1] in ['not', "n't", 'no', 'never']:
            processed_words.append('NOT_' + stemmer.stem(word))
        else:
            processed_words.append(stemmer.stem(word))
        if word in ['not', "n't", 'no', 'never']:
            negate = True
    
    text = ' '.join(processed_words)
    
    return text

df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

# Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

# Print the number of correctly classified samples
num_correct = (y_test == y_pred_mapped).sum()
print('Number of correctly classified samples:', num_correct)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 26108
Number of testing samples: 6528
Accuracy: 0.3874080882352941
Classification report:
               precision    recall  f1-score   support

   happiness       0.45      0.34      0.39      1061
        hate       0.33      0.01      0.02       254
        love       0.55      0.28      0.37       785
     neutral       0.41      0.37      0.39      1708
     sadness       0.36      0.13      0.20      1007
       worry       0.35      0.69      0.46      1713

    accuracy                           0.39      6528
   macro avg       0.41      0.30      0.30      6528
weighted avg       0.41      0.39      0.36      6528

Number of correctly classified samples: 2529


## (4) Removing mentions and Hastags with corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer class
stemmer = PorterStemmer()


# Load the CSV file into a pandas DataFrame
df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
     # Remove mentions (words starting with @)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # Remove hashtags (words starting with #)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)

    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming
    words = text.split()
    processed_words = []
    negate = False
    for i in range(len(words)):
        word = words[i]
        if word in stop_words:
            continue
        if negate:
            processed_words.append('NOT_' + stemmer.stem(word))
            negate = False
        elif i > 0 and words[i-1] in ['not', "n't", 'no', 'never']:
            processed_words.append('NOT_' + stemmer.stem(word))
        else:
            processed_words.append(stemmer.stem(word))
        if word in ['not', "n't", 'no', 'never']:
            negate = True
    
    text = ' '.join(processed_words)
    
    return text

df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

# Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

# Print the number of correctly classified samples
num_correct = (y_test == y_pred_mapped).sum()
print('Number of correctly classified samples:', num_correct)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 26108
Number of testing samples: 6528
Accuracy: 0.3874080882352941
Classification report:
               precision    recall  f1-score   support

   happiness       0.45      0.34      0.39      1061
        hate       0.33      0.01      0.02       254
        love       0.55      0.28      0.37       785
     neutral       0.41      0.37      0.39      1708
     sadness       0.36      0.13      0.20      1007
       worry       0.35      0.69      0.46      1713

    accuracy                           0.39      6528
   macro avg       0.41      0.30      0.30      6528
weighted avg       0.41      0.39      0.36      6528

Number of correctly classified samples: 2529


## (5) Ada boosting with corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer
from sklearn.ensemble import AdaBoostClassifier

# Initialize the PorterStemmer class
stemmer = PorterStemmer()


# Load the CSV file into a pandas DataFrame
df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
     # Remove mentions (words starting with @)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # Remove hashtags (words starting with #)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)

    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming
    words = text.split()
    processed_words = []
    negate = False
    for i in range(len(words)):
        word = words[i]
        if word in stop_words:
            continue
        if negate:
            processed_words.append('NOT_' + stemmer.stem(word))
            negate = False
        elif i > 0 and words[i-1] in ['not', "n't", 'no', 'never']:
            processed_words.append('NOT_' + stemmer.stem(word))
        else:
            processed_words.append(stemmer.stem(word))
        if word in ['not', "n't", 'no', 'never']:
            negate = True
    
    text = ' '.join(processed_words)
    
    return text

df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set using AdaBoost
base_estimator = MultinomialNB()
classifier = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

#Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 26108
Number of testing samples: 6528




Accuracy: 0.36580882352941174


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report:
               precision    recall  f1-score   support

   happiness       0.48      0.07      0.12      1061
        hate       0.00      0.00      0.00       254
        love       0.60      0.28      0.38       785
     neutral       0.34      0.74      0.46      1708
     sadness       0.56      0.01      0.03      1007
       worry       0.37      0.47      0.41      1713

    accuracy                           0.37      6528
   macro avg       0.39      0.26      0.23      6528
weighted avg       0.42      0.37      0.30      6528



### (6) USing CNN insted of Naive with corrected sentiments 

In [None]:
import pandas as pd
import re
import nltk
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
     # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
     # Remove mentions (words starting with @)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # Remove hashtags (words starting with #)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)

    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming
    words = text.split()
    processed_words = []
    negate = False
    for i in range(len(words)):
        word = words[i]
        if word in stop_words:
            continue
        if negate:
            processed_words.append('NOT_' + stemmer.stem(word))
            negate = False
        elif i > 0 and words[i-1] in ['not', "n't", 'no', 'never']:
            processed_words.append('NOT_' + stemmer.stem(word))
        else:
            processed_words.append(stemmer.stem(word))
        if word in ['not', "n't", 'no', 'never']:
            negate = True
    
    text = ' '.join(processed_words)
    
    return text

df = pd.read_csv('tweet_emotions.csv', header=None, names=['id', 'sentiment', 'content'])
df['content'] = df['content'].apply(preprocess_text)

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Tokenize the text and pad the sequences
max_words = 10000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df['content'])

X = tokenizer.texts_to_sequences(df['content'])
X = pad_sequences(X, padding='post')
y = pd.get_dummies(df['sentiment']).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the CNN model
embedding_dim = 100
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=X.shape[1]),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(30, activation='relu'),
    Dense(y.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the CNN model
epochs = 5
batch_size = 32
model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# Evaluate the performance
y_pred = model.predict(X_test)
y_pred_mapped = np.argmax(y_pred, axis=1)
y_test_mapped = np.argmax(y_test, axis=1)

accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test_mapped, y_pred_mapped))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 24, 100)           1000000   
                                                                 
 conv1d (Conv1D)             (None, 20, 128)           64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 30)                3870      
                                                                 
 dense_1 (Dense)             (None, 6)                 186       
                                                                 
Total params: 1,068,184
Trainable params: 1,068,184
Non-trainable params: 0
______________________________________________

## (7)Using a bigger database by connecting 2 databases  with corrected sentiments

In [None]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer

# Initialize the PorterStemmer class
stemmer = PorterStemmer()


# Load the CSV file into a pandas DataFrame
df = pd.read_csv('combined_data.csv', header=None, names=['sentiment', 'content','Original Content'])

# Keep only the specified sentiment labels
allowed_sentiments = ['neutral', 'hate', 'happiness', 'sadness', 'worry', 'love']
df = df[df['sentiment'].isin(allowed_sentiments)]

# Reset the index after filtering the DataFrame
df.reset_index(drop=True, inplace=True)

# Preprocess the tweet content by removing special characters, stop words, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9\s]', '', text)
    
     # Remove mentions (words starting with @)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    
    # Remove hashtags (words starting with #)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)

    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words and perform stemming
    words = text.split()
    processed_words = []
    negate = False
    for i in range(len(words)):
        word = words[i]
        if word in stop_words:
            continue
        if negate:
            processed_words.append('NOT_' + stemmer.stem(word))
            negate = False
        elif i > 0 and words[i-1] in ['not', "n't", 'no', 'never']:
            processed_words.append('NOT_' + stemmer.stem(word))
        else:
            processed_words.append(stemmer.stem(word))
        if word in ['not', "n't", 'no', 'never']:
            negate = True
    
    text = ' '.join(processed_words)
    
    return text

df['content'] = df['content'].apply(preprocess_text)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['content'], df['sentiment'], test_size=0.2, random_state=42)

# Print the number of training and testing samples
print('Number of training samples:', len(X_train))
print('Number of testing samples:', len(X_test))

# Convert the tweet content into a bag-of-words representation
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training set
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the sentiment labels of the tweets in the testing set
y_pred_mapped = classifier.predict(X_test)

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred_mapped)
print('Accuracy:', accuracy)
print('Classification report:\n', classification_report(y_test, y_pred_mapped))

# Create a DataFrame with the testing data and predicted emotion
# Add the predicted sentiment to the original DataFrame
df_test = pd.DataFrame({'content': df.iloc[y_test.index]['content'].values, 'predicted_sentiment': y_pred_mapped})
df_test['id'] = df.iloc[y_test.index]['id'].values
df_test['sentiment'] = df.iloc[y_test.index]['sentiment'].values

# Save the predicted sentiment DataFrame to a CSV file
df_test.to_csv('comparisonNaive.csv', index=False, columns=['id', 'sentiment', 'content', 'predicted_sentiment'])

# Print the number of correctly classified samples
num_correct = (y_test == y_pred_mapped).sum()
print('Number of correctly classified samples:', num_correct)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Number of training samples: 759368
Number of testing samples: 189843
Accuracy: 0.8252819435006822
Classification report:
               precision    recall  f1-score   support

   happiness       0.86      0.90      0.88     61399
        hate       0.86      0.76      0.81     60197
        love       0.65      0.01      0.03       753
     neutral       0.31      0.01      0.02      1735
     sadness       0.77      0.87      0.82     64013
       worry       0.29      0.01      0.02      1746

    accuracy                           0.83    189843
   macro avg       0.62      0.43      0.43    189843
weighted avg       0.82      0.83      0.82    189843



KeyError: ignored