In [None]:
# Import necessary libraries
# Standard library
import re
import string
import warnings

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import textblob
from nltk.stem.porter import *
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from wordcloud import WordCloud
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
# Read the tweets CSV (path is relative to the current working directory).
dataset_path = 'Twitter_Sentiments.csv'
df = pd.read_csv(dataset_path, encoding='utf-8')

# Preview the first five rows as the cell output.
df.head()

In [None]:
# Define a function to remove patterns from text
def remove_pattern(input_txt, pattern):
    """
    Removes every match of a regular-expression pattern from a text.

    Args:
        input_txt: The input text (str).
        pattern: The regular expression (str or compiled pattern) to remove.

    Returns:
        The input text with every non-overlapping match of ``pattern`` deleted.
    """
    # Substitute with the pattern itself. Feeding each matched substring back
    # into re.sub as a new regex misbehaves when the match contains
    # metacharacters ("+", "(", "?", ...), when the pattern has capture groups
    # (findall then returns the groups, not the matches), and when a short
    # match such as a lone "@" is a prefix of a later, longer match.
    return re.sub(pattern, "", input_txt)


In [None]:
# Clean the tweets
# 1) Strip @user handles. The pattern is a raw string so that "\w" reaches the
#    regex engine without relying on Python passing an unknown escape through.
df['clean_tweet'] = np.vectorize(remove_pattern)(df['tweet'], r"@[\w]*")

# 2) Replace every run of characters other than ASCII letters, "#", spaces and
#    the code points U+1F600-U+1F64F with a single space. regex=True is
#    required: pandas >= 2.0 treats the pattern as a literal string by default,
#    which would leave the text untouched.
df['clean_tweet'] = df['clean_tweet'].str.replace(
    "[^a-zA-Z# \U0001F600-\U0001F64F]+", " ", regex=True
)

# 3) Drop words of three characters or fewer.
df['clean_tweet'] = df['clean_tweet'].apply(
    lambda text: " ".join(w for w in text.split() if len(w) > 3)
)
df.head()

In [None]:
# Tokenize the tweets on whitespace
tokenized_tweet = df['clean_tweet'].apply(lambda x: x.split())

# Stem the tokens (PorterStemmer comes from the nltk.stem.porter import at the
# top of the notebook)
stemmer = PorterStemmer()
tokenized_tweet = tokenized_tweet.apply(lambda sentence: [stemmer.stem(word) for word in sentence])

# Join the tokens back into sentences. A vectorized join keeps the Series index
# intact; assigning with tokenized_tweet[i] inside a range(len(...)) loop is
# label-based and fails (or hits the wrong rows) once the index is not 0..n-1.
tokenized_tweet = tokenized_tweet.apply(" ".join)
df['clean_tweet'] = tokenized_tweet
df.head()

In [None]:
# Generate a word cloud for all tweets
# WordCloud is imported in the imports cell at the top of the notebook, so the
# later word-cloud cells no longer depend on this cell having been run first.
# Concatenate every cleaned tweet into one string for WordCloud.generate.
all_words = " ".join(df['clean_tweet'])
wordcloud = WordCloud(width=800, height=500, random_state=42, max_font_size=100).generate(all_words)
plt.figure(figsize=(15,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show() # Display the word cloud

In [None]:
# Word cloud for the positive tweets (rows whose label is 0)
is_positive = df['label'] == 0
pos_words = " ".join(df.loc[is_positive, 'clean_tweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(pos_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud for the negative tweets (rows whose label is 1)
is_negative = df['label'] == 1
neg_words = " ".join(df.loc[is_negative, 'clean_tweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=42,
    max_font_size=100,
).generate(neg_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(15, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Define a function to extract hashtags from tweets
def hashtag_extract(tweets):
    """
    Collects the hashtags found in each tweet.

    Args:
        tweets: An iterable of tweet strings.

    Returns:
        A list with one entry per tweet; each entry is the list of hashtag
        names (without the leading "#") found in that tweet, in order.
    """
    hashtag_re = re.compile(r"#(\w+)")
    return [hashtag_re.findall(text) for text in tweets]


In [None]:
# Extract hashtags from positive (label 0) and negative (label 1) tweets;
# each result is a list holding one list of hashtags per tweet.
ht_positive = hashtag_extract(df['clean_tweet'][df['label'] == 0])
ht_negative = hashtag_extract(df['clean_tweet'][df['label'] == 1])

# Flatten the per-tweet lists into one list of hashtags each. A nested
# comprehension is linear in the number of hashtags, whereas sum(lists, [])
# copies the accumulated list on every addition (quadratic).
ht_positive = [tag for tags in ht_positive for tag in tags]
ht_negative = [tag for tags in ht_negative for tag in tags]

In [None]:
# Bar chart of the ten most frequent hashtags in the positive tweets
freq = nltk.FreqDist(ht_positive)
d = pd.DataFrame({
    'Hashtag': list(freq.keys()),
    'Count': list(freq.values()),
})
d = d.nlargest(10, 'Count')  # keep only the ten highest counts

fig, ax = plt.subplots(figsize=(15, 9))
sns.barplot(data=d, x='Hashtag', y='Count', ax=ax)
plt.show()

In [None]:
# Bar chart of the ten most frequent hashtags in the negative tweets
freq = nltk.FreqDist(ht_negative)
d = pd.DataFrame({
    'Hashtag': list(freq.keys()),
    'Count': list(freq.values()),
})
d = d.nlargest(10, 'Count')  # keep only the ten highest counts

fig, ax = plt.subplots(figsize=(15, 9))
sns.barplot(data=d, x='Hashtag', y='Count', ax=ax)
plt.show()

## Logistic Regression

In [None]:
# Bag-of-words features. The scikit-learn classes and helpers used here and in
# the following model cells are imported in the imports cell at the top.
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# NOTE(review): the vocabulary is fitted on the whole dataset before the
# train/test split, so test tweets influence feature selection. Kept as is
# because the later model cells reuse `bow`; consider fitting on the training
# text only.
bow = bow_vectorizer.fit_transform(df['clean_tweet'])

# Train a logistic regression model on a 75/25 split (fixed seed so the later
# model cells, which repeat this call, get the same split).
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)
# max_iter is raised from the default of 100: if the solver stops before
# converging, the ConvergenceWarning is hidden by the warnings filter set in
# the imports cell, and the reported scores would come from an unconverged fit.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Evaluate the model on the held-out split
pred = model.predict(x_test)
print('F1 Score:', f1_score(y_test, pred))
print('Accuracy:', accuracy_score(y_test, pred))

## Support Vector Machine (SVM)

In [None]:
# Train a support vector machine model (SVC is imported in the imports cell at
# the top). The split repeats the logistic-regression cell's call with the same
# seed, so both models are scored on the same held-out rows.
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)
# NOTE(review): a linear-kernel SVC can be slow on large samples; LinearSVC is
# an alternative if training time matters (results may differ slightly).
model = SVC(kernel='linear', random_state=0)
model.fit(x_train, y_train)

# Evaluate the model on the held-out split
pred = model.predict(x_test)
print('F1 Score:', f1_score(y_test, pred))
print('Accuracy:', accuracy_score(y_test, pred))

## Naive Bayes

In [None]:
# Train a Naive Bayes model (MultinomialNB is imported in the imports cell at
# the top). The split repeats the earlier model cells' call with the same seed,
# so all models are scored on the same held-out rows.
x_train, x_test, y_train, y_test = train_test_split(bow, df['label'], random_state=42, test_size=0.25)
model = MultinomialNB()
model.fit(x_train, y_train)

# Evaluate the model on the held-out split
pred = model.predict(x_test)
print('F1 Score:', f1_score(y_test, pred))
print('Accuracy:', accuracy_score(y_test, pred))