In [None]:
import pandas as pd

# header=None: the first line of each file is read as data, not as column names.
train_data, validation_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('twitter_training.csv', 'twitter_validation.csv')
)

In [None]:
# Give both frames the same four column labels.
column_names = ['id', 'topic', 'sentiment', 'text']
for frame in (train_data, validation_data):
    frame.columns = column_names

In [None]:
# Sanity check: show the column index of the training frame, then the validation frame.
for frame in (train_data, validation_data):
    print(frame.columns)

In [None]:
# Preview the first rows of the training frame, then the validation frame.
for frame in (train_data, validation_data):
    print(frame.head())

## Preprocess the Data

In [None]:
import re

def clean_text(text):
    """Normalise a raw tweet into lowercase, single-space-separated word tokens.

    Steps, in order: drop URLs (anything starting with ``http``), drop
    ``@mentions``, drop ``#hashtags`` (the whole tag, not only the ``#``),
    replace every remaining non-word character with a space, lowercase, and
    finally collapse whitespace.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``;
            ``None`` and float ``NaN`` are treated as missing text.

    Returns:
        str: The cleaned text, with exactly one space between tokens and no
        leading or trailing whitespace. ``''`` when the input is missing.
    """
    # Missing values must not be stringified: str(None) / str(nan) would inject
    # the fake tokens "none" / "nan" into the corpus. NaN is the only float
    # value that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub(r'@\w+', '', text)     # user mentions
    text = re.sub(r'#\w+', '', text)     # hashtags
    text = re.sub(r'\W', ' ', text)      # punctuation / symbols -> space
    text = text.lower()
    # The removals above leave runs of spaces and stray leading/trailing
    # whitespace behind; split/join squeezes them to single separators.
    return ' '.join(text.split())

In [None]:
# Ensure 'text' column is of string type and handle missing values.
# Order matters: fill missing values *before* casting. astype(str) turns NaN
# into the literal string 'nan', after which fillna('') has nothing left to fill.
train_data['text'] = train_data['text'].fillna('').astype(str)
validation_data['text'] = validation_data['text'].fillna('').astype(str)

In [None]:
# Derive a 'cleaned_text' column in each frame by running clean_text over every 'text' value.
for frame in (train_data, validation_data):
    frame['cleaned_text'] = frame['text'].apply(clean_text)

In [None]:
# Preview both frames again, now including the 'cleaned_text' column.
for frame in (train_data, validation_data):
    print(frame.head())

## Sentiment Analysis

In [None]:
from transformers import pipeline

# Load the sentiment analysis pipeline with an explicitly named checkpoint.
# Without `model=`, the task falls back to a library-chosen default, so results
# can change with the installed transformers version and a "No model was
# supplied" warning is emitted.
# NOTE(review): this is the checkpoint the Hugging Face docs list as the
# 'sentiment-analysis' default at the time of writing; confirm before relying on it.
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
)

In [None]:
# Apply the sentiment analysis pipeline to the cleaned text.
# The whole column is handed to the pipeline in one call so it can batch the
# forward passes, instead of paying per-call overhead once per row via .apply.
# truncation=True makes the tokenizer cut over-long inputs to the model's
# maximum length rather than raising on them.
for frame in (train_data, validation_data):
    predictions = sentiment_pipeline(
        frame['cleaned_text'].tolist(),
        batch_size=32,
        truncation=True,
    )
    # The pipeline returns one {'label': ..., 'score': ...} dict per input, in
    # input order, so the list lines up with the frame's rows.
    frame['predicted_sentiment'] = [prediction['label'] for prediction in predictions]

In [None]:
# Preview both frames with the new 'predicted_sentiment' column.
for frame in (train_data, validation_data):
    print(frame.head())

## Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Set the style for the visualizations.
# set_theme is the preferred seaborn entry point; sns.set is kept only as an alias for it.
sns.set_theme(style="darkgrid")

In [None]:
# Bar chart: number of training rows per value of the 'sentiment' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_data, x='sentiment', ax=ax)
ax.set_title('Sentiment Distribution in Training Data')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bar chart: number of validation rows per value of the 'sentiment' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=validation_data, x='sentiment', ax=ax)
ax.set_title('Sentiment Distribution in Validation Data')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pool the cleaned training tweets of each labelled class into one long string per class.
is_positive = train_data['sentiment'] == 'Positive'
is_negative = train_data['sentiment'] == 'Negative'
positive_text = ' '.join(train_data.loc[is_positive, 'cleaned_text'])
negative_text = ' '.join(train_data.loc[is_negative, 'cleaned_text'])  # used by the negative word-cloud cell

# Render the word cloud for the 'Positive' rows on a white background.
wordcloud_positive = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_positive, interpolation='bilinear')
ax.set_title('Word Cloud for Positive Sentiment')
ax.axis('off')
plt.show()

In [None]:
# Render the word cloud built from negative_text on a black background.
wordcloud_negative = WordCloud(width=800, height=400, background_color='black').generate(negative_text)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_negative, interpolation='bilinear')
ax.set_title('Word Cloud for Negative Sentiment')
ax.axis('off')
plt.show()