In [None]:
# Disabled: drop rows whose 'text' is missing, then renumber the index.
#df.dropna(subset=['text'], inplace=True)  # Drop rows where tweets are missing
#df.reset_index(drop=True, inplace=True)

In [None]:
import spacy

# Load the small English spaCy pipeline once at module level
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

def clean_tweet_spacy(text):
    """Normalize one tweet into a space-separated string of lemmas.

    The text is lowercased, stripped of URLs, @mentions, #hashtags and
    punctuation, then passed through the spaCy pipeline ``nlp``. The lemma
    of every token that is neither a stop word nor pure whitespace is kept.

    Args:
        text: Raw tweet text. Any value that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty tweet instead of raising.

    Returns:
        str: The cleaned tweet, or ``""`` when the input is not a string
        or nothing survives cleaning.
    """
    # Missing cells arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions (@user)
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    doc = nlp(text)  # Process text using spaCy
    # The removals above leave runs of spaces, which spaCy emits as separate
    # whitespace tokens; skip them so they do not end up in the output.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_space)
    ]
    return " ".join(tokens)

# Clean every tweet, then preview raw and cleaned text side by side
df['cleaned_tweet'] = df['text'].map(clean_tweet_spacy)
print(df.head()[['text', 'cleaned_tweet']])

In [None]:
# Expected to print "object", the dtype pandas uses for Python strings
print(df.dtypes['cleaned_tweet'])

In [None]:
# Print the dtype of every column in df
print(df.dtypes)  # Check data types of all columns

In [None]:
# Preview the first five cleaned tweets
print(df['cleaned_tweet'].iloc[:5])

In [None]:
# Number of rows whose cleaned tweet is missing
print(df['cleaned_tweet'].isna().sum())

In [None]:
# Fill missing values BEFORE casting: astype(str) would turn NaN into the
# literal string "nan", which a later fillna("") can no longer detect.
df['cleaned_tweet'] = df['cleaned_tweet'].fillna("").astype(str)

In [None]:
# Join all tweets into a single string. Missing values are dropped and the
# rest cast to str so that str.join never receives a non-string element.
all_words = ' '.join(df['cleaned_tweet'].dropna().astype(str))
print(type(all_words))  # Should be <class 'str'>
print(all_words[:500])  # Print first 500 characters to check for issues


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Generate and plot the WordCloud.
# WordCloud.generate() raises ValueError when the text contains no words, so
# the emptiness check must run before it, and plotting must be skipped when
# there is nothing to draw.
if not all_words.strip():  # Check if string is empty
    print("Error: No valid text found for WordCloud")
else:
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        collocations=False,  # count single words only, no bigrams
    ).generate(all_words)

    # Plot the WordCloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()


In [None]:
from collections import Counter

# Tally how often each whitespace-separated token occurs across all tweets
word_list = all_words.split()
word_counts = Counter()
word_counts.update(word_list)

# Ten most frequent tokens as (word, count) pairs
print(word_counts.most_common(10))

In [None]:
import seaborn as sns

# Tweet length = number of whitespace-separated tokens in the cleaned tweet
df['tweet_length'] = df['cleaned_tweet'].map(lambda tweet: len(tweet.split()))

# histplot returns the Axes it drew on, so labels and title are set on it directly
sns.histplot(df['tweet_length'], bins=30, kde=True).set(
    xlabel='Tweet Length',
    ylabel='Frequency',
    title='Distribution of Tweet Lengths',
)
plt.show()

In [None]:
pip install textblob

In [None]:
from textblob import TextBlob

def get_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    Args:
        text: The string to score.

    Returns:
        The ``sentiment.polarity`` value TextBlob computes for ``text``,
        a score between -1 (negative) and 1 (positive).
    """
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    return polarity

# Score each cleaned tweet with get_sentiment.
# NOTE(review): the cleaned text has had stop words removed; if spaCy's stop
# list includes negations such as "not", polarity may be skewed. Confirm
# whether scoring the raw 'text' column would be more appropriate.
df['sentiment'] = df['cleaned_tweet'].map(get_sentiment)

# histplot returns the Axes it drew on, so labels and title are set on it directly
sns.histplot(df['sentiment'], bins=30, kde=True).set(
    xlabel='Sentiment Score',
    ylabel='Frequency',
    title='Tweet Sentiment Distribution',
)
plt.show()