In [1]:
pip install pandas numpy matplotlib seaborn nltk wordcloud


Collecting wordcloud
  Downloading wordcloud-1.9.4-cp39-cp39-win_amd64.whl (300 kB)
     -------------------------------------- 300.4/300.4 kB 6.2 MB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from collections import Counter
import nltk

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load the dataset
file_path = 'twitter_training.csv'  # Update with your dataset path
df = pd.read_csv(file_path)

# Display column names
print("Columns in Dataset:")
print(df.columns, "\n")

# Check the first few rows
print("Dataset Preview:")
print(df.head(), "\n")

# Check dataset information
print("\nDataset Info:")
print(df.info())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())

# Drop missing values (if any)
df.dropna(inplace=True)

# Ensure the sentiment column exists and matches
# Replace 'sentiment' with the actual column name if different
if 'sentiment' not in df.columns:
    print("\nError: The 'sentiment' column does not exist. Check column names.")
else:
    # Display sentiment distribution
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x='sentiment', palette='Set2')
    plt.title('Sentiment Distribution')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()

    # Tokenize tweets and clean text
    stop_words = set(stopwords.words('english'))

    def clean_text(text):
        tokens = word_tokenize(text.lower())
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        return tokens

    df['cleaned_text'] = df['text'].apply(clean_text)

    # Generate WordCloud for each sentiment
    for sentiment in df['sentiment'].unique():
        sentiment_data = df[df['sentiment'] == sentiment]
        all_words = [word for tokens in sentiment_data['cleaned_text'] for word in tokens]
        word_freq = Counter(all_words)

        wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='viridis').generate_from_frequencies(word_freq)
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'WordCloud for {sentiment} Sentiment')
        plt.show()

    # Sentiment distribution by tweet length
    df['text_length'] = df['text'].apply(len)
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=df, x='sentiment', y='text_length', palette='coolwarm')
    plt.title('Tweet Length by Sentiment')
    plt.xlabel('Sentiment')
    plt.ylabel('Tweet Length')
    plt.show()

    # Most frequent words for each sentiment
    print("\nMost Common Words by Sentiment:")
    for sentiment in df['sentiment'].unique():
        sentiment_data = df[df['sentiment'] == sentiment]
        all_words = [word for tokens in sentiment_data['cleaned_text'] for word in tokens]
        most_common_words = Counter(all_words).most_common(10)
        print(f"\nTop Words for {sentiment} Sentiment:")
        for word, freq in most_common_words:
            print(f"{word}: {freq}")

    # Sentiment-wise average tweet length
    avg_tweet_length = df.groupby('sentiment')['text_length'].mean()
    print("\nAverage Tweet Length by Sentiment:")
    print(avg_tweet_length)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Columns in Dataset:
Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object') 

Dataset Preview:
   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...      


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 -----------