In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Fetch the NLTK data packages this script relies on (stop-word list,
# WordNet for lemmatization, punkt for tokenization)
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Read the tweets and let pandas print every cell without truncation
df = pd.read_csv('Tweets.csv')
pd.set_option('display.max_colwidth', None)

# --- Exploratory data analysis ---
print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns.tolist())
print("\nSample data:")
print(df.head())

# Number of missing entries per column
print("\nMissing values in each column:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of tweets per sentiment label
print("\nSentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Write a bar chart of the label counts to disk (figure is closed afterwards)
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='sentiment')
plt.title('Sentiment Distribution in Tweets')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.savefig('sentiment_distribution.png')
plt.close()

# Text preprocessing function
# Resources that are expensive to build and identical for every tweet. They are
# filled lazily on first use so that defining the function never touches the
# NLTK data files.
_PREPROCESS_CACHE = {}

# `www` must start a token and be followed by a dot: the previous pattern
# `www\S+` also fired inside elongated words such as "awwww" and ate them.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#(\w+)')
_DIGIT_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs and @mentions; drop the '#' from
    hashtags (the tag word is kept); delete ASCII punctuation and digit runs;
    tokenize with NLTK; remove English stop words; lemmatize with WordNet.

    Args:
        text: The raw tweet. Non-string values (for example NaN in a row with
            missing text) are accepted.

    Returns:
        The cleaned text, or '' when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub(r'\1', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_PATTERN.sub('', text)

    # Loading the stop-word list and constructing the lemmatizer on every call
    # dominated the runtime when mapped over the whole dataset; reuse them.
    stop_words, lemmatizer = _preprocess_resources()

    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Apply preprocessing to 'text' column
print("\nPreprocessing text data...")
df['processed_text'] = df['text'].apply(preprocess_text)

# Display a sample of preprocessed text
print("\nSample of preprocessed text:")
print(df[['text', 'processed_text']].head())

y = df['sentiment']

# Split the cleaned *text* before any feature extraction. Fitting TF-IDF on
# the full corpus first would let test-set vocabulary and document
# frequencies leak into the training features and inflate the scores.
# `stratify=y` keeps the label proportions equal in both splits.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction using TF-IDF: learn vocabulary/IDF on the training
# split only, then apply the fitted transform to the test split.
print("\nExtracting features using TF-IDF...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-corpus feature matrix, kept under its original name for any code that
# still refers to `X`; it is produced by the train-fitted vectorizer.
X = tfidf_vectorizer.transform(df['processed_text'])

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, report its test-set metrics and save a confusion matrix.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Human-readable name; used in the printed header, the plot
            title and (lower-cased, spaces replaced by underscores) the PNG
            file name.

    Returns:
        Tuple ``(fitted_model, accuracy)`` where ``accuracy`` is the
        test-set accuracy.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Take the label order from the fitted model instead of the global `y`:
    # the same list then drives both the matrix layout and the tick labels,
    # so they cannot disagree when a class is absent from the test split, and
    # the function no longer depends on a module-level variable.
    labels = getattr(model, 'classes_', None)
    if labels is None:
        labels = np.unique(
            np.concatenate([np.asarray(y_train), np.asarray(y_test), np.asarray(y_pred)])
        )
    labels = list(labels)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    print(f"\n{model_name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(report)

    # Save the confusion matrix as a heatmap and release the figure
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels,
                yticklabels=labels)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_").lower()}.png')
    plt.close()

    return model, accuracy

# Fit each candidate classifier and collect its test accuracy
print("\nTraining and evaluating models...")

model_specs = [
    ("Naive Bayes", MultinomialNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Linear SVC", LinearSVC(max_iter=10000)),
]
fitted_results = [
    evaluate_model(estimator, X_train, X_test, y_train, y_test, display_name)
    for display_name, estimator in model_specs
]
(nb_model, nb_accuracy), (lr_model, lr_accuracy), (svc_model, svc_accuracy) = fitted_results

# Parallel lists: model display names and their accuracies
models = [display_name for display_name, _ in model_specs]
accuracies = [nb_accuracy, lr_accuracy, svc_accuracy]

# Bar chart of the accuracies, each bar annotated with its value
plt.figure(figsize=(10, 6))
sns.barplot(x=models, y=accuracies)
plt.title('Model Accuracy Comparison')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
for bar_position, bar_height in enumerate(accuracies):
    plt.text(bar_position, bar_height + 0.01, f'{bar_height:.4f}', ha='center')
plt.savefig('model_comparison.png')
plt.close()

# Pick the highest-accuracy model (the first one wins a tie)
best_accuracy = max(accuracies)
best_model_index = accuracies.index(best_accuracy)
best_model_name = models[best_model_index]
best_model = (nb_model, lr_model, svc_model)[best_model_index]

print(f"\nThe best performing model is {best_model_name} with an accuracy of {max(accuracies):.4f}")

# Function to predict sentiment of new tweets
def predict_sentiment(tweets, model=best_model, vectorizer=tfidf_vectorizer):
    """Predict the sentiment label of one or more raw tweets.

    Args:
        tweets: An iterable of raw tweet strings. A single string is treated
            as one tweet rather than being iterated character by character.
        model: Fitted classifier; defaults to the best model selected when
            this function was defined.
        vectorizer: Fitted vectorizer that produced the model's training
            features.

    Returns:
        Array of predicted labels, one per tweet, in input order. An empty
        input yields an empty array without calling the model.
    """
    # A bare string is itself iterable; without this guard every character
    # would be classified as a separate "tweet".
    if isinstance(tweets, str):
        tweets = [tweets]

    processed_tweets = [preprocess_text(tweet) for tweet in tweets]

    # Nothing to classify: return early rather than passing the estimator a
    # zero-row matrix.
    if not processed_tweets:
        return np.array([])

    # Reuse the already-fitted vectorizer so features line up with training
    X_new = vectorizer.transform(processed_tweets)
    return model.predict(X_new)

# Demo: classify a few hand-written tweets with the selected model
new_tweets = [
    "I love this product, it's amazing!",
    "This is the worst experience ever, terrible service.",
    "I'm not sure how I feel about this, need more time to decide."
]

print("\nPredicting sentiment for new tweets:")
demo_predictions = predict_sentiment(new_tweets)
for tweet, sentiment in zip(new_tweets, demo_predictions):
    print(f"Tweet: '{tweet}'\nPredicted sentiment: {sentiment}\n")

# Feature importance analysis (for Logistic Regression): for every sentiment
# class, plot the 20 vocabulary terms with the largest coefficients.
if lr_accuracy > 0:
    print("\nAnalyzing feature importance (top 20 words for each sentiment)...")

    feature_names = tfidf_vectorizer.get_feature_names_out()

    # More than one coefficient row means one row per class (multiclass);
    # otherwise there is a single row, oriented towards classes_[1].
    has_row_per_class = isinstance(lr_model.coef_, np.ndarray) and lr_model.coef_.shape[0] > 1

    for sentiment_class in sorted(y.unique()):
        if has_row_per_class:
            sentiment_index = np.where(lr_model.classes_ == sentiment_class)[0][0]
            coef = lr_model.coef_[sentiment_index]
        elif sentiment_class != lr_model.classes_[1]:
            # Binary case, other class: flip the sign of the single row
            coef = -lr_model.coef_[0]
        else:
            coef = lr_model.coef_[0]

        # Indices of the 20 largest coefficients, in ascending order
        top_indices = np.argsort(coef)[-20:]
        top_words = []
        top_coefs = []
        for feature_idx in top_indices:
            top_words.append(feature_names[feature_idx])
            top_coefs.append(coef[feature_idx])

        # One horizontal bar chart per class, saved to its own file
        plt.figure(figsize=(10, 8))
        sns.barplot(x=top_coefs, y=top_words)
        plt.title(f'Top 20 Words for {sentiment_class.capitalize()} Sentiment')
        plt.xlabel('Coefficient')
        plt.ylabel('Word')
        plt.tight_layout()
        plt.savefig(f'top_words_{sentiment_class}.png')
        plt.close()

print("\nSentiment analysis project completed!")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset shape: (27481, 4)

Column names: ['textID', 'text', 'selected_text', 'sentiment']

Sample data:
       textID  \
0  cb774db0d1   
1  549e992a42   
2  088c60f138   
3  9642c003ef   
4  358bd9e861   

                                                                          text  \
0                                          I`d have responded, if I were going   
1                                Sooo SAD I will miss you here in San Diego!!!   
2                                                    my boss is bullying me...   
3                                               what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on the releases we already bought   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negat

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from imblearn.over_sampling import SMOTE

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers
np.random.seed(42)
tf.random.set_seed(42)

# Fetch the NLTK data packages used by the preprocessing function
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Read the tweets; show full cell contents when printing frames
print("Loading dataset...")
df = pd.read_csv('Tweets.csv')
pd.set_option('display.max_colwidth', None)

# Shape and first rows
print(f"Dataset shape: {df.shape}")
print("\nSample data:")
print(df.head())

# Missing entries per column
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Tweets per sentiment label
print("\nSentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Save the label-count bar chart and release the figure
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='sentiment')
plt.title('Sentiment Distribution in Tweets')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.savefig('sentiment_distribution.png')
plt.close()

# Text preprocessing function
# Resources that are expensive to build and identical for every tweet. They are
# filled lazily on first use so that defining the function never touches the
# NLTK data files.
_PREPROCESS_CACHE = {}

# `www` must start a token and be followed by a dot: the previous pattern
# `www\S+` also fired inside elongated words such as "awwww" and ate them.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#(\w+)')
_DIGIT_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs and @mentions; drop the '#' from
    hashtags (the tag word is kept); delete ASCII punctuation and digit runs;
    tokenize with NLTK; remove English stop words; lemmatize with WordNet.

    Args:
        text: The raw tweet. Non-string values (for example NaN in a row with
            missing text) are accepted.

    Returns:
        The cleaned text, or '' when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub(r'\1', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_PATTERN.sub('', text)

    # Loading the stop-word list and constructing the lemmatizer on every call
    # dominated the runtime when mapped over the whole dataset; reuse them.
    stop_words, lemmatizer = _preprocess_resources()

    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every tweet and keep the result next to the raw text
print("\nPreprocessing text data...")
df['processed_text'] = df['text'].apply(preprocess_text)

# Show raw vs. cleaned text for the first rows
print("\nSample of preprocessed text:")
sample_columns = ['text', 'processed_text']
print(df[sample_columns].head())

# Encode the string labels as integer class ids
label_mapping = dict(negative=0, neutral=1, positive=2)
df['sentiment_numeric'] = df['sentiment'].map(label_mapping)

# 80/20 stratified split of cleaned text and integer labels
X = df['processed_text'].values
y = df['sentiment_numeric'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Build the word index from the training texts only
print("\nTokenizing text...")
max_words = 10000  # Maximum number of words to keep
max_len = 50       # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> lists of word ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad (at the end) to a common length of `max_len`
pad_options = dict(maxlen=max_len, padding='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Check class distribution
train_class_counts = np.bincount(y_train)
print(f"\nTraining class distribution: {train_class_counts}")

# Hold out part of the training data for validation. Early stopping and
# checkpointing select the model on this split; using the test set for that
# (as before) makes the reported test accuracy optimistically biased.
X_fit_pad, X_val_pad, y_fit, y_val = train_test_split(
    X_train_pad, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# Balance classes by random oversampling (duplicating minority-class rows).
# SMOTE is not applicable here: the features are integer word ids, so
# interpolating between two tweets produces ids of unrelated words rather
# than a meaningful synthetic tweet. Only the fitting split is resampled so
# the validation split contains no duplicates of training rows.
print("\nApplying random oversampling to balance classes...")
rng = np.random.default_rng(42)
fit_class_counts = np.bincount(y_fit)
largest_class_size = fit_class_counts.max()
balanced_index_parts = []
for class_id, class_size in enumerate(fit_class_counts):
    if class_size == 0:
        continue  # label id unused in this split; nothing to sample from
    member_rows = np.flatnonzero(y_fit == class_id)
    extra_rows = rng.choice(member_rows, size=largest_class_size - class_size, replace=True)
    balanced_index_parts.append(np.concatenate([member_rows, extra_rows]))
balanced_rows = rng.permutation(np.concatenate(balanced_index_parts))
X_train_resampled = X_fit_pad[balanced_rows]
y_train_resampled = y_fit[balanced_rows]

resampled_class_counts = np.bincount(y_train_resampled)
print(f"Resampled class distribution: {resampled_class_counts}")

# Build LSTM model
print("\nBuilding LSTM model...")
embedding_dim = 200

# Embedding -> spatial dropout -> two stacked bidirectional LSTMs -> dense head
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # 3 classes: negative, neutral, positive

# Compile with Adam; the learning rate is stated explicitly.
# Labels are integer ids, hence the sparse categorical loss.
optimizer = Adam(learning_rate=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Display model summary
model.summary()

# Stop when validation accuracy has not improved for 5 epochs (restoring the
# best weights) and keep the best model on disk.
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
    ModelCheckpoint('best_lstm_model.h5', monitor='val_accuracy', save_best_only=True, mode='max')
]

# Train the model
print("\nTraining LSTM model...")
history = model.fit(
    X_train_resampled, y_train_resampled,
    validation_data=(X_val_pad, y_val),
    epochs=20,
    batch_size=128,
    callbacks=callbacks
)

# Score the trained network on the held-out test sequences
print("\nEvaluating model...")
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Class probabilities -> most probable class id per tweet
y_pred_prob = model.predict(X_test_pad)
y_pred = y_pred_prob.argmax(axis=1)

# Per-class precision/recall/F1, labelled in class-id order
print("\nClassification Report:")
target_names = ['negative', 'neutral', 'positive']
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)

# Confusion matrix heatmap, written to disk
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
)
plt.title('Confusion Matrix - LSTM Model')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_lstm.png')
plt.close()

# Training curves: accuracy on the left panel, loss on the right panel
history_panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
)
plt.figure(figsize=(12, 5))
for panel_number, (train_key, val_key, panel_title, y_label, legend_loc) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)
plt.tight_layout()
plt.savefig('training_history_lstm.png')
plt.close()

# Function to predict sentiment of new tweets
def predict_sentiment(tweets, model=model, tokenizer=tokenizer, max_len=max_len):
    """Predict sentiment labels and confidences for one or more raw tweets.

    Args:
        tweets: An iterable of raw tweet strings. A single string is treated
            as one tweet rather than being iterated character by character.
        model: Trained network whose ``predict`` returns one row of class
            probabilities per input sequence.
        tokenizer: Fitted tokenizer used to turn text into word-id sequences.
        max_len: Length the sequences are padded to (at the end).

    Returns:
        Tuple ``(sentiment_predictions, confidence_scores)``: a list of
        'negative'/'neutral'/'positive' strings and a list with the highest
        class probability for each tweet, both in input order. An empty
        input yields ``([], [])`` without calling the model.
    """
    # A bare string is itself iterable; without this guard every character
    # would be scored as a separate "tweet".
    if isinstance(tweets, str):
        tweets = [tweets]

    processed_tweets = [preprocess_text(tweet) for tweet in tweets]
    if not processed_tweets:
        return [], []

    # Same text -> ids -> padded-matrix pipeline as used for training
    sequences = tokenizer.texts_to_sequences(processed_tweets)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

    predictions_prob = model.predict(padded_sequences)
    predictions = np.argmax(predictions_prob, axis=1)

    # Map class ids back to their string labels
    reverse_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
    sentiment_predictions = [reverse_mapping[int(pred)] for pred in predictions]
    confidence_scores = [np.max(prob) for prob in predictions_prob]

    return sentiment_predictions, confidence_scores

# Demo: run the LSTM on a few hand-written tweets
new_tweets = [
    "I love this product, it's amazing!",
    "This is the worst experience ever, terrible service.",
    "I'm not sure how I feel about this, need more time to decide."
]

print("\nPredicting sentiment for new tweets:")
sentiments, confidences = predict_sentiment(new_tweets)
demo_rows = list(zip(new_tweets, sentiments, confidences))
for demo_row in demo_rows:
    tweet, sentiment, confidence = demo_row
    print(f"Tweet: '{tweet}'")
    print(f"Predicted sentiment: {sentiment} (Confidence: {confidence:.4f})\n")

# Persist the fitted tokenizer so the same word index can be reloaded later
import pickle
tokenizer_bytes = pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL)
with open('tokenizer.pickle', 'wb') as handle:
    handle.write(tokenizer_bytes)

print("\nSentiment analysis with LSTM completed!")

# Optional: stack the classical models and the LSTM under a meta-classifier
print("\nCreating an ensemble model for even higher accuracy...")

from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# TF-IDF features for the classical models (fit on the training texts only)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Base models fitted on the full training split; used for test-time features
print("Training traditional ML models for ensemble...")
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
svc_model = LinearSVC(max_iter=10000).fit(X_train_tfidf, y_train)

# Training-side meta-features come from out-of-fold predictions, so each row
# is scored by a model that never saw it. Feeding the meta-classifier
# in-sample outputs would teach it to trust over-confident base models.
nb_train_meta = cross_val_predict(
    MultinomialNB(), X_train_tfidf, y_train, cv=5, method='predict_proba'
)
lr_train_meta = cross_val_predict(
    LogisticRegression(max_iter=1000), X_train_tfidf, y_train, cv=5, method='decision_function'
)
svc_train_meta = cross_val_predict(
    LinearSVC(max_iter=10000), X_train_tfidf, y_train, cv=5, method='decision_function'
)

# Get LSTM predictions for training set.
# NOTE(review): these are predictions on data the network was trained on, so
# they are optimistic relative to the out-of-fold columns above; proper
# out-of-fold LSTM features would require retraining the network per fold.
lstm_train_preds = model.predict(X_train_pad)
lstm_train_preds_class = np.argmax(lstm_train_preds, axis=1)

# Meta-features: every class column of every base model. Previously only the
# class-1 Naive Bayes probability was used (dropping the other two classes)
# and the trained SVC was left out entirely.
meta_features_train = np.column_stack([
    nb_train_meta,      # NB class probabilities
    lr_train_meta,      # LR decision scores
    svc_train_meta,     # SVC decision scores
    lstm_train_preds    # LSTM class probabilities
])

meta_classifier = LogisticRegression(max_iter=1000)
meta_classifier.fit(meta_features_train, y_train)

# Test-side meta-features, in the same column order as the training ones
nb_preds = nb_model.predict_proba(X_test_tfidf)
lr_preds = lr_model.decision_function(X_test_tfidf)
svc_preds = svc_model.decision_function(X_test_tfidf)
lstm_preds = model.predict(X_test_pad)

meta_features_test = np.column_stack([nb_preds, lr_preds, svc_preds, lstm_preds])
ensemble_preds = meta_classifier.predict(meta_features_test)

# Evaluate ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_preds)
print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")

# Generate classification report for ensemble
print("\nEnsemble Classification Report:")
ensemble_report = classification_report(y_test, ensemble_preds, target_names=target_names)
print(ensemble_report)

# Generate confusion matrix for ensemble
ensemble_cm = confusion_matrix(y_test, ensemble_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(ensemble_cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix - Ensemble Model')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_ensemble.png')
plt.close()

print("\nSentiment analysis project with LSTM and ensemble modeling completed!")

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self-signed certificate in certificate chain
[nltk_data]     (_ssl.c:1006)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self-signed certificate in certificate chain
[nltk_data]     (_ssl.c:1006)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     self-signed certificate in certificate chain
[nltk_data]     (_ssl.c:1006)>


Loading dataset...
Dataset shape: (27481, 4)

Sample data:
       textID  \
0  cb774db0d1   
1  549e992a42   
2  088c60f138   
3  9642c003ef   
4  358bd9e861   

                                                                          text  \
0                                          I`d have responded, if I were going   
1                                Sooo SAD I will miss you here in San Diego!!!   
2                                                    my boss is bullying me...   
3                                               what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on the releases we already bought   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  

Missing values:
textID           0
tex

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from imblearn.over_sampling import SMOTE
import pickle

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers
np.random.seed(42)
tf.random.set_seed(42)

# Fetch the NLTK data packages used by the preprocessing function
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Read the tweets; show full cell contents when printing frames
print("Loading dataset...")
df = pd.read_csv('Tweets.csv')
pd.set_option('display.max_colwidth', None)

# Shape and first rows
print(f"Dataset shape: {df.shape}")
print("\nSample data:")
print(df.head())

# Missing entries per column
print("\nMissing values:")
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Tweets per sentiment label
print("\nSentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Save the label-count bar chart and release the figure
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='sentiment')
plt.title('Sentiment Distribution in Tweets')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.savefig('sentiment_distribution.png')
plt.close()

# Text preprocessing function
# Resources that are expensive to build and identical for every tweet. They are
# filled lazily on first use so that defining the function never touches the
# NLTK data files.
_PREPROCESS_CACHE = {}

# `www` must start a token and be followed by a dot: the previous pattern
# `www\S+` also fired inside elongated words such as "awwww" and ate them.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+')
_MENTION_PATTERN = re.compile(r'@\w+')
_HASHTAG_PATTERN = re.compile(r'#(\w+)')
_DIGIT_PATTERN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs and @mentions; drop the '#' from
    hashtags (the tag word is kept); delete ASCII punctuation and digit runs;
    tokenize with NLTK; remove English stop words; lemmatize with WordNet.

    Args:
        text: The raw tweet. Non-string values (for example NaN in a row with
            missing text) are accepted.

    Returns:
        The cleaned text, or '' when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_PATTERN.sub('', text)
    text = _MENTION_PATTERN.sub('', text)
    text = _HASHTAG_PATTERN.sub(r'\1', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGIT_PATTERN.sub('', text)

    # Loading the stop-word list and constructing the lemmatizer on every call
    # dominated the runtime when mapped over the whole dataset; reuse them.
    stop_words, lemmatizer = _preprocess_resources()

    tokens = nltk.word_tokenize(text)
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Clean every tweet and keep the result next to the raw text
print("\nPreprocessing text data...")
df['processed_text'] = df['text'].apply(preprocess_text)

# Show raw vs. cleaned text for the first rows
print("\nSample of preprocessed text:")
sample_columns = ['text', 'processed_text']
print(df[sample_columns].head())

# Encode the string labels as integer class ids
label_mapping = dict(negative=0, neutral=1, positive=2)
df['sentiment_numeric'] = df['sentiment'].map(label_mapping)

# 80/20 stratified split of cleaned text and integer labels
X = df['processed_text'].values
y = df['sentiment_numeric'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Build the word index from the training texts only
print("\nTokenizing text...")
max_words = 10000  # Maximum number of words to keep
max_len = 50       # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Texts -> lists of word ids
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Persist the fitted tokenizer (default pickle protocol)
tokenizer_bytes = pickle.dumps(tokenizer)
with open('tokenizer.pkl', 'wb') as f:
    f.write(tokenizer_bytes)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading dataset...
Dataset shape: (27481, 4)

Sample data:
       textID  \
0  cb774db0d1   
1  549e992a42   
2  088c60f138   
3  9642c003ef   
4  358bd9e861   

                                                                          text  \
0                                          I`d have responded, if I were going   
1                                Sooo SAD I will miss you here in San Diego!!!   
2                                                    my boss is bullying me...   
3                                               what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on the releases we already bought   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  

Missing values:
textID           0
tex