In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

In [None]:
# Read the tweet/emotion CSV into a DataFrame for a first look at its contents.
df = pd.read_csv(filepath_or_buffer="tweet_emotion.csv")

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

# First rows of the frame, printed as plain text.
print(df.head())

In [None]:
data = pd.read_csv("tweet_emotion.csv")

# Missing tweets become empty strings so every entry is a valid str
data['tweet_text'] = data['tweet_text'].fillna("").astype(str)

# Character count of each (cleaned) tweet
data['text_length'] = data['tweet_text'].apply(len)

# Drop a row only when its emotion label is missing. A bare dropna() removes a
# row for a gap in ANY column, which would discard tweets because of columns
# this analysis never reads. Re-assigning (instead of inplace=True) keeps the
# step explicit.
# NOTE(review): assumes no other column must be non-null downstream - confirm.
data = data.dropna(subset=['is_there_an_emotion_directed_at_a_brand_or_product'])

# Text length histogram (30 bins, with a KDE curve overlaid)
plt.figure(figsize=(10, 6))
sns.histplot(data['text_length'], bins=30, kde=True, color='skyblue')
plt.title("Text Length Distribution")
plt.xlabel("Text Length")
plt.ylabel("Frequency")
plt.show()


In [None]:
# Upper length cut-off: the 95th percentile of tweet length
lengths = data['text_length']
length_threshold = np.percentile(lengths, 95)

# The two tails: tweets under 5 characters and tweets above the cut-off
short_texts = data[lengths < 5]
long_texts = data[lengths > length_threshold]

# Size of each tail (stored only; nothing is printed in this cell)
short_count, long_count = len(short_texts), len(long_texts)

# Keep tweets whose length lies in [5, cut-off], both ends inclusive
filtered_data = data[lengths.between(5, length_threshold)]

# Number of remaining tweets per emotion label
emotion_counts = filtered_data['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

# Bar chart of the label counts
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=emotion_counts.index, y=emotion_counts.values, palette="pastel", ax=ax)
ax.set_title("Emotion Distribution in Dataset")
ax.set_xlabel("Emotion")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Word cloud built from every remaining tweet joined into one space-separated string
all_tweets = " ".join(filtered_data['tweet_text'])
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(all_tweets)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
ax.set_title("Most Frequent Words in Dataset")
plt.show()

# 80/20 train-test split with a fixed seed
train_data, test_data = train_test_split(
    filtered_data,
    test_size=0.2,
    random_state=42,
)

# Cell output: the two tail counts followed by the shape of each frame
(short_count, long_count, filtered_data.shape, train_data.shape, test_data.shape)


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
# Rebuild the modelling frame from the CSV: tweets coerced to str (missing ones
# become ""), plus a character-count column.
data = pd.read_csv("tweet_emotion.csv").assign(
    tweet_text=lambda frame: frame['tweet_text'].fillna("").astype(str),
    text_length=lambda frame: frame['tweet_text'].map(len),
)

# Keep tweets between 5 characters and the 95th-percentile length, inclusive
length_threshold = np.percentile(data['text_length'], 95)
keep_mask = data['text_length'].between(5, length_threshold)
filtered_data = data[keep_mask]


In [None]:
# Hold out 20% of the filtered tweets for testing; the fixed seed makes the
# split repeatable.
train_data, test_data = train_test_split(
    filtered_data,
    test_size=0.2,
    random_state=42,
)


In [None]:
# TF-IDF features: at most 5000 terms, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer is fitted on the training tweets only; the test tweets are
# transformed with that already-fitted vectorizer.
train_texts, test_texts = train_data['tweet_text'], test_data['tweet_text']
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

In [None]:
# Emotion label column, taken from each split as the classification target
target_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
y_train, y_test = train_data[target_col], test_data[target_col]

In [None]:
# Logistic regression on the TF-IDF features (up to 1000 solver iterations,
# fixed seed); fit() returns the estimator itself.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train, y_train)

# Predicted labels for the held-out tweets
y_pred = model.predict(X_test)

In [None]:
# Overall accuracy and the per-class report for the predictions
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
# sep="\n" reproduces the blank line between the heading and the report
print("\nClassification Report:\n", report, sep="\n")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to 5000 terms (max_features can be tuned)
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training tweets, then transform the test tweets with the fitted
# vectorizer.
train_texts, test_texts = train_data['tweet_text'], test_data['tweet_text']
X_train_tfidf = tfidf.fit_transform(train_texts)
X_test_tfidf = tfidf.transform(test_texts)

# Classification target: the emotion label column from each split
target_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
y_train, y_test = train_data[target_col], test_data[target_col]


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Linear-kernel SVM with a fixed seed (kernel, C, etc. are tunable), trained on
# the TF-IDF training matrix; fit() returns the estimator itself.
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_tfidf, y_train)

# Predicted labels for the test matrix
y_pred = svm_model.predict(X_test_tfidf)

# Accuracy to four decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Heading and per-class report, one per line
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 1000 terms (tune max_features to the dataset size)
tfidf = TfidfVectorizer(max_features=1000)

# Fit on the training tweets, then transform the test tweets with the fitted
# vectorizer.
train_texts, test_texts = train_data['tweet_text'], test_data['tweet_text']
X_train_tfidf = tfidf.fit_transform(train_texts)
X_test_tfidf = tfidf.transform(test_texts)

# Classification target: the emotion label column from each split
target_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
y_train, y_test = train_data[target_col], test_data[target_col]

# Multinomial Naive Bayes trained on the TF-IDF matrix; fit() returns the estimator
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predicted labels for the test matrix
y_pred = nb_model.predict(X_test_tfidf)

# Class distribution of the training labels
print(y_train.value_counts())

# Accuracy to four decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class report; zero_division=0 reports 0 for metrics with a zero denominator
print("\nClassification Report:", classification_report(y_test, y_pred, zero_division=0), sep="\n")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

#  Load the Dataset
data = pd.read_csv("tweet_emotion.csv")  # Replace with your dataset's path

#  Preprocess the Data
target_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
data['tweet_text'] = data['tweet_text'].fillna("").astype(str)  # Ensure valid strings
# Only tweet_text and the label are read below, so a row is dropped only when
# its label is missing. A bare dropna() would also discard rows for gaps in
# columns this cell never uses, shrinking the training data for no reason.
data = data.dropna(subset=[target_col])

#  Train-Test Split (80/20, fixed seed)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# TF-IDF Vectorization: fit on the training tweets, reuse the fitted
# vectorizer for the test tweets
tfidf = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_train_tfidf = tfidf.fit_transform(train_data['tweet_text'])
X_test_tfidf = tfidf.transform(test_data['tweet_text'])

#  Target Variable
y_train = train_data[target_col]
y_test = test_data[target_col]

#  Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
rf_model.fit(X_train_tfidf, y_train)

#  Make Predictions and Evaluate
y_pred = rf_model.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report (zero_division=0 reports 0 for metrics with a zero denominator)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

#  Load the Dataset
data = pd.read_csv("tweet_emotion.csv")  # Replace with your dataset's path

#  Preprocess the Data
target_col = 'is_there_an_emotion_directed_at_a_brand_or_product'
data['tweet_text'] = data['tweet_text'].fillna("").astype(str)  # Ensure valid strings
# Only tweet_text and the label are read below, so a row is dropped only when
# its label is missing. A bare dropna() would also discard rows for gaps in
# columns this cell never uses, shrinking the training data for no reason.
data = data.dropna(subset=[target_col])

#  Train-Test Split (80/20, fixed seed)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

#  TF-IDF Vectorization: fit on the training tweets, reuse the fitted
#  vectorizer for the test tweets
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(train_data['tweet_text'])
X_test_tfidf = tfidf.transform(test_data['tweet_text'])

#  Target Variable
y_train = train_data[target_col]
y_test = test_data[target_col]

#  Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train_tfidf, y_train)

#  Make Predictions and Evaluate
y_pred = gb_model.predict(X_test_tfidf)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification Report (zero_division=0 reports 0 for metrics with a zero denominator)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
