# NLP Challenge: Twitter Sentiment Analysis

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Import the machine learning model of your choice
import xgboost as xgb
from xgboost.sklearn import XGBClassifier 

# Step 1: Load the Sentiment140 dataset

In [None]:
# Download the dataset from Kaggle and specify the file path
# NOTE(review): the file name below is an assumption (the usual name of the
# Sentiment140 download) -- point DATA_PATH at the actual location of the CSV.
DATA_PATH = "training.1600000.processed.noemoticon.csv"

# Column names are assigned by hand in the next cell, so the file is read
# without a header row.
# NOTE(review): latin-1 is assumed because this CSV is commonly reported not to
# decode as UTF-8 -- confirm against the downloaded file.
df = pd.read_csv(DATA_PATH, encoding="latin-1", header=None)


In [None]:
# Give the six raw columns readable names so later cells can address them by label.
column_names = ["label", "id", "date", "query", "user_name", "comment"]
df.columns = column_names

In [None]:
# Keep only the sentiment label and the tweet text; the remaining fields are
# not used by any of the cells in this notebook.
unused_columns = ["id", "date", "query", "user_name"]
df = df.drop(unused_columns, axis=1)

In [None]:
# Binarise the sentiment label: any value above 0 becomes 1, everything else 0.
# A vectorised comparison replaces the per-row Python lambda, which is much
# slower on a large frame and produces the same 0/1 integers.
df['label'] = (df['label'] > 0).astype(int)

In [None]:
# Preview the first five rows after the column clean-up.
df.head(n=5)

In [None]:
# Show how many tweets fall into each of the two label classes.
df["label"].value_counts()

# Step 2: Data Preprocessing

In [None]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

# Function to remove mentions (the "@" symbol plus the username) and http(s) links
def remove_usernames_links(tweet):
    """Strip @-mentions and http(s) links from a tweet.

    Args:
        tweet: Tweet text as a string.

    Returns:
        The text with every run of non-whitespace characters that begins at
        ``@`` or at ``http`` removed. The whitespace around a removed token is
        left untouched, so double spaces may remain.
    """
    # Raw strings are required here: in a normal string literal '\s' is an
    # invalid escape sequence, which newer Python versions warn about.
    tweet = re.sub(r'@\S+', '', tweet)
    tweet = re.sub(r'http\S+', '', tweet)
    return tweet

# Function to make text lowercase
def make_lowercase(text):
    """Return *text* lowercased; falsy values (``None``, ``""``) are returned unchanged."""
    if not text:
        return text
    return text.lower()

# Discard rows with missing values, then keep only tweets longer than 50 characters.
df = df.dropna()
is_long_enough = df["comment"].str.len() > 50
df = df[is_long_enough]

# Lowercase every remaining tweet.
df['comment'] = df['comment'].apply(make_lowercase)

# Remove special characters from comments
def remove_special_characters(text):
    """Delete every character that is neither an ASCII letter nor whitespace.

    Falsy input (``None`` or the empty string) is returned unchanged.
    """
    if not text:
        return text
    return re.sub(r'[^a-zA-Z\s]', '', text)

# Strip @-mentions and links first (remove_usernames_links), then drop every
# remaining non-letter character (remove_special_characters). The order matters:
# once "@" has been removed a mention can no longer be recognised.
df['comment'] = (
    df['comment']
    .apply(remove_usernames_links)
    .apply(remove_special_characters)
)

# One shared lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Tokenise *text*, lemmatise each token and re-join the result with spaces.

    Tokens come from ``nltk.word_tokenize``; each one is passed to the
    module-level ``lemmatizer`` with its default arguments.
    """
    return ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(text)))

# Lemmatise every cleaned tweet, then preview the first five rows.
lemmatized_comments = df['comment'].apply(lemmatize_text)
df['comment'] = lemmatized_comments

df.head(n=5)

# Step 3: Feature Extraction

In [None]:
# Turn the cleaned tweets into TF-IDF features: at most 5000 terms, with
# scikit-learn's built-in English stop-word list removed.
# NOTE(review): the vectorizer is fitted on all tweets at this point, i.e.
# before any train/test split, so held-out tweets influence the vocabulary and
# the IDF weights.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
y = df['label']
X = vectorizer.fit_transform(df['comment'])

# Step 4: Model Selection and Training

In [None]:
# Split the raw tweets (not the TF-IDF matrix built from the full corpus) so
# that the vectorizer can be fitted on the training portion only. Fitting
# TF-IDF before splitting lets validation/test tweets shape the vocabulary and
# IDF weights, which leaks information into training.

# First split: 80% training+validation, 20% test.
text_train_val, text_test, y_train_val, y_test = train_test_split(
    df['comment'], y, test_size=0.2, random_state=42
)

# Second split: 80% of the remainder for training, 20% for validation.
text_train, text_val, y_train, y_val = train_test_split(
    text_train_val, y_train_val, test_size=0.2, random_state=42
)

# Re-fit the shared `vectorizer` on training tweets only; validation and test
# tweets are merely transformed. Later cells that call `vectorizer.transform`
# therefore use the training-only vocabulary.
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)
X_test = vectorizer.transform(text_test)

# Gradient-boosted tree classifier: 200 trees, learning rate 0.1, depth 5.
model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
)

# Train the model on the training set
model.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model's performance on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")


In [None]:
# Per-class precision, recall and F1 on the validation set.
validation_report = classification_report(y_val, y_val_pred)
print(validation_report)

In [None]:
# Confusion matrix for the validation predictions (shown as the cell output).
confusion_matrix(y_true=y_val, y_pred=y_val_pred)

# Step 5: Sentiment Analysis

In [None]:
# Predict sentiment for the held-out test set and report the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Confusion matrix for the test-set predictions (shown as the cell output).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision, recall and F1 on the test set.
test_report = classification_report(y_test, y_pred)
print(test_report)

# Step 6: Visualizations

In [None]:
# Select an AI company or product of your choice and collect tweets related to it
# Use your trained model to predict sentiment on these tweets
# Create visualizations to showcase sentiment (e.g., bar charts, word clouds)

# Example: 
# - Visualize sentiment distribution using seaborn or matplotlib.
# - Create word clouds for positive and negative tweets.
# - Generate a bar chart showing sentiment scores for the chosen company/product.

# Additional Tips:
# - Experiment with hyperparameter tuning to improve model performance.
# - Use cross-validation for a more robust evaluation.
# - Write functions to encapsulate repetitive tasks and improve code organization.

# Select every tweet whose cleaned text contains the substring "amazon".
mentions_amazon = df['comment'].str.contains("amazon")
new_df = df[mentions_amazon]
new_df

In [None]:
# Score the Amazon tweets with the trained model.
# NOTE(review): new_df is drawn from the full df, which also contains the rows
# the model was trained on, so this accuracy is not a held-out estimate.
y_true = new_df['label'].values
X = vectorizer.transform(new_df['comment'])
y_pred = model.predict(X)
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Evaluation

In [None]:
# Report precision, recall and F1 for the current y_true / y_pred pair, then
# show the matching confusion matrix as the cell output.
amazon_report = classification_report(y_true, y_pred)
print(amazon_report)
confusion_matrix(y_true=y_true, y_pred=y_pred)


In [None]:
# Split the Amazon tweets by label (1 = positive, 0 = negative) and tabulate
# how many tweets fall into each group.
pos_df = new_df[new_df["label"] == 1]
neg_df = new_df[new_df["label"] == 0]

sentiment_counts = {"Positive": len(pos_df), "Negative": len(neg_df)}
data = {
    "Sentiment": list(sentiment_counts),
    "Count": list(sentiment_counts.values()),
}
sentiment_df = pd.DataFrame(data)

print(sentiment_df)

In [None]:
# Preview the first rows of the negative tweets, then of the positive tweets.
for sentiment_frame in (neg_df, pos_df):
    display(sentiment_frame.head())

In [None]:
from pprint import pprint


def _join_comments(frame):
    """Concatenate every comment in *frame* into one space-separated string."""
    return " ".join(frame["comment"].astype(str))


# One corpus for all Amazon tweets plus one per sentiment class.
corpus = _join_comments(new_df)
neg_corpus = _join_comments(neg_df)
pos_corpus = _join_comments(pos_df)
pprint(corpus)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud built from every Amazon tweet.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(corpus)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")



In [None]:
# Word cloud built from the negative Amazon tweets only.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(neg_corpus)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

In [None]:
# Word cloud built from the positive Amazon tweets only.
cloud_builder = WordCloud()
wordcloud = cloud_builder.generate(pos_corpus)

# Render the cloud as an image and hide the axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

In [None]:
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Count the most common words in the Amazon corpus, ignoring English stop words.
stop_words = set(stopwords.words('english'))
words = re.findall(r'\w+', corpus)
filtered_words = [w for w in words if w.lower() not in stop_words]

word_freq = Counter(filtered_words)

# "amazon" is the search term itself, so it is excluded from the ranking.
# Ask for one extra entry and trim afterwards so that 20 words remain;
# filtering after taking the top 20 would leave only 19.
most_common_words = [
    (word, count)
    for word, count in word_freq.most_common(21)
    if word != "amazon"
][:20]

# NOTE(review): this rebinds `df`, replacing the tweets DataFrame used by the
# earlier cells; the following cells read the word table under this name.
df = pd.DataFrame(most_common_words, columns=['Word', 'Frequency'])

In [None]:
# Show up to the first 20 rows of the word-frequency table.
df.head(n=20)

In [None]:
# Word Frequency

import plotly.express as px

# Single accent colour used for the bars, the title and the axis text.
word_chart_accent = "mediumslateblue"


def _word_chart_axis(**extra):
    """Return a fresh axis-style dict for the dark theme, plus any *extra* keys."""
    return dict(
        color="white",
        title_font=dict(color=word_chart_accent),
        tickfont=dict(color=word_chart_accent),
        **extra,
    )


fig = px.bar(
    df,
    x="Word",
    y="Frequency",
    title="Word Frequency",
    hover_name="Word",
    color_discrete_sequence=[word_chart_accent],
)

# Black background; the y axis additionally hides its grid lines.
fig.update_layout(
    title_font_color=word_chart_accent,
    plot_bgcolor="black",
    paper_bgcolor="black",
    xaxis=_word_chart_axis(),
    yaxis=_word_chart_axis(showgrid=False),
)

fig.show()

In [None]:
# Negative VS Positive Sentiment viz

# Single accent colour used for the bars, the title and the axis text.
sentiment_chart_accent = "mediumslateblue"


def _sentiment_chart_axis(**extra):
    """Return a fresh axis-style dict for the dark theme, plus any *extra* keys."""
    return dict(
        color="white",
        title_font=dict(color=sentiment_chart_accent),
        tickfont=dict(color=sentiment_chart_accent),
        **extra,
    )


fig2 = px.bar(
    sentiment_df,
    x="Sentiment",
    y="Count",
    title="Negative VS Positive Sentiment, Amazon Tweets",
    hover_name="Sentiment",
    color_discrete_sequence=[sentiment_chart_accent],
)

# Black background; the y axis additionally hides its grid lines.
fig2.update_layout(
    title_font_color=sentiment_chart_accent,
    plot_bgcolor="black",
    paper_bgcolor="black",
    xaxis=_sentiment_chart_axis(),
    yaxis=_sentiment_chart_axis(showgrid=False),
)

fig2.show()

---