<a href="https://colab.research.google.com/github/manushadananjaya/Sentiment_Analysis_using_NLP/blob/manu-dev-new/Sentiment_Analysis_using_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Read the airline tweets CSV and keep only the sentiment label and the
# raw tweet text; every other column is discarded.
df = pd.read_csv('Tweets.csv')[["airline_sentiment", "text"]]


In [5]:
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Download the NLTK data used by clean_text.
# 'stopwords' backs stopwords.words('english'); 'punkt' / 'punkt_tab' back
# nltk.word_tokenize. Newer NLTK releases look up 'punkt_tab' instead of
# 'punkt', so both are requested to keep the notebook runnable on either.
# nltk.download reports a failure (rather than raising) if a resource is
# unknown to the installed version, so the extra request is harmless there.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Porter Stemmer (shared instance used by clean_text)
ps = PorterStemmer()

# Define clean_text function
def _english_stopwords():
    """Return the NLTK English stopword list as a frozenset, built only once.

    ``stopwords.words('english')`` is evaluated on the first call; the result
    is stored on this function object and reused on every later call, so the
    corpus is not re-read per tweet or per token.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete URLs (``http`` + at most one character + ``://`` + the
         following non-whitespace run, plus one trailing whitespace char);
      3. tokenize with ``nltk.word_tokenize``;
      4. drop tokens that appear in the NLTK English stopword list;
      5. stem each remaining token with the module-level Porter stemmer ``ps``.

    Punctuation tokens are not removed by this function.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)  # Remove URLs

    # Set membership is O(1); the original rebuilt and linearly scanned the
    # stopword list once for every token.
    stop_words = _english_stopwords()

    stems = [
        ps.stem(token)
        for token in nltk.word_tokenize(text)
        if token not in stop_words
    ]
    return " ".join(stems)

# Run clean_text over every tweet and store the result in a new column
df['text_cleaned'] = df['text'].map(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to 3000 features
tf_vec = TfidfVectorizer(max_features=3000)

# Fit on the cleaned tweets, then convert the result to a dense array
tfidf_matrix = tf_vec.fit_transform(df['text_cleaned'])
X = tfidf_matrix.toarray()

In [14]:
# Spot-check: run clean_text on the tweet at index label 0 and show it
cleaned_text = clean_text(df.loc[0, 'text'])
print(cleaned_text)


@ virginamerica @ dhepburn said .


In [12]:
# Summarise the text columns within each sentiment class
# (describe() on the grouped frame, not just the group sizes)
sentiment_counts = df.groupby(by="airline_sentiment").describe()

print(sentiment_counts)

                   text         \
                  count unique   
airline_sentiment                
negative           9178   9087   
neutral            3099   3067   
positive           2363   2298   

                                                                           \
                                                                 top freq   
airline_sentiment                                                           
negative           @AmericanAir that's 16+ extra hours of travel ...    2   
neutral                                           @SouthwestAir sent    5   
positive                                            @JetBlue thanks!    5   

                  text_cleaned         \
                         count unique   
airline_sentiment                       
negative                  9178   9083   
neutral                   3099   3025   
positive                  2363   2262   

                                                                           
         

In [8]:
# Display the (rows, columns) shape of df as the cell output
df.shape

(14640, 3)

In [7]:
# Pull the sentiment labels out of df as an array.
# NOTE(review): no later cell visible here reads `Y` (the modelling cell
# defines its own lowercase `y`) — confirm whether this is still needed.
Y = df['airline_sentiment'].values

In [9]:
# Display the (rows, columns) shape of df again; this repeats the
# expression of the earlier shape cell
df.shape

(14640, 3)

In [13]:
# Recompute the per-sentiment describe() summary of the text columns
# and print it
sentiment_counts = df.groupby(by="airline_sentiment").describe()

print(sentiment_counts)

                   text         \
                  count unique   
airline_sentiment                
negative           9178   9087   
neutral            3099   3067   
positive           2363   2298   

                                                                           \
                                                                 top freq   
airline_sentiment                                                           
negative           @AmericanAir that's 16+ extra hours of travel ...    2   
neutral                                           @SouthwestAir sent    5   
positive                                            @JetBlue thanks!    5   

                  text_cleaned         \
                         count unique   
airline_sentiment                       
negative                  9178   9083   
neutral                   3099   3025   
positive                  2363   2262   

                                                                           
         

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Splitting the dataset into training and testing sets
# NOTE: `X` is rebound here to the cleaned text column; any value an earlier
# cell assigned to `X` (e.g. the dense TF-IDF array) is discarded, and the
# models below are trained on CountVectorizer features instead.
X = df['text_cleaned']  # Features
y = df['airline_sentiment']  # Target variable

# 80/20 split with a fixed seed so the same rows land in train/test each run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Vectorizing the text data
# The vocabulary is learned from the training split only; the test split is
# transformed with that same vocabulary.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Training a multinomial Naïve Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

# Predicting the sentiment of tweets in the test dataset
nb_y_pred = nb_classifier.predict(X_test_counts)

# Finding the accuracy of the Naïve Bayes model
nb_accuracy = accuracy_score(y_test, nb_y_pred)
print("Multinomial Naïve Bayes Classifier Accuracy:", nb_accuracy)

# Training a Random Forest classifier
# random_state is fixed (same seed as the split) because the forest is
# otherwise randomly initialised and its accuracy would differ run to run.
rf_classifier = RandomForestClassifier(random_state=2)
rf_classifier.fit(X_train_counts, y_train)

# Predicting the sentiment of tweets in the test dataset using Random Forest classifier
rf_y_pred = rf_classifier.predict(X_test_counts)

# Finding the accuracy of the Random Forest model
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Random Forest Classifier Accuracy:", rf_accuracy)


Multinomial Naïve Bayes Classifier Accuracy: 0.7513661202185792
Random Forest Classifier Accuracy: 0.7564890710382514
