# Practical 9: Perform sentiment analysis on Twitter data about airlines. Classify tweets into positive, negative, and neutral categories. Evaluate with accuracy, precision, recall, and F1-score.

In [None]:
import pandas as pd

In [None]:
# Load the airline tweets CSV, then print a preview of the first rows
# followed by the count of missing values in each column.
df = pd.read_csv('/content/Tweets.csv')
for overview in (df.head(), df.isnull().sum()):
    print(overview)

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stop-word corpus before it is read below.
nltk.download('stopwords')

# Shared helpers read by clean_text: a Porter stemmer and the English
# stop words held in a set for constant-time membership checks.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw tweet into a space-joined string of stemmed tokens.

    Steps, in order: drop URLs, @mentions and #hashtags (the whole tagged
    word is removed, not just the symbol); drop every character that is not
    an ASCII letter or whitespace; lowercase; split on whitespace; discard
    tokens present in the module-level ``stop_words`` set; stem the remaining
    tokens with the module-level ``stemmer``.

    Args:
        text: Raw tweet text. Any non-string value (``None``, or the float
            NaN that pandas uses for a missing cell) is treated as an empty
            tweet.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``''`` when the
        input is not a string or no token survives cleaning.
    """
    # re.sub raises TypeError on non-str input, which would abort an entire
    # Series.apply over a column that contains a single missing value.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags (tag word included)
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Keep only ASCII letters and whitespace
    text = text.lower()  # Lowercase so tokens compare equal to the stop-word entries
    tokens = text.split()
    # Filter on the unstemmed token, then stem what is kept.
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every tweet element-wise and keep the result next to the raw text.
df['clean_text'] = df['text'].map(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment strings as integer class ids. The fitted encoder is
# kept in `le` so ids can be mapped back to class names later.
le = LabelEncoder()
df['label'] = le.fit_transform(df['airline_sentiment'])

# Features are the cleaned tweets; targets are the encoded sentiment ids.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# test split are not guaranteed to match the full data set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Upper bound on the TF-IDF vocabulary size.
MAX_TFIDF_FEATURES = 5000

# The vocabulary and IDF weights are learned from the training tweets only;
# the held-out tweets are then transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier constructed with the library's default settings.
model = LogisticRegression()
# Fit on the TF-IDF training matrix and the encoded training labels.
model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predict encoded sentiment ids for the held-out tweets.
y_pred = model.predict(X_test_vec)

# Overall accuracy, plus per-class precision / recall / F1 labelled with the
# class names stored on the fitted label encoder.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.7937158469945356
Classification Report:
               precision    recall  f1-score   support

    negative       0.82      0.93      0.87      1889
     neutral       0.64      0.49      0.55       580
    positive       0.81      0.61      0.70       459

    accuracy                           0.79      2928
   macro avg       0.76      0.68      0.71      2928
weighted avg       0.78      0.79      0.78      2928



In [None]:
# Pair each test tweet with its predicted and actual sentiment names.
# The 'tweet' column holds the cleaned (stemmed, stop-word-free) text taken
# from df['clean_text'], not the raw tweet.
predicted_names = le.inverse_transform(y_pred)
actual_names = le.inverse_transform(y_test)

results_df = pd.DataFrame({
    'tweet': X_test,
    'predicted_label': predicted_names,
    'actual_label': actual_names,
})

# Show the first ten rows
print(results_df.head(10))


                                                   tweet predicted_label  \
4794                    your earli frontrunn best airlin        positive   
10480  flt ewr cancel flightl yet flt nyc usairway st...        negative   
8067   go bdl dca flight yesterday today everi singl ...        negative   
8880                                depart washington dc         neutral   
8292                                 probabl find ticket         neutral   
927    still wait hear back wallet stolen one plane w...        negative   
3165   ye flight rebook im lose trust want get anywhe...        negative   
7894                      thank pari could arrang someth         neutral   
2818   sure howev ticket includ one check bag therefo...        negative   
9145   great crew flight phx yvr tonight friendli eff...        positive   

      actual_label  
4794      positive  
10480     negative  
8067      negative  
8880       neutral  
8292      negative  
927       negative  
3165      negati