# Using Sentiment Analysis to Determine Airline Sentiment on Twitter

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# Load the airline tweets CSV from the mounted Drive and preview the first rows
tweets_csv_path = '/content/drive/MyDrive/CSC402/Chapter16/Sentiment_Analysis/Tweets.csv'
df1 = pd.read_csv(tweets_csv_path)
df1.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# Work on an independent copy: a plain `df2 = df1` only creates a second name
# for the same object, so cleaning steps applied to df2 could alter the raw frame.
df2 = df1.copy()

In [None]:
# Lower-cased airline names with internal spaces removed, used later to decide
# whether an @-handle refers to an airline. Handles are single tokens
# (e.g. "virginamerica"), so a name such as "virgin america" can never equal
# one unless the space is stripped first.
# NOTE(review): a handle can still differ from the airline name (the tweets
# shown use "@americanair" while the name is "american") -- confirm the list
# against the handles that actually occur in the data.
airlines = (
    df1['airline']
    .str.lower()
    .str.replace(' ', '', regex=False)
    .unique()
    .tolist()
)
airlines

['virgin america', 'united', 'southwest', 'delta', 'us airways', 'american']

In [None]:
# Data cleaning: keep only the label and the tweet text.
# .copy() turns the column selection into an independent DataFrame, so the
# column assignments in the following cells operate on df2 itself and no longer
# trigger pandas' SettingWithCopyWarning.
df2 = df2[['airline_sentiment', 'text']].copy()
df2

Unnamed: 0,airline_sentiment,text
0,neutral,@virginamerica what @dhepburn said.
1,positive,@virginamerica plus you have added commercials...
2,neutral,@virginamerica i did not today... must mean i ...
3,negative,@virginamerica it is really aggressive to blas...
4,negative,@virginamerica and it is a really big bad thin...
...,...,...
14635,positive,@americanair thank you we got on a different f...
14636,negative,@americanair leaving over 20 minutes late flig...
14637,neutral,@americanair please bring american airlines to...
14638,negative,"@americanair you have my money, you change my ..."


In [None]:
# Normalise the tweet text to lower case
df2['text'] = df2.text.str.lower()
df2['text']

0                      @virginamerica what @dhepburn said.
1        @virginamerica plus you have added commercials...
2        @virginamerica i did not today... must mean i ...
3        @virginamerica it is really aggressive to blas...
4        @virginamerica and it is a really big bad thin...
                               ...                        
14635    @americanair thank you we got on a different f...
14636    @americanair leaving over 20 minutes late flig...
14637    @americanair please bring american airlines to...
14638    @americanair you have my money, you change my ...
14639    @americanair we have 8 ppl so we need 2 know h...
Name: text, Length: 14640, dtype: object

In [None]:
# Normalise the sentiment labels to lower case
df2['airline_sentiment'] = df2['airline_sentiment'].str.lower()
df2['airline_sentiment']

0         neutral
1        positive
2         neutral
3        negative
4        negative
           ...   
14635    positive
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14640, dtype: object

In [None]:
# Lookup table: lower-case English contraction -> its expanded form.
# Keys are matched against lower-cased words by expand_contractions.
contractions_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "here's": "here is",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "isn't": "is not",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they're": "they are",
    "they've": "they have",
    "this's": "this is",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "weren't": "were not",
    "what's": "what is",
    "when's": "when is",
    "where's": "where is",
    "who's": "who is",
    "why's": "why is",
    "won't": "will not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

def expand_contractions(text, contractions_dict):
  """
  Expands contractions in text using a dictionary.

  The text is split on whitespace and every word is looked up
  case-insensitively. If the whole word is not a key, punctuation attached to
  its start or end (e.g. the comma in "can't,") is set aside for a second
  lookup and re-attached afterwards, and the typographic apostrophe (U+2019)
  is treated like a plain "'".

  Args:
    text: The text to be processed.
    contractions_dict: A dictionary of lower-case contractions and their expansions.

  Returns:
    The expanded text, with words joined by single spaces. Words that are not
    contractions are kept unchanged; a replaced word takes the dictionary
    value verbatim (so its original capitalisation is not preserved).
  """
  expanded_text = []
  for word in text.split():
    key = word.lower()
    if key in contractions_dict:
      # Exact (case-insensitive) match on the whole word
      expanded_text.append(contractions_dict[key])
      continue

    # Find the span between leading and trailing non-alphanumeric characters,
    # so that "can't," or "(it's)" can still be matched.
    start, end = 0, len(word)
    while start < end and not word[start].isalnum():
      start += 1
    while end > start and not word[end - 1].isalnum():
      end -= 1
    core = word[start:end].lower().replace("\u2019", "'")

    if core in contractions_dict:
      # Re-attach the surrounding punctuation around the expansion
      expanded_text.append(word[:start] + contractions_dict[core] + word[end:])
    else:
      expanded_text.append(word)
  return " ".join(expanded_text)

In [None]:
# Expand the contractions in every tweet using the lookup table defined above
df2['text'] = df2['text'].apply(
    lambda tweet: expand_contractions(tweet, contractions_dict)
)
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['text'] = df2['text'].apply(expand_contractions, args=(contractions_dict,))


Unnamed: 0,airline_sentiment,text
0,neutral,@virginamerica what @dhepburn said.
1,positive,@virginamerica plus you have added commercials...
2,neutral,@virginamerica i did not today... must mean i ...
3,negative,@virginamerica it is really aggressive to blas...
4,negative,@virginamerica and it is a really big bad thin...
...,...,...
14635,positive,@americanair thank you we got on a different f...
14636,negative,@americanair leaving over 20 minutes late flig...
14637,neutral,@americanair please bring american airlines to...
14638,negative,"@americanair you have my money, you change my ..."


In [None]:
# Tokenize the words
!pip install nltk
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
df2['text_tokened'] = df2['text'].apply(word_tokenize)
df2



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,airline_sentiment,text,text_tokened
0,neutral,@virginamerica what @dhepburn said.,"[@, virginamerica, what, @, dhepburn, said, .]"
1,positive,@virginamerica plus you have added commercials...,"[@, virginamerica, plus, you, have, added, com..."
2,neutral,@virginamerica i did not today... must mean i ...,"[@, virginamerica, i, did, not, today, ..., mu..."
3,negative,@virginamerica it is really aggressive to blas...,"[@, virginamerica, it, is, really, aggressive,..."
4,negative,@virginamerica and it is a really big bad thin...,"[@, virginamerica, and, it, is, a, really, big..."
...,...,...,...
14635,positive,@americanair thank you we got on a different f...,"[@, americanair, thank, you, we, got, on, a, d..."
14636,negative,@americanair leaving over 20 minutes late flig...,"[@, americanair, leaving, over, 20, minutes, l..."
14637,neutral,@americanair please bring american airlines to...,"[@, americanair, please, bring, american, airl..."
14638,negative,"@americanair you have my money, you change my ...","[@, americanair, you, have, my, money, ,, you,..."


In [None]:
def replace_with_person_or_airline(text_tokens, airlines_dict):
    """
    Replace the token that follows each '@' with a generic placeholder.

    The token after an '@' becomes "airline" when it is contained in
    `airlines_dict`, otherwise "person". The '@' tokens themselves are kept.

    Args:
        text_tokens: List of string tokens for one tweet.
        airlines_dict: Collection of airline names; only membership (`in`) is used.

    Returns:
        A new list of tokens with the replacements applied. The list passed in
        is left unmodified, so the source column keeps its original tokens.
    """
    tokens = list(text_tokens)
    # Stop one short of the end: a trailing '@' has no following token to
    # replace, and indexing past the end would raise IndexError.
    for i in range(len(tokens) - 1):
        if tokens[i] == '@':
            tokens[i + 1] = "airline" if tokens[i + 1] in airlines_dict else "person"
    return tokens

In [None]:
# Replace the handle after every '@' with "airline" (known airline) or "person"
df2['cleaned'] = df2['text_tokened'].apply(
    lambda tokens: replace_with_person_or_airline(tokens, airlines)
)
df2['cleaned']

0                    [@, person, what, @, person, said, .]
1        [@, person, plus, you, have, added, commercial...
2        [@, person, i, did, not, today, ..., must, mea...
3        [@, person, it, is, really, aggressive, to, bl...
4        [@, person, and, it, is, a, really, big, bad, ...
                               ...                        
14635    [@, person, thank, you, we, got, on, a, differ...
14636    [@, person, leaving, over, 20, minutes, late, ...
14637    [@, person, please, bring, american, airlines,...
14638    [@, person, you, have, my, money, ,, you, chan...
14639    [@, person, we, have, 8, ppl, so, we, need, 2,...
Name: cleaned, Length: 14640, dtype: object

In [None]:
# Remove the bare '@' tokens; the placeholder words that followed them are kept
df2['cleaned'] = df2['cleaned'].apply(
    lambda tokens: [token for token in tokens if token != '@']
)
df2['cleaned']

0                          [person, what, person, said, .]
1        [person, plus, you, have, added, commercials, ...
2        [person, i, did, not, today, ..., must, mean, ...
3        [person, it, is, really, aggressive, to, blast...
4        [person, and, it, is, a, really, big, bad, thi...
                               ...                        
14635    [person, thank, you, we, got, on, a, different...
14636    [person, leaving, over, 20, minutes, late, fli...
14637    [person, please, bring, american, airlines, to...
14638    [person, you, have, my, money, ,, you, change,...
14639    [person, we, have, 8, ppl, so, we, need, 2, kn...
Name: cleaned, Length: 14640, dtype: object

In [None]:
from keras.preprocessing.text import Tokenizer

# Create a tokenizer
tokenizer = Tokenizer() # Default settings; no oov_token is configured
tokenizer.fit_on_texts(df2['cleaned']) # Learns a word -> integer id mapping from the cleaned token lists
# The tokenizer itself does not pad anything; padding is done afterwards with pad_sequences.
# NOTE(review): with no oov_token, words that were not seen here are presumably dropped by texts_to_sequences -- confirm in the Keras docs

# Convert text to sequences of numbers
sequences = tokenizer.texts_to_sequences(df2['cleaned']) # One list of integer ids per tweet
vocab_size = len(tokenizer.index_word) # Number of entries in index_word
# index_word = dictionary that maps integer indices to corresponding word strings in
# the Tokenizer object
# Maps numerical representations of words back to the actual words they represent
# The vocabulary is built from the words of this dataset
vocab_size # How many unique words are in your dataset

16223

In [None]:
from keras.utils import pad_sequences
sequences = pad_sequences(sequences, padding='pre') # Adding padding to the beginning
# Making it so all the sequences have the same shape (by adding 0's).
# No maxlen is passed here; the shape shown below is (number of tweets, padded length).

sequences.shape


(14640, 46)

Supervised or unsupervised dataset?

- It's supervised!
  - Because it's labeled.

- To turn this into an algorithm, we must change the labels into numbers
  - Label encoding
    - Model can't work with categorical data (words), so we turn it into numbers


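**Label encoding:**
- Replaces each class name with an integer in a single column (one-hot encoding is the technique that would make three columns)
- `LabelEncoder` numbers the classes in alphabetical order of their names:
- 0 = negative
- 1 = neutral
- 2 = positive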


In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder() # Have a class, make an object

# Apply LabelEncoder to 'airline sentiment'.
# A new column has to be created with bracket syntax: assigning to a new
# attribute (df2.new_name = ...) only sets a Python attribute on the object,
# and pandas warns that no column is created that way.
# LabelEncoder numbers the classes in sorted order of their names;
# le.classes_ lists them in that order.
df2['airline_sentiment_labelencoded'] = le.fit_transform(df2['airline_sentiment'])

  df2.airline_sentiment_labelencoded = le.fit_transform(df2.airline_sentiment)


- Our data is clean and ready for ML!
- All numbers and no NULL values
- All padded and have the same dimensions

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the tweets for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    df2.airline_sentiment_labelencoded,
    test_size=0.2,
    random_state=42,
)


- X = sequences
  - **Input**
  - The padded integer sequences, one row per tweet
- Y = df2.airline_sentiment_labelencoded
  - Based on the text, we're predicting the sentiment
  - Whatever you predict is your Y
  - **Output**

**test_size = 0.2**
  - 20% is test
  - 80% train
- Default is 25% test / 75% train

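**random_state = 42**
- `train_test_split` shuffles the rows before splitting them into the train set and the test set
  - Shuffling mixes examples from every class into both sets
  - Want a good distribution
- Fixing the seed (42) makes the shuffle, and therefore the split, the same on every run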


input_dim = size of vocabulary

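input_length = 46 words
  - If the tweet was shorter, it was padded
  - 46 is the length of the longest tweet in this dataset, so no training tweet was truncated; a longer new tweet would be truncated to 46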

output_dim = 2
- Why 2? 2D space
- Putting the words into a coordinate space and giving them coordinates as if they were numbers


In [None]:
from keras.preprocessing.sequence import pad_sequences # Pad sequences to fixed length
from keras.models import Sequential # Create linear stack of layers
from keras.layers import Embedding, Dense, Bidirectional, SimpleRNN,GRU,LSTM,Dropout
# Embedding represents words as vectors
# Dense creates fully-connected layer
# Bidirectional applies a layer to both forward and backward sequence
  # Neurons are bidirectional: one output talks to other's input (doesn't have to be only forward)
# Dropout randomly drops units when training to prevent overfitting

model = Sequential([
    # input_dim must be larger than the biggest integer id fed to the layer.
    # Word ids start at 1 and 0 is the padding value, so vocab_size + 1 rows
    # are needed. Deriving it from vocab_size (and the padded width from
    # sequences) keeps the model in sync with the data instead of relying on
    # hand-typed numbers.
    Embedding(input_dim=vocab_size + 1, output_dim=2, input_length=sequences.shape[1]), # Vector size of 2
    Bidirectional(SimpleRNN(53)), # Applies simple RNN to both forward and backward versions of input sequence
    Dropout(0.5), # Drops 50% of units (neurons) during training; it adds no weights of its own
    Dense(12, activation='relu'), # Hidden fully-connected layer
    Dense(3, activation='softmax'), # Output layer: one probability per class
    # LabelEncoder numbers classes alphabetically: 0, 1, 2 = negative, neutral, positive
])
# SimpleRNN(53) = a layer with 53 units (neurons)

#compiling model
# The labels are integer class ids, which is what sparse_categorical_crossentropy expects
model.compile(optimizer="rmsprop", loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Use rmsprop here, not Nadam
# Classification task, so use accuracy as metric

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 46, 2)             32930     
                                                                 
 bidirectional_1 (Bidirecti  (None, 106)               5936      
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 106)               0         
                                                                 
 dense_2 (Dense)             (None, 12)                1284      
                                                                 
 dense_3 (Dense)             (None, 3)                 39        
                                                                 
Total params: 40189 (156.99 KB)
Trainable params: 40189 (156.99 KB)
Non-trainable params: 0 (0.00 Byte)
________________

- `[-200:]` = the last 200 items (negative indices count from the end); `[:200]` = the first 200 items
- Accuracy score = ratio of true guesses
- Predicting either 0, 1, 2
  - Want to convert them to the class labels

- Sending X & y train to RNN model
- once you have your ML model, you send the x test
  - Gives you out y predicted
  - y_test = golden true values that ML didn't see before

- **To find out how well your RNN model did, you need to compare y_test to y_predicted. y_test is like the solution key, whereas y_predicted is YOUR answers to the exam.**
  - X_train, y_train are the sample review questions for the exam

In [None]:
# Train for 16 epochs; the first 200 held-out rows are used as validation data
history = model.fit(
    X_train,
    y_train,
    epochs=16,
    validation_data=(X_test[:200], y_test[:200]),
)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


In [None]:
# Evaluate on the last 200 rows of the test split
# (the first 200 rows were already used as validation data during training)
from sklearn.metrics import accuracy_score, classification_report

# The model ends in a 3-unit softmax, so each row of `predictions`
# holds three class probabilities
predictions = model.predict(X_test[-200:])

# argmax picks, for every row, the index of the most probable class,
# e.g. [0.1, 0.2, 0.7] -> 2
predicted_labels = predictions.argmax(axis=1)

# Accuracy = share of rows whose predicted class equals the true label
# (grading: comparing your answers to the solution key)
accuracy = accuracy_score(y_test[-200:], predicted_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.795


In [None]:
# Trying to do a prediction with one tweet
# NOTE(review): this tweet is not passed through the contraction expansion and
# @-handle replacement used for the training text -- harmless for this example,
# but apply the same cleaning before predicting on real tweets.

import numpy as np

tweet = 'This airline is so fun'

sequence = tokenizer.texts_to_sequences([tweet])

# Pad to the same width as the training sequences instead of a hand-typed number
padded_sequence = pad_sequences(sequence, maxlen=sequences.shape[1])

prediction = model.predict(padded_sequence)

predicted_class = np.argmax(prediction, axis=1)

print('Predicted class: ', predicted_class)
# Translate the class index back into its sentiment name so the result cannot
# be misread (the encoder numbers the classes in sorted order of their names)
print('Predicted sentiment: ', le.inverse_transform(predicted_class))

Predicted class:  [1]
