# Sentiment Analysis

## Imports

In [130]:
import pandas as pd
## Three models to be used
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics 

## import data set
data = pd.read_csv("Tweets.csv")

## EDA

In [110]:
data.columns

Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')

In [111]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [112]:
data.nunique()

tweet_id                        14485
airline_sentiment                   3
airline_sentiment_confidence     1023
negativereason                     10
negativereason_confidence        1410
airline                             6
airline_sentiment_gold              3
name                             7701
negativereason_gold                13
retweet_count                      18
text                            14427
tweet_coord                       832
tweet_created                   14247
tweet_location                   3081
user_timezone                      85
dtype: int64

## Cleaning Data

In [113]:
import nltk;
from nltk.corpus import stopwords;
from nltk import word_tokenize;
import string;
from nltk.stem.snowball import SnowballStemmer;

In [114]:
def cleanText(text) : 
    stemmer = SnowballStemmer(language = "english")
    # tokenize
    tokens = nltk.word_tokenize(text.lower())
    #stem
    stemmed_words = []
    for word in tokens:
        
        stemmedWord = stemmer.stem(word)
        stemmed_words.append(stemmedWord)
    # filter characters    
    stop = set(stopwords.words('english') + list(string.punctuation))
    filtered_words = [i for i in stemmed_words if i not in stop]
    filtered_words = [i for i in filtered_words if not i.isnumeric()]
    filtered_words = [i for i in filtered_words if i.isalpha()]
    
    return " ".join(filtered_words)

In [115]:
## Making encoded labels 
from sklearn.preprocessing import LabelEncoder

airline_sentiment = pd.DataFrame(data["airline_sentiment"])
le = LabelEncoder()
le.fit(airline_sentiment)
labels = le.transform(airline_sentiment)

# le.inverse_transform(labels)

  return f(*args, **kwargs)


In [116]:
data["text"] = data["text"].apply(lambda x: cleanText(x));

In [119]:
x = data["text"]
y = labels

In [120]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Train test split

In [125]:
from sklearn.model_selection import train_test_split

## converting strings to integers
vectorizer = TfidfVectorizer()
vectorized = vectorizer.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(vectorized, y, test_size=0.7)

# Training models

## AdaBoostClassifier

In [127]:
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=4),
    n_estimators=200
)
classifier.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4),
                   n_estimators=200)

In [128]:
output = classifier.predict(X_test)

In [129]:

metrics.accuracy_score(output,  y_test)

0.6813036690085871

## BaggingClassifier

In [132]:
classifier = BaggingClassifier(
     base_estimator=SVC(),
     n_estimators=10,
)
classifier.fit(X_train, y_train)

BaggingClassifier(base_estimator=SVC())

In [133]:
output = classifier.predict(X_test)

In [134]:
metrics.accuracy_score(output,  y_test)

0.7481459797033567

## RandomForestClassifier

In [135]:
classifier = RandomForestClassifier(
    max_depth=2, 
    random_state=0
)
classifier.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [136]:
output = classifier.predict(X_test)

In [137]:
metrics.accuracy_score(output,  y_test)

0.6291959406713505

# Conclusion

The most accurate classifier I tried was the bagging classifier.