In [1]:
import pandas as pd

In [2]:
data = pd.read_csv(r'Tweets_analysis.csv', header=0)  # load dataset

In [3]:
data.head() # display top 5 rows

Unnamed: 0,sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [4]:
data.info() # gives info about null values and data type of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  14640 non-null  object
 1   text       14640 non-null  object
dtypes: object(2)
memory usage: 228.9+ KB


In [5]:
data.isnull().sum()  # there is no null value 

sentiment    0
text         0
dtype: int64

In [6]:
data.dtypes # data type in each column

sentiment    object
text         object
dtype: object

In [7]:
data.shape  # rows and columns

(14640, 2)

convert the labels('positive','negative','neutral') into numbers using LabelEncoding

In [8]:
colname = ['sentiment'] # list having only categorical columns

from sklearn.preprocessing import LabelEncoder # import LabelEncoder function from preprocessing sublibrary
le=LabelEncoder()                              # save LabelEncoder function in a variable le
for x in colname:                             
    data[x]=le.fit_transform(data[x]) # it assigns numbers to all values of categorical column
    le_name_mapping = dict(zip(le.classes_,le.transform(le.classes_)))  # represent in a dictionary
    print('Feature',x)
    print('mapping',le_name_mapping)

Feature sentiment
mapping {'negative': 0, 'neutral': 1, 'positive': 2}


In [9]:
data.head()

Unnamed: 0,sentiment,text
0,1,@VirginAmerica What @dhepburn said.
1,2,@VirginAmerica plus you've added commercials t...
2,1,@VirginAmerica I didn't today... Must mean I n...
3,0,@VirginAmerica it's really aggressive to blast...
4,0,@VirginAmerica and it's a really big bad thing...


In [10]:
#sentiment count: 
data['sentiment'].value_counts() 

0    9178
1    3099
2    2363
Name: sentiment, dtype: int64

We can balance our dataset if the accuracy of our model on testing data is not good

In [11]:
data['text']

0                      @VirginAmerica What @dhepburn said.
1        @VirginAmerica plus you've added commercials t...
2        @VirginAmerica I didn't today... Must mean I n...
3        @VirginAmerica it's really aggressive to blast...
4        @VirginAmerica and it's a really big bad thing...
                               ...                        
14635    @AmericanAir thank you we got on a different f...
14636    @AmericanAir leaving over 20 minutes Late Flig...
14637    @AmericanAir Please bring American Airlines to...
14638    @AmericanAir you have my money, you change my ...
14639    @AmericanAir we have 8 ppl so we need 2 know h...
Name: text, Length: 14640, dtype: object

 Let's remove the non-alphanumeric characters i.e. special characters (like @, #, $ etc.) from the dataset using regex function 

In [12]:
import re # regex function

In [13]:
def remove_tags(string):
    result = re.sub('','',string)          #remove HTML tags 
    result = re.sub('https://.*','',result)   #remove URLs
    result = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", result) # remove special characters in result & replace them with a blank space
    result = result.lower()  # convert text to lowercase
    return result # after execution of return statement, interpreter will come out of the function and go to the location where the func. is called

data['text']=data['text'].apply(lambda cw : remove_tags(cw)) # apply func used to apply lambda func to a dataframe, remove_tags()func is called & cw will copy over string 

In [14]:
data['text']   # we can see now text column doesn't have special characters 

0                                             what   said 
1          plus you ve added commercials to the experie...
2          i didn t today    must mean i need to take a...
3          it s really aggressive to blast obnoxious  e...
4                 and it s a really big bad thing about it
                               ...                        
14635      thank you we got on a different flight to ch...
14636      leaving over 20 minutes late flight  no warn...
14637      please bring american airlines to  blackberry10
14638      you have my money  you change my flight  and...
14639      we have 8 ppl so we need 2 know how many sea...
Name: text, Length: 14640, dtype: object

Now, remove stop words . Stop words don't hold any special meaning in a sentence like 'and', 'the' etc. So, we should remove them using nltk library which has stop words list 

In [15]:
import nltk  # nltk lib. (Natural Language Toolkit)
nltk.download('stopwords') # download stopwords package to sublib. corpus 
from nltk.corpus import stopwords # import stopwords func. from sublib. corpus 
stop_words = set(stopwords.words('english')) # list of stop words in english 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
stop_words # list of stop words in english 

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [17]:
# remove stop words 
# join words in text column with a blank space if they are not in stopwords list 
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [18]:
data['text']  # now text column doesn't have stop words 

0                                                     said
1                  plus added commercials experience tacky
2                   today must mean need take another trip
3        really aggressive blast obnoxious entertainmen...
4                                     really big bad thing
                               ...                        
14635                   thank got different flight chicago
14637          please bring american airlines blackberry10
14638    money change flight answer phones suggestions ...
14639    8 ppl need 2 know many seats next flight plz p...
Name: text, Length: 14640, dtype: object

Now, we perform lemmatization on the text column. Lemmatization is used to find the root form of words in NLP, for ex: root form of the words: reading, reads, read is read. This save unnecessary computational cost in decoding the entire words. 

In lemmatization, text convert into tokens/words and then each token convert into root form 

In [19]:
nltk.download('wordnet') # download package wordnet to sublib. stem
nltk.download('omw-1.4') # download package omw-1.4 to sublib. tokenize 
w_tokenizer = nltk.tokenize.WhitespaceTokenizer() # save function in a variable 
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(sentence): # define a func and pass sentence in it 
    st = ""    # empty string 
    for w in w_tokenizer.tokenize(sentence):   # convert text into tokens using WhitespaceTokenizer() func saved in var. w_tokenizer 
        st = st + lemmatizer.lemmatize(w) + " "   # convert token into root form using WordNetLemmatizer() func saved in var. lemmatizer  
    return st
data['text'] = data['text'].apply(lemmatize_text) # func. is called, text copy over sentence 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nitin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [20]:
data['text']  # we can see text column has root form of words 

0                                                    said 
1                  plus added commercial experience tacky 
2                  today must mean need take another trip 
3        really aggressive blast obnoxious entertainmen...
4                                    really big bad thing 
                               ...                        
14635                  thank got different flight chicago 
14637          please bring american airline blackberry10 
14638    money change flight answer phone suggestion ma...
14639    8 ppl need 2 know many seat next flight plz pu...
Name: text, Length: 14640, dtype: object

In [21]:
data.head()

Unnamed: 0,sentiment,text
0,1,said
1,2,plus added commercial experience tacky
2,1,today must mean need take another trip
3,0,really aggressive blast obnoxious entertainmen...
4,0,really big bad thing


In [22]:
# define X and Y
X = data['text']
Y = data['sentiment']

In [23]:
X

0                                                    said 
1                  plus added commercial experience tacky 
2                  today must mean need take another trip 
3        really aggressive blast obnoxious entertainmen...
4                                    really big bad thing 
                               ...                        
14635                  thank got different flight chicago 
14637          please bring american airline blackberry10 
14638    money change flight answer phone suggestion ma...
14639    8 ppl need 2 know many seat next flight plz pu...
Name: text, Length: 14640, dtype: object

In [24]:
Y

0        1
1        2
2        1
3        0
4        0
        ..
14635    2
14636    0
14637    1
14638    0
14639    1
Name: sentiment, Length: 14640, dtype: int32

In [25]:
# split the dataset 
# 75% training set, 25% testing set 
# stratify = Y will make sure that random split has same proportion of 0's, 1's, 2's in both training(Y_train) & testing set(Y_test) 

from sklearn.model_selection import train_test_split 
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,stratify=Y, test_size=0.25,random_state=42) 


In [26]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)


(10980,)
(3660,)
(10980,)
(3660,)


In [27]:
#vectorize text to numbers using func. CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(stop_words='english')

In [28]:
vec

CountVectorizer(stop_words='english')

In [29]:
X_train = vec.fit_transform(X_train).toarray() # convert X_train into numbers 

In [30]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
X_test = vec.transform(X_test).toarray() # convert X_test into numbers 

In [32]:
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
# Now, we will fit the Naive Bayes model to the training data
from sklearn.naive_bayes import MultinomialNB   # multinomial for multiple classes, Gaussian for binary classes
classifier = MultinomialNB()

In [34]:
classifier.fit(X_train,Y_train)  # train the data

MultinomialNB()

In [35]:
classifier.score(X_train, Y_train) # score of the model on training data

0.8509107468123862

In [36]:
# we will predict the test data 
Y_pred = classifier.predict(X_test) # predict the class of Y for the given testing data

In [37]:
Y_pred

array([1, 0, 1, ..., 0, 0, 1])

In [38]:
print(list(zip(Y_test, Y_pred))) # compare actual Y with predicted Y

[(1, 1), (0, 0), (2, 1), (1, 0), (1, 2), (0, 0), (2, 1), (2, 2), (2, 2), (2, 0), (0, 0), (1, 0), (2, 2), (1, 1), (1, 0), (0, 0), (0, 0), (2, 2), (0, 0), (0, 0), (0, 0), (2, 2), (0, 0), (0, 0), (0, 0), (2, 0), (0, 0), (1, 1), (0, 0), (2, 2), (1, 0), (0, 0), (1, 1), (2, 2), (1, 0), (0, 0), (2, 0), (2, 0), (2, 2), (0, 0), (1, 1), (1, 0), (1, 0), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (2, 0), (0, 0), (1, 1), (0, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (0, 0), (2, 1), (1, 0), (0, 0), (0, 0), (0, 1), (2, 2), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (2, 2), (2, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 1), (2, 0), (1, 1), (1, 0), (0, 1), (0, 0), (0, 0), (2, 0), (0, 0), (1, 1), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (2, 1), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (0, 0), (1, 0), (1, 0), (0, 0), (1, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 2), (0, 0), (0, 0), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1), (0, 0), (1, 0), (1, 1), (0, 0),

In [39]:
# model evaluation 
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report # import these functions from metrics sublib.
cfm = confusion_matrix(Y_test,Y_pred)  #confusion matrix
print(cfm)

print('classification report')  # classification report
print(classification_report(Y_test,Y_pred))

acc = accuracy_score(Y_test,Y_pred)  # accuracy of the model
print('Multinomial Naive Bayes model accuracy:',acc)

[[2171   82   41]
 [ 428  287   60]
 [ 193   57  341]]
classification report
              precision    recall  f1-score   support

           0       0.78      0.95      0.85      2294
           1       0.67      0.37      0.48       775
           2       0.77      0.58      0.66       591

    accuracy                           0.76      3660
   macro avg       0.74      0.63      0.66      3660
weighted avg       0.75      0.76      0.74      3660

Multinomial Naive Bayes model accuracy: 0.7647540983606558


In [40]:
# model validating 
classifier.predict(vec.transform(['what would be amazingly awesome']).toarray())  # we can pass any text for testing our model 

array([2])

it means positive tweet which is also confirmed by our dataset. So, our model is doing correct prediction

In [41]:
classifier.predict(vec.transform(['really big bad thing about it']).toarray())  

array([0])

it means negative tweet, also confirmed by our dataset. So, model is performing good. 