In [30]:
# import necessary libraries for text preprocessing
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pickle
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
# import necessary libraries for model building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score

In [3]:
# Using the SMOTE technique for handling the imbalance of the classes in the predicting class
from imblearn.over_sampling import SMOTE

In [6]:
def read_data(path):
    ''' function to read the data'''
    data = pd.read_csv(path)
    return data
path = '/content/airline_sentiment_analysis.csv'
data = read_data(path)

In [7]:
data.head() #top 5 rows of the data sets

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [8]:
data.drop(columns = ['Unnamed: 0']) # dropping the columns which are not required
data['airline_sentiment'].value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

In [10]:
def sperate_data(col):
    ''' taking the required text from the dataset'''
    text = []
    for i in col:
        text.append(i)
    return text
text = sperate_data(data['text'])

In [11]:
class preprocessing:
    def __init__(self,text):
        ''' initializing the text'''
        self.text = text
    def clean_data(self):
        ''' cleaning and preprocessing the data such as removing the symbols and etc'''
        corpus = []
        for i in range(len(self.text)):
            sent = re.sub('[^a-zA-Z]', ' ',self.text[i])
            sent = sent.lower()
            sent = word_tokenize(sent)
            lemmatize = WordNetLemmatizer()
            sent = [lemmatize.lemmatize(word) for word in sent if word not in stopwords.words('english')]
            sent = " ".join(sent)
            corpus.append(sent)
        return corpus

obj = preprocessing(text)
corpus = obj.clean_data()

In [32]:
class transform_text:
    def __init__(self,corpus):
        ''' initializing the corpus'''
        self.corpus = corpus
    def encode_y(self,col):
        '''function to convert the text into integer'''
        lb = LabelEncoder()
        y = lb.fit_transform(col)
        return y
    def count_vec(self):
        ''' using the bag of word model to xonvert entences into integer'''
        count_vet = CountVectorizer()
        x_countvec = count_vet.fit_transform(self.corpus)
        return x_countvec
    def tfidf_conv(self):
        ''' using the tfidf model to xonvert entences into integer'''
        tfidf_trans = TfidfVectorizer()
        x_tfidf = tfidf_trans.fit_transform(self.corpus)
        return x_tfidf
    def handel_imbalance(self,x_countvec,y):
        '''handling the imbalance into data'''
        oversample = SMOTE()
        x_new,y_new = oversample.fit_resample(x_countvec,y)
        return x_new,y_new       
    def handel_imbalance_tfidf(self,x_tfidf,y):
        '''handling the imbalance into data'''
        oversample = SMOTE()
        x_new_tfidf,y_new_tfidf = oversample.fit_resample(x_tfidf,y)
        return x_new_tfidf,y_new_tfidf


ob = transform_text(corpus)
y = ob.encode_y(data['airline_sentiment'])
x_countvec = ob.count_vec()
x_tfidf = ob.tfidf_conv()
x_new,y_new = ob.handel_imbalance(x_countvec,y)




In [33]:
# splitting the data
xtrain,xtest,ytrain,ytest = train_test_split(x_new,y_new,test_size = 0.2,random_state = 42)

In [34]:
# using the naive bayes algorithm to train our model
clf = MultinomialNB()
clf.fit(xtrain,ytrain)
y_pred = clf.predict(xtest) 

In [35]:
# accuracy_score(ytest,y_pred) accuracy of our algorithm

0.9354575163398693

In [36]:
# testing the data on custom sentence
test_text = ['VirginAmerica it really aggressive']

cv_test = CountVectorizer(vocabulary = count_vet.vocabulary_)
test_text = cv_test.transform(test_text)



In [37]:
# which predicted correctly 0 for positive and 1 for negative
clf.predict(test_text)

array([1])

In [38]:
print(classification_report(ytest,y_pred)) #breif about our classification  model

              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1872
           1       0.93      0.94      0.93      1800

    accuracy                           0.94      3672
   macro avg       0.94      0.94      0.94      3672
weighted avg       0.94      0.94      0.94      3672



In [39]:
# testing with the support vector machine algorith for comparison
svm = SVC()
svm.fit(xtrain,ytrain)
ypred = svm.predict(xtest)

In [40]:
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.92      0.95      0.94      1872
           1       0.95      0.91      0.93      1800

    accuracy                           0.93      3672
   macro avg       0.94      0.93      0.93      3672
weighted avg       0.93      0.93      0.93      3672



In [41]:
# Looking r the accuracy it gaves the similar results
accuracy_score(ytest,ypred)

0.934368191721133

In [42]:
# saving the navies bayes model
filename = 'airline_model.sav'
pickle.dump(clf,open(filename,'wb'))

In [43]:
# testing if model is saved correctly
load_model = pickle.load(open('/content/airline_model.sav','rb'))
res = load_model.score(xtest,ytest)
print(res)

0.9354575163398693
